//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path.
// It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
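///
/// (Illustrative example of the fallback order, not from the implementation:)
/// for `for (i = 0; i < 100; ++i)` step 1 returns 100; for a loop whose
/// profile metadata estimates roughly 8 iterations per entry, step 2 returns
/// 8; otherwise a SCEV-derived upper bound, if known, is returned by step 3.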
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
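  /// (Illustrative sketch, not tied to a particular target:) for VF = 4, a
  /// scalar call such as
  ///   %r = call float @llvm.sqrt.f32(float %a)
  /// becomes
  ///   %r.vec = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a.vec)
  /// or a call to a vectorized library routine when one is available.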
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           Value *StartV, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start,
                             TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
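  /// (Illustrative, for a simple sum reduction:) inside the vector loop the
  /// reduction is accumulated as UF partial vector sums; this phase combines
  /// the parts and reduces the final vector to a scalar in the middle block,
  /// which then feeds the scalar loop's resume phi and the LCSSA exit value.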
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
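  /// (Illustrative, assuming a canonical induction starting at 0 with unit
  /// step:) the resume phi created in the scalar preheader takes the vector
  /// trip count on the edge from the middle block and the original start
  /// value on the bypass edges, so the scalar epilogue continues exactly
  /// where the vector loop left off.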
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
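///
/// (Illustrative usage sketch; the concrete factors below are made up, in
/// practice they are chosen by the cost model:)
///   EpilogueLoopVectorizationInfo EPI(/*MVF=*/8, /*MUF=*/2, /*EVF=*/4,
///                                     /*EUF=*/1);
/// i.e. vectorize the main loop at VF=8/UF=2 and its epilogue at VF=4/UF=1.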
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
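///
/// (Rough, illustrative order of the code laid out across the two passes; the
/// exact bypass edges are created by the skeleton methods below:)
///   minimum-iteration and safety checks
///     -> main vector loop (MainLoopVF x MainLoopUF)
///     -> epilogue minimum-iteration check
///     -> epilogue vector loop (EpilogueVF x EpilogueUF)
///     -> scalar remainder loop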
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step,
                              ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
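  /// (Illustrative, hypothetical call site; `CM` is just a placeholder name
  /// for a cost model instance:)
  ///   auto RUs = CM.calculateRegisterUsage(
  ///       {ElementCount::getFixed(2), ElementCount::getFixed(4)});
  /// RUs[1].MaxLocalUsers then holds, per register class, the peak number of
  /// values simultaneously live in the loop when vectorizing at VF = 4.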
1301 SmallVector<RegisterUsage, 8> 1302 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1303 1304 /// Collect values we want to ignore in the cost model. 1305 void collectValuesToIgnore(); 1306 1307 /// Split reductions into those that happen in the loop, and those that happen 1308 /// outside. In loop reductions are collected into InLoopReductionChains. 1309 void collectInLoopReductions(); 1310 1311 /// \returns The smallest bitwidth each instruction can be represented with. 1312 /// The vector equivalents of these instructions should be truncated to this 1313 /// type. 1314 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1315 return MinBWs; 1316 } 1317 1318 /// \returns True if it is more profitable to scalarize instruction \p I for 1319 /// vectorization factor \p VF. 1320 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1321 assert(VF.isVector() && 1322 "Profitable to scalarize relevant only for VF > 1."); 1323 1324 // Cost model is not run in the VPlan-native path - return conservative 1325 // result until this changes. 1326 if (EnableVPlanNativePath) 1327 return false; 1328 1329 auto Scalars = InstsToScalarize.find(VF); 1330 assert(Scalars != InstsToScalarize.end() && 1331 "VF not yet analyzed for scalarization profitability"); 1332 return Scalars->second.find(I) != Scalars->second.end(); 1333 } 1334 1335 /// Returns true if \p I is known to be uniform after vectorization. 1336 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1337 if (VF.isScalar()) 1338 return true; 1339 1340 // Cost model is not run in the VPlan-native path - return conservative 1341 // result until this changes. 1342 if (EnableVPlanNativePath) 1343 return false; 1344 1345 auto UniformsPerVF = Uniforms.find(VF); 1346 assert(UniformsPerVF != Uniforms.end() && 1347 "VF not yet analyzed for uniformity"); 1348 return UniformsPerVF->second.count(I); 1349 } 1350 1351 /// Returns true if \p I is known to be scalar after vectorization. 1352 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1353 if (VF.isScalar()) 1354 return true; 1355 1356 // Cost model is not run in the VPlan-native path - return conservative 1357 // result until this changes. 1358 if (EnableVPlanNativePath) 1359 return false; 1360 1361 auto ScalarsPerVF = Scalars.find(VF); 1362 assert(ScalarsPerVF != Scalars.end() && 1363 "Scalar values are not calculated for VF"); 1364 return ScalarsPerVF->second.count(I); 1365 } 1366 1367 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1368 /// for vectorization factor \p VF. 1369 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1370 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1371 !isProfitableToScalarize(I, VF) && 1372 !isScalarAfterVectorization(I, VF); 1373 } 1374 1375 /// Decision that was taken during cost calculation for memory instruction. 1376 enum InstWidening { 1377 CM_Unknown, 1378 CM_Widen, // For consecutive accesses with stride +1. 1379 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1380 CM_Interleave, 1381 CM_GatherScatter, 1382 CM_Scalarize 1383 }; 1384 1385 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1386 /// instruction \p I and vector width \p VF. 
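// For illustration, on a hypothetical loop reading A[i], A[N - i] and A[B[i]],
// the decisions recorded below for a vector VF would typically be:
//   load of A[i]     -> CM_Widen          (consecutive, stride +1)
//   load of A[N - i] -> CM_Widen_Reverse  (consecutive, stride -1)
//   load of A[B[i]]  -> CM_GatherScatter  (non-consecutive addresses)
// with CM_Interleave used for members of an interleave group and CM_Scalarize
// when none of the wide forms is legal or profitable.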
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
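// For example, in a loop body such as
//   if (c[i] != 0)
//     q[i] = a[i] / c[i];
// the division must not execute for lanes whose predicate is false, so
// isScalarWithPredication() returns true for it: the udiv is scalarized and
// each scalar copy is emitted under a branch on its lane's predicate bit.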
1542 bool 1543 memoryInstructionCanBeWidened(Instruction *I, 1544 ElementCount VF = ElementCount::getFixed(1)); 1545 1546 /// Returns true if \p I is a memory instruction in an interleaved-group 1547 /// of memory accesses that can be vectorized with wide vector loads/stores 1548 /// and shuffles. 1549 bool 1550 interleavedAccessCanBeWidened(Instruction *I, 1551 ElementCount VF = ElementCount::getFixed(1)); 1552 1553 /// Check if \p Instr belongs to any interleaved access group. 1554 bool isAccessInterleaved(Instruction *Instr) { 1555 return InterleaveInfo.isInterleaved(Instr); 1556 } 1557 1558 /// Get the interleaved access group that \p Instr belongs to. 1559 const InterleaveGroup<Instruction> * 1560 getInterleavedAccessGroup(Instruction *Instr) { 1561 return InterleaveInfo.getInterleaveGroup(Instr); 1562 } 1563 1564 /// Returns true if we're required to use a scalar epilogue for at least 1565 /// the final iteration of the original loop. 1566 bool requiresScalarEpilogue() const { 1567 if (!isScalarEpilogueAllowed()) 1568 return false; 1569 // If we might exit from anywhere but the latch, must run the exiting 1570 // iteration in scalar form. 1571 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1572 return true; 1573 return InterleaveInfo.requiresScalarEpilogue(); 1574 } 1575 1576 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1577 /// loop hint annotation. 1578 bool isScalarEpilogueAllowed() const { 1579 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1580 } 1581 1582 /// Returns true if all loop blocks should be masked to fold tail loop. 1583 bool foldTailByMasking() const { return FoldTailByMasking; } 1584 1585 bool blockNeedsPredication(BasicBlock *BB) { 1586 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1587 } 1588 1589 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1590 /// nodes to the chain of instructions representing the reductions. Uses a 1591 /// MapVector to ensure deterministic iteration order. 1592 using ReductionChainMap = 1593 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1594 1595 /// Return the chain of instructions representing an inloop reduction. 1596 const ReductionChainMap &getInLoopReductionChains() const { 1597 return InLoopReductionChains; 1598 } 1599 1600 /// Returns true if the Phi is part of an inloop reduction. 1601 bool isInLoopReduction(PHINode *Phi) const { 1602 return InLoopReductionChains.count(Phi); 1603 } 1604 1605 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1606 /// with factor VF. Return the cost of the instruction, including 1607 /// scalarization overhead if it's needed. 1608 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1609 1610 /// Estimate cost of a call instruction CI if it were vectorized with factor 1611 /// VF. Return the cost of the instruction, including scalarization overhead 1612 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1613 /// scalarized - 1614 /// i.e. either vector version isn't available, or is too expensive. 1615 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1616 bool &NeedToScalarize); 1617 1618 /// Invalidates decisions already taken by the cost model. 
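// For example, a sum reduction that is selected to be expanded in-loop,
//   %red      = phi i32 [ 0, %preheader ], [ %red.next, %loop ]
//   %val      = load i32, i32* %gep
//   %red.next = add i32 %red, %val
// is recorded as a mapping from the phi %red to the chain of operations
// forming the reduction (here the single add %red.next).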
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
                                          Type *VectorTy,
                                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
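// For intuition (illustrative numbers): if expectedCost() returns 8 for VF = 1
// and 20 for VF = 4, the per-lane cost at VF = 4 is 20 / 4 = 5, which beats
// the scalar cost of 8, so VF = 4 would be preferred. The returned costs
// themselves are deliberately not normalized by the VF, as noted above.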
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
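// For example (an illustrative single-use chain feeding a predicated store):
//   %a   = add i64 %i, %off                      ; used only by %gep
//   %gep = getelementptr i32, i32* %p, i64 %a    ; used only by the store
//   store i32 %v, i32* %gep                      ; executed under a predicate
// The discount weighs scalarizing %a and %gep alongside the already-scalarized
// store (no lane extracts needed) against keeping them vectorized and
// extracting the lane values that the scalarized store requires.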
1758 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1759 ElementCount VF); 1760 1761 /// Collect the instructions that are uniform after vectorization. An 1762 /// instruction is uniform if we represent it with a single scalar value in 1763 /// the vectorized loop corresponding to each vector iteration. Examples of 1764 /// uniform instructions include pointer operands of consecutive or 1765 /// interleaved memory accesses. Note that although uniformity implies an 1766 /// instruction will be scalar, the reverse is not true. In general, a 1767 /// scalarized instruction will be represented by VF scalar values in the 1768 /// vectorized loop, each corresponding to an iteration of the original 1769 /// scalar loop. 1770 void collectLoopUniforms(ElementCount VF); 1771 1772 /// Collect the instructions that are scalar after vectorization. An 1773 /// instruction is scalar if it is known to be uniform or will be scalarized 1774 /// during vectorization. Non-uniform scalarized instructions will be 1775 /// represented by VF values in the vectorized loop, each corresponding to an 1776 /// iteration of the original scalar loop. 1777 void collectLoopScalars(ElementCount VF); 1778 1779 /// Keeps cost model vectorization decision and cost for instructions. 1780 /// Right now it is used for memory instructions only. 1781 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1782 std::pair<InstWidening, InstructionCost>>; 1783 1784 DecisionList WideningDecisions; 1785 1786 /// Returns true if \p V is expected to be vectorized and it needs to be 1787 /// extracted. 1788 bool needsExtract(Value *V, ElementCount VF) const { 1789 Instruction *I = dyn_cast<Instruction>(V); 1790 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1791 TheLoop->isLoopInvariant(I)) 1792 return false; 1793 1794 // Assume we can vectorize V (and hence we need extraction) if the 1795 // scalars are not computed yet. This can happen, because it is called 1796 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1797 // the scalars are collected. That should be a safe assumption in most 1798 // cases, because we check if the operands have vectorizable types 1799 // beforehand in LoopVectorizationLegality. 1800 return Scalars.find(VF) == Scalars.end() || 1801 !isScalarAfterVectorization(I, VF); 1802 }; 1803 1804 /// Returns a range containing only operands needing to be extracted. 1805 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1806 ElementCount VF) { 1807 return SmallVector<Value *, 4>(make_filter_range( 1808 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1809 } 1810 1811 /// Determines if we have the infrastructure to vectorize loop \p L and its 1812 /// epilogue, assuming the main loop is vectorized by \p VF. 1813 bool isCandidateForEpilogueVectorization(const Loop &L, 1814 const ElementCount VF) const; 1815 1816 /// Returns true if epilogue vectorization is considered profitable, and 1817 /// false otherwise. 1818 /// \p VF is the vectorization factor chosen for the original loop. 1819 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1820 1821 public: 1822 /// The loop that we evaluate. 1823 Loop *TheLoop; 1824 1825 /// Predicated scalar evolution analysis. 1826 PredicatedScalarEvolution &PSE; 1827 1828 /// Loop Info analysis. 1829 LoopInfo *LI; 1830 1831 /// Vectorization legality. 1832 LoopVectorizationLegality *Legal; 1833 1834 /// Vector target information. 
1835 const TargetTransformInfo &TTI; 1836 1837 /// Target Library Info. 1838 const TargetLibraryInfo *TLI; 1839 1840 /// Demanded bits analysis. 1841 DemandedBits *DB; 1842 1843 /// Assumption cache. 1844 AssumptionCache *AC; 1845 1846 /// Interface to emit optimization remarks. 1847 OptimizationRemarkEmitter *ORE; 1848 1849 const Function *TheFunction; 1850 1851 /// Loop Vectorize Hint. 1852 const LoopVectorizeHints *Hints; 1853 1854 /// The interleave access information contains groups of interleaved accesses 1855 /// with the same stride and close to each other. 1856 InterleavedAccessInfo &InterleaveInfo; 1857 1858 /// Values to ignore in the cost model. 1859 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1860 1861 /// Values to ignore in the cost model when VF > 1. 1862 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1863 1864 /// Profitable vector factors. 1865 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1866 }; 1867 1868 } // end namespace llvm 1869 1870 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1871 // vectorization. The loop needs to be annotated with #pragma omp simd 1872 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1873 // vector length information is not provided, vectorization is not considered 1874 // explicit. Interleave hints are not allowed either. These limitations will be 1875 // relaxed in the future. 1876 // Please, note that we are currently forced to abuse the pragma 'clang 1877 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1878 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1879 // provides *explicit vectorization hints* (LV can bypass legal checks and 1880 // assume that vectorization is legal). However, both hints are implemented 1881 // using the same metadata (llvm.loop.vectorize, processed by 1882 // LoopVectorizeHints). This will be fixed in the future when the native IR 1883 // representation for pragma 'omp simd' is introduced. 1884 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1885 OptimizationRemarkEmitter *ORE) { 1886 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1887 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1888 1889 // Only outer loops with an explicit vectorization hint are supported. 1890 // Unannotated outer loops are ignored. 1891 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1892 return false; 1893 1894 Function *Fn = OuterLp->getHeader()->getParent(); 1895 if (!Hints.allowVectorization(Fn, OuterLp, 1896 true /*VectorizeOnlyWhenForced*/)) { 1897 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1898 return false; 1899 } 1900 1901 if (Hints.getInterleave() > 1) { 1902 // TODO: Interleave support is future work. 1903 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1904 "outer loops.\n"); 1905 Hints.emitRemarkWithHints(); 1906 return false; 1907 } 1908 1909 return true; 1910 } 1911 1912 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1913 OptimizationRemarkEmitter *ORE, 1914 SmallVectorImpl<Loop *> &V) { 1915 // Collect inner loops and outer loops without irreducible control flow. For 1916 // now, only collect outer loops that have explicit vectorization hints. If we 1917 // are stress testing the VPlan H-CFG construction, we collect the outermost 1918 // loop of every loop nest. 
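// For example (source level, illustrative), an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < N; ++i)      // outer loop, collected when the
//     for (j = 0; j < M; ++j)    // VPlan-native path is enabled
//       A[i][j] += B[i][j];
// is treated as explicitly vectorizable by isExplicitVecOuterLoop() above.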
1919 if (L.isInnermost() || VPlanBuildStressTest || 1920 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1921 LoopBlocksRPO RPOT(&L); 1922 RPOT.perform(LI); 1923 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1924 V.push_back(&L); 1925 // TODO: Collect inner loops inside marked outer loops in case 1926 // vectorization fails for the outer loop. Do not invoke 1927 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1928 // already known to be reducible. We can use an inherited attribute for 1929 // that. 1930 return; 1931 } 1932 } 1933 for (Loop *InnerL : L) 1934 collectSupportedLoops(*InnerL, LI, ORE, V); 1935 } 1936 1937 namespace { 1938 1939 /// The LoopVectorize Pass. 1940 struct LoopVectorize : public FunctionPass { 1941 /// Pass identification, replacement for typeid 1942 static char ID; 1943 1944 LoopVectorizePass Impl; 1945 1946 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1947 bool VectorizeOnlyWhenForced = false) 1948 : FunctionPass(ID), 1949 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1950 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1951 } 1952 1953 bool runOnFunction(Function &F) override { 1954 if (skipFunction(F)) 1955 return false; 1956 1957 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1958 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1959 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1960 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1961 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1962 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1963 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1964 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1965 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1966 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1967 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1968 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1969 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1970 1971 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1972 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1973 1974 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1975 GetLAA, *ORE, PSI).MadeAnyChange; 1976 } 1977 1978 void getAnalysisUsage(AnalysisUsage &AU) const override { 1979 AU.addRequired<AssumptionCacheTracker>(); 1980 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1981 AU.addRequired<DominatorTreeWrapperPass>(); 1982 AU.addRequired<LoopInfoWrapperPass>(); 1983 AU.addRequired<ScalarEvolutionWrapperPass>(); 1984 AU.addRequired<TargetTransformInfoWrapperPass>(); 1985 AU.addRequired<AAResultsWrapperPass>(); 1986 AU.addRequired<LoopAccessLegacyAnalysis>(); 1987 AU.addRequired<DemandedBitsWrapperPass>(); 1988 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1989 AU.addRequired<InjectTLIMappingsLegacy>(); 1990 1991 // We currently do not preserve loopinfo/dominator analyses with outer loop 1992 // vectorization. Until this is addressed, mark these analyses as preserved 1993 // only for non-VPlan-native path. 1994 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1995 if (!EnableVPlanNativePath) { 1996 AU.addPreserved<LoopInfoWrapperPass>(); 1997 AU.addPreserved<DominatorTreeWrapperPass>(); 1998 } 1999 2000 AU.addPreserved<BasicAAWrapperPass>(); 2001 AU.addPreserved<GlobalsAAWrapperPass>(); 2002 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2003 } 2004 }; 2005 2006 } // end anonymous namespace 2007 2008 //===----------------------------------------------------------------------===// 2009 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2010 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2011 //===----------------------------------------------------------------------===// 2012 2013 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2014 // We need to place the broadcast of invariant variables outside the loop, 2015 // but only if it's proven safe to do so. Else, broadcast will be inside 2016 // vector loop body. 2017 Instruction *Instr = dyn_cast<Instruction>(V); 2018 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2019 (!Instr || 2020 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2021 // Place the code for broadcasting invariant variables in the new preheader. 2022 IRBuilder<>::InsertPointGuard Guard(Builder); 2023 if (SafeToHoist) 2024 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2025 2026 // Broadcast the scalar into all locations in the vector. 2027 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2028 2029 return Shuf; 2030 } 2031 2032 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2033 const InductionDescriptor &II, Value *Step, Value *Start, 2034 Instruction *EntryVal) { 2035 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2036 "Expected either an induction phi-node or a truncate of it!"); 2037 2038 // Construct the initial value of the vector IV in the vector loop preheader 2039 auto CurrIP = Builder.saveIP(); 2040 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2041 if (isa<TruncInst>(EntryVal)) { 2042 assert(Start->getType()->isIntegerTy() && 2043 "Truncation requires an integer type"); 2044 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2045 Step = Builder.CreateTrunc(Step, TruncType); 2046 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2047 } 2048 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2049 Value *SteppedStart = 2050 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2051 2052 // We create vector phi nodes for both integer and floating-point induction 2053 // variables. Here, we determine the kind of arithmetic we will perform. 2054 Instruction::BinaryOps AddOp; 2055 Instruction::BinaryOps MulOp; 2056 if (Step->getType()->isIntegerTy()) { 2057 AddOp = Instruction::Add; 2058 MulOp = Instruction::Mul; 2059 } else { 2060 AddOp = II.getInductionOpcode(); 2061 MulOp = Instruction::FMul; 2062 } 2063 2064 // Multiply the vectorization factor by the step using integer or 2065 // floating-point arithmetic as appropriate. 2066 Value *ConstVF = 2067 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2068 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2069 2070 // Create a vector splat to use in the induction update. 2071 // 2072 // FIXME: If the step is non-constant, we create the vector splat with 2073 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2074 // handle a constant vector splat. 
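// For illustration, with VF = 4, UF = 1 and an integer IV starting at 0 with
// step 1, the code below builds roughly:
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>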
2075 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2076 Value *SplatVF = isa<Constant>(Mul) 2077 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2078 : Builder.CreateVectorSplat(VF, Mul); 2079 Builder.restoreIP(CurrIP); 2080 2081 // We may need to add the step a number of times, depending on the unroll 2082 // factor. The last of those goes into the PHI. 2083 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2084 &*LoopVectorBody->getFirstInsertionPt()); 2085 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2086 Instruction *LastInduction = VecInd; 2087 for (unsigned Part = 0; Part < UF; ++Part) { 2088 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2089 2090 if (isa<TruncInst>(EntryVal)) 2091 addMetadata(LastInduction, EntryVal); 2092 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2093 2094 LastInduction = cast<Instruction>(addFastMathFlag( 2095 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2096 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2097 } 2098 2099 // Move the last step to the end of the latch block. This ensures consistent 2100 // placement of all induction updates. 2101 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2102 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2103 auto *ICmp = cast<Instruction>(Br->getCondition()); 2104 LastInduction->moveBefore(ICmp); 2105 LastInduction->setName("vec.ind.next"); 2106 2107 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2108 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2109 } 2110 2111 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2112 return Cost->isScalarAfterVectorization(I, VF) || 2113 Cost->isProfitableToScalarize(I, VF); 2114 } 2115 2116 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2117 if (shouldScalarizeInstruction(IV)) 2118 return true; 2119 auto isScalarInst = [&](User *U) -> bool { 2120 auto *I = cast<Instruction>(U); 2121 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2122 }; 2123 return llvm::any_of(IV->users(), isScalarInst); 2124 } 2125 2126 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2127 const InductionDescriptor &ID, const Instruction *EntryVal, 2128 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2129 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2130 "Expected either an induction phi-node or a truncate of it!"); 2131 2132 // This induction variable is not the phi from the original loop but the 2133 // newly-created IV based on the proof that casted Phi is equal to the 2134 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2135 // re-uses the same InductionDescriptor that original IV uses but we don't 2136 // have to do any recording in this case - that is done when original IV is 2137 // processed. 2138 if (isa<TruncInst>(EntryVal)) 2139 return; 2140 2141 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2142 if (Casts.empty()) 2143 return; 2144 // Only the first Cast instruction in the Casts vector is of interest. 2145 // The rest of the Casts (if exist) have no uses outside the 2146 // induction update chain itself. 
2147 Instruction *CastInst = *Casts.begin(); 2148 if (Lane < UINT_MAX) 2149 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2150 else 2151 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2152 } 2153 2154 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2155 TruncInst *Trunc) { 2156 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2157 "Primary induction variable must have an integer type"); 2158 2159 auto II = Legal->getInductionVars().find(IV); 2160 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2161 2162 auto ID = II->second; 2163 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2164 2165 // The value from the original loop to which we are mapping the new induction 2166 // variable. 2167 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2168 2169 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2170 2171 // Generate code for the induction step. Note that induction steps are 2172 // required to be loop-invariant 2173 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2174 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2175 "Induction step should be loop invariant"); 2176 if (PSE.getSE()->isSCEVable(IV->getType())) { 2177 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2178 return Exp.expandCodeFor(Step, Step->getType(), 2179 LoopVectorPreHeader->getTerminator()); 2180 } 2181 return cast<SCEVUnknown>(Step)->getValue(); 2182 }; 2183 2184 // The scalar value to broadcast. This is derived from the canonical 2185 // induction variable. If a truncation type is given, truncate the canonical 2186 // induction variable and step. Otherwise, derive these values from the 2187 // induction descriptor. 2188 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2189 Value *ScalarIV = Induction; 2190 if (IV != OldInduction) { 2191 ScalarIV = IV->getType()->isIntegerTy() 2192 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2193 : Builder.CreateCast(Instruction::SIToFP, Induction, 2194 IV->getType()); 2195 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2196 ScalarIV->setName("offset.idx"); 2197 } 2198 if (Trunc) { 2199 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2200 assert(Step->getType()->isIntegerTy() && 2201 "Truncation requires an integer step"); 2202 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2203 Step = Builder.CreateTrunc(Step, TruncType); 2204 } 2205 return ScalarIV; 2206 }; 2207 2208 // Create the vector values from the scalar IV, in the absence of creating a 2209 // vector IV. 2210 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2211 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2212 for (unsigned Part = 0; Part < UF; ++Part) { 2213 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2214 Value *EntryPart = 2215 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2216 ID.getInductionOpcode()); 2217 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2218 if (Trunc) 2219 addMetadata(EntryPart, Trunc); 2220 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2221 } 2222 }; 2223 2224 // Now do the actual transformations, and start with creating the step value. 
2225 Value *Step = CreateStepValue(ID.getStep()); 2226 if (VF.isZero() || VF.isScalar()) { 2227 Value *ScalarIV = CreateScalarIV(Step); 2228 CreateSplatIV(ScalarIV, Step); 2229 return; 2230 } 2231 2232 // Determine if we want a scalar version of the induction variable. This is 2233 // true if the induction variable itself is not widened, or if it has at 2234 // least one user in the loop that is not widened. 2235 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2236 if (!NeedsScalarIV) { 2237 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2238 return; 2239 } 2240 2241 // Try to create a new independent vector induction variable. If we can't 2242 // create the phi node, we will splat the scalar induction variable in each 2243 // loop iteration. 2244 if (!shouldScalarizeInstruction(EntryVal)) { 2245 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2246 Value *ScalarIV = CreateScalarIV(Step); 2247 // Create scalar steps that can be used by instructions we will later 2248 // scalarize. Note that the addition of the scalar steps will not increase 2249 // the number of instructions in the loop in the common case prior to 2250 // InstCombine. We will be trading one vector extract for each scalar step. 2251 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2252 return; 2253 } 2254 2255 // All IV users are scalar instructions, so only emit a scalar IV, not a 2256 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2257 // predicate used by the masked loads/stores. 2258 Value *ScalarIV = CreateScalarIV(Step); 2259 if (!Cost->isScalarEpilogueAllowed()) 2260 CreateSplatIV(ScalarIV, Step); 2261 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2262 } 2263 2264 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2265 Instruction::BinaryOps BinOp) { 2266 // Create and check the types. 2267 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2268 int VLen = ValVTy->getNumElements(); 2269 2270 Type *STy = Val->getType()->getScalarType(); 2271 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2272 "Induction Step must be an integer or FP"); 2273 assert(Step->getType() == STy && "Step has wrong type"); 2274 2275 SmallVector<Constant *, 8> Indices; 2276 2277 if (STy->isIntegerTy()) { 2278 // Create a vector of consecutive numbers from zero to VF. 2279 for (int i = 0; i < VLen; ++i) 2280 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2281 2282 // Add the consecutive indices to the vector value. 2283 Constant *Cv = ConstantVector::get(Indices); 2284 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2285 Step = Builder.CreateVectorSplat(VLen, Step); 2286 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2287 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2288 // which can be found from the original scalar operations. 2289 Step = Builder.CreateMul(Cv, Step); 2290 return Builder.CreateAdd(Val, Step, "induction"); 2291 } 2292 2293 // Floating point induction. 2294 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2295 "Binary Opcode should be specified for FP induction"); 2296 // Create a vector of consecutive numbers from zero to VF. 2297 for (int i = 0; i < VLen; ++i) 2298 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2299 2300 // Add the consecutive indices to the vector value. 
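// For example, with VF = 4, StartIdx = 0, a splatted FP start value x and a
// scalar step s, the sequence built below evaluates to
//   <x, x + s, x + 2*s, x + 3*s>
// (with subtraction instead of addition for FSub inductions).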
2301 Constant *Cv = ConstantVector::get(Indices); 2302 2303 Step = Builder.CreateVectorSplat(VLen, Step); 2304 2305 // Floating point operations had to be 'fast' to enable the induction. 2306 FastMathFlags Flags; 2307 Flags.setFast(); 2308 2309 Value *MulOp = Builder.CreateFMul(Cv, Step); 2310 if (isa<Instruction>(MulOp)) 2311 // Have to check, MulOp may be a constant 2312 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2313 2314 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2315 if (isa<Instruction>(BOp)) 2316 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2317 return BOp; 2318 } 2319 2320 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2321 Instruction *EntryVal, 2322 const InductionDescriptor &ID) { 2323 // We shouldn't have to build scalar steps if we aren't vectorizing. 2324 assert(VF.isVector() && "VF should be greater than one"); 2325 // Get the value type and ensure it and the step have the same integer type. 2326 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2327 assert(ScalarIVTy == Step->getType() && 2328 "Val and Step should have the same type"); 2329 2330 // We build scalar steps for both integer and floating-point induction 2331 // variables. Here, we determine the kind of arithmetic we will perform. 2332 Instruction::BinaryOps AddOp; 2333 Instruction::BinaryOps MulOp; 2334 if (ScalarIVTy->isIntegerTy()) { 2335 AddOp = Instruction::Add; 2336 MulOp = Instruction::Mul; 2337 } else { 2338 AddOp = ID.getInductionOpcode(); 2339 MulOp = Instruction::FMul; 2340 } 2341 2342 // Determine the number of scalars we need to generate for each unroll 2343 // iteration. If EntryVal is uniform, we only need to generate the first 2344 // lane. Otherwise, we generate all VF values. 2345 unsigned Lanes = 2346 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2347 ? 1 2348 : VF.getKnownMinValue(); 2349 assert((!VF.isScalable() || Lanes == 1) && 2350 "Should never scalarize a scalable vector"); 2351 // Compute the scalar steps and save the results in VectorLoopValueMap. 2352 for (unsigned Part = 0; Part < UF; ++Part) { 2353 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2354 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2355 ScalarIVTy->getScalarSizeInBits()); 2356 Value *StartIdx = 2357 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2358 if (ScalarIVTy->isFloatingPointTy()) 2359 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2360 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2361 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2362 // The step returned by `createStepForVF` is a runtime-evaluated value 2363 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2364 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2365 "Expected StartIdx to be folded to a constant when VF is not " 2366 "scalable"); 2367 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2368 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2369 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2370 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2371 } 2372 } 2373 } 2374 2375 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2376 assert(V != Induction && "The new induction variable should not be used."); 2377 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2378 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2379 2380 // If we have a stride that is replaced by one, do it here. Defer this for 2381 // the VPlan-native path until we start running Legal checks in that path. 2382 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2383 V = ConstantInt::get(V->getType(), 1); 2384 2385 // If we have a vector mapped to this value, return it. 2386 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2387 return VectorLoopValueMap.getVectorValue(V, Part); 2388 2389 // If the value has not been vectorized, check if it has been scalarized 2390 // instead. If it has been scalarized, and we actually need the value in 2391 // vector form, we will construct the vector values on demand. 2392 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2393 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2394 2395 // If we've scalarized a value, that value should be an instruction. 2396 auto *I = cast<Instruction>(V); 2397 2398 // If we aren't vectorizing, we can just copy the scalar map values over to 2399 // the vector map. 2400 if (VF.isScalar()) { 2401 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2402 return ScalarValue; 2403 } 2404 2405 // Get the last scalar instruction we generated for V and Part. If the value 2406 // is known to be uniform after vectorization, this corresponds to lane zero 2407 // of the Part unroll iteration. Otherwise, the last instruction is the one 2408 // we created for the last vector lane of the Part unroll iteration. 2409 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2410 ? 0 2411 : VF.getKnownMinValue() - 1; 2412 assert((!VF.isScalable() || LastLane == 0) && 2413 "Scalable vectorization can't lead to any scalarized values."); 2414 auto *LastInst = cast<Instruction>( 2415 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2416 2417 // Set the insert point after the last scalarized instruction. This ensures 2418 // the insertelement sequence will directly follow the scalar definitions. 2419 auto OldIP = Builder.saveIP(); 2420 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2421 Builder.SetInsertPoint(&*NewIP); 2422 2423 // However, if we are vectorizing, we need to construct the vector values. 2424 // If the value is known to be uniform after vectorization, we can just 2425 // broadcast the scalar value corresponding to lane zero for each unroll 2426 // iteration. Otherwise, we construct the vector values using insertelement 2427 // instructions. Since the resulting vectors are stored in 2428 // VectorLoopValueMap, we will only generate the insertelements once. 
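// For example, with VF = 4 the scalar copies %s.0 .. %s.3 of a value are
// packed roughly as (names illustrative):
//   %p0 = insertelement <4 x i32> poison, i32 %s.0, i32 0
//   %p1 = insertelement <4 x i32> %p0,    i32 %s.1, i32 1
//   %p2 = insertelement <4 x i32> %p1,    i32 %s.2, i32 2
//   %p3 = insertelement <4 x i32> %p2,    i32 %s.3, i32 3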
2429 Value *VectorValue = nullptr; 2430 if (Cost->isUniformAfterVectorization(I, VF)) { 2431 VectorValue = getBroadcastInstrs(ScalarValue); 2432 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2433 } else { 2434 // Initialize packing with insertelements to start from poison. 2435 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2436 Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); 2437 VectorLoopValueMap.setVectorValue(V, Part, Poison); 2438 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2439 packScalarIntoVectorValue(V, {Part, Lane}); 2440 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2441 } 2442 Builder.restoreIP(OldIP); 2443 return VectorValue; 2444 } 2445 2446 // If this scalar is unknown, assume that it is a constant or that it is 2447 // loop invariant. Broadcast V and save the value for future uses. 2448 Value *B = getBroadcastInstrs(V); 2449 VectorLoopValueMap.setVectorValue(V, Part, B); 2450 return B; 2451 } 2452 2453 Value * 2454 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2455 const VPIteration &Instance) { 2456 // If the value is not an instruction contained in the loop, it should 2457 // already be scalar. 2458 if (OrigLoop->isLoopInvariant(V)) 2459 return V; 2460 2461 assert(Instance.Lane > 0 2462 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2463 : true && "Uniform values only have lane zero"); 2464 2465 // If the value from the original loop has not been vectorized, it is 2466 // represented by UF x VF scalar values in the new loop. Return the requested 2467 // scalar value. 2468 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2469 return VectorLoopValueMap.getScalarValue(V, Instance); 2470 2471 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2472 // for the given unroll part. If this entry is not a vector type (i.e., the 2473 // vectorization factor is one), there is no need to generate an 2474 // extractelement instruction. 2475 auto *U = getOrCreateVectorValue(V, Instance.Part); 2476 if (!U->getType()->isVectorTy()) { 2477 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2478 return U; 2479 } 2480 2481 // Otherwise, the value from the original loop has been vectorized and is 2482 // represented by UF vector values. Extract and return the requested scalar 2483 // value from the appropriate vector lane. 
2484 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2485 } 2486 2487 void InnerLoopVectorizer::packScalarIntoVectorValue( 2488 Value *V, const VPIteration &Instance) { 2489 assert(V != Induction && "The new induction variable should not be used."); 2490 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2491 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2492 2493 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2494 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2495 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2496 Builder.getInt32(Instance.Lane)); 2497 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2498 } 2499 2500 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2501 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2502 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2503 SmallVector<int, 8> ShuffleMask; 2504 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2505 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2506 2507 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2508 } 2509 2510 // Return whether we allow using masked interleave-groups (for dealing with 2511 // strided loads/stores that reside in predicated blocks, or for dealing 2512 // with gaps). 2513 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2514 // If an override option has been passed in for interleaved accesses, use it. 2515 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2516 return EnableMaskedInterleavedMemAccesses; 2517 2518 return TTI.enableMaskedInterleavedAccessVectorization(); 2519 } 2520 2521 // Try to vectorize the interleave group that \p Instr belongs to. 2522 // 2523 // E.g. Translate following interleaved load group (factor = 3): 2524 // for (i = 0; i < N; i+=3) { 2525 // R = Pic[i]; // Member of index 0 2526 // G = Pic[i+1]; // Member of index 1 2527 // B = Pic[i+2]; // Member of index 2 2528 // ... // do something to R, G, B 2529 // } 2530 // To: 2531 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2532 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2533 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2534 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2535 // 2536 // Or translate following interleaved store group (factor = 3): 2537 // for (i = 0; i < N; i+=3) { 2538 // ... do something to R, G, B 2539 // Pic[i] = R; // Member of index 0 2540 // Pic[i+1] = G; // Member of index 1 2541 // Pic[i+2] = B; // Member of index 2 2542 // } 2543 // To: 2544 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2545 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2546 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2547 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2548 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2549 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2550 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2551 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2552 VPValue *BlockInMask) { 2553 Instruction *Instr = Group->getInsertPos(); 2554 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2555 2556 // Prepare for the vector type of the interleaved load/store. 
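// For example, with VF = 4 and an interleave factor of 3 over i32 elements
// (the R/G/B example above), ScalarTy is i32 and the wide vector type below is
// <12 x i32>: a single wide access covers VF complete tuples.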
2557 Type *ScalarTy = getMemInstValueType(Instr); 2558 unsigned InterleaveFactor = Group->getFactor(); 2559 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2560 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2561 2562 // Prepare for the new pointers. 2563 SmallVector<Value *, 2> AddrParts; 2564 unsigned Index = Group->getIndex(Instr); 2565 2566 // TODO: extend the masked interleaved-group support to reversed access. 2567 assert((!BlockInMask || !Group->isReverse()) && 2568 "Reversed masked interleave-group not supported."); 2569 2570 // If the group is reverse, adjust the index to refer to the last vector lane 2571 // instead of the first. We adjust the index from the first vector lane, 2572 // rather than directly getting the pointer for lane VF - 1, because the 2573 // pointer operand of the interleaved access is supposed to be uniform. For 2574 // uniform instructions, we're only required to generate a value for the 2575 // first vector lane in each unroll iteration. 2576 assert(!VF.isScalable() && 2577 "scalable vector reverse operation is not implemented"); 2578 if (Group->isReverse()) 2579 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2580 2581 for (unsigned Part = 0; Part < UF; Part++) { 2582 Value *AddrPart = State.get(Addr, {Part, 0}); 2583 setDebugLocFromInst(Builder, AddrPart); 2584 2585 // Notice current instruction could be any index. Need to adjust the address 2586 // to the member of index 0. 2587 // 2588 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2589 // b = A[i]; // Member of index 0 2590 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2591 // 2592 // E.g. A[i+1] = a; // Member of index 1 2593 // A[i] = b; // Member of index 0 2594 // A[i+2] = c; // Member of index 2 (Current instruction) 2595 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2596 2597 bool InBounds = false; 2598 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2599 InBounds = gep->isInBounds(); 2600 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2601 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2602 2603 // Cast to the vector pointer type. 2604 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2605 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2606 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2607 } 2608 2609 setDebugLocFromInst(Builder, Instr); 2610 Value *PoisonVec = PoisonValue::get(VecTy); 2611 2612 Value *MaskForGaps = nullptr; 2613 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2614 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2615 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2616 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2617 } 2618 2619 // Vectorize the interleaved load group. 2620 if (isa<LoadInst>(Instr)) { 2621 // For each unroll part, create a wide load for the group. 
2622 SmallVector<Value *, 2> NewLoads; 2623 for (unsigned Part = 0; Part < UF; Part++) { 2624 Instruction *NewLoad; 2625 if (BlockInMask || MaskForGaps) { 2626 assert(useMaskedInterleavedAccesses(*TTI) && 2627 "masked interleaved groups are not allowed."); 2628 Value *GroupMask = MaskForGaps; 2629 if (BlockInMask) { 2630 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2631 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2632 Value *ShuffledMask = Builder.CreateShuffleVector( 2633 BlockInMaskPart, 2634 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2635 "interleaved.mask"); 2636 GroupMask = MaskForGaps 2637 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2638 MaskForGaps) 2639 : ShuffledMask; 2640 } 2641 NewLoad = 2642 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2643 GroupMask, PoisonVec, "wide.masked.vec"); 2644 } 2645 else 2646 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2647 Group->getAlign(), "wide.vec"); 2648 Group->addMetadata(NewLoad); 2649 NewLoads.push_back(NewLoad); 2650 } 2651 2652 // For each member in the group, shuffle out the appropriate data from the 2653 // wide loads. 2654 unsigned J = 0; 2655 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2656 Instruction *Member = Group->getMember(I); 2657 2658 // Skip the gaps in the group. 2659 if (!Member) 2660 continue; 2661 2662 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2663 auto StrideMask = 2664 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2665 for (unsigned Part = 0; Part < UF; Part++) { 2666 Value *StridedVec = Builder.CreateShuffleVector( 2667 NewLoads[Part], StrideMask, "strided.vec"); 2668 2669 // If this member has different type, cast the result type. 2670 if (Member->getType() != ScalarTy) { 2671 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2672 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2673 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2674 } 2675 2676 if (Group->isReverse()) 2677 StridedVec = reverseVector(StridedVec); 2678 2679 State.set(VPDefs[J], Member, StridedVec, Part); 2680 } 2681 ++J; 2682 } 2683 return; 2684 } 2685 2686 // The sub vector type for current instruction. 2687 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2688 auto *SubVT = VectorType::get(ScalarTy, VF); 2689 2690 // Vectorize the interleaved store group. 2691 for (unsigned Part = 0; Part < UF; Part++) { 2692 // Collect the stored vector from each member. 2693 SmallVector<Value *, 4> StoredVecs; 2694 for (unsigned i = 0; i < InterleaveFactor; i++) { 2695 // Interleaved store group doesn't allow a gap, so each index has a member 2696 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2697 2698 Value *StoredVec = State.get(StoredValues[i], Part); 2699 2700 if (Group->isReverse()) 2701 StoredVec = reverseVector(StoredVec); 2702 2703 // If this member has different type, cast it to a unified type. 2704 2705 if (StoredVec->getType() != SubVT) 2706 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2707 2708 StoredVecs.push_back(StoredVec); 2709 } 2710 2711 // Concatenate all vectors into a wide vector. 2712 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2713 2714 // Interleave the elements in the wide vector. 
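    // For example (with hypothetical values VF = 4, factor = 3),
    // createInterleaveMask produces <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>,
    // the same mask shown in the store example at the top of this function,
    // which turns the concatenated R|G|B vector back into R,G,B tuples.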
2715 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2716 Value *IVec = Builder.CreateShuffleVector( 2717 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2718 "interleaved.vec"); 2719 2720 Instruction *NewStoreInstr; 2721 if (BlockInMask) { 2722 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2723 Value *ShuffledMask = Builder.CreateShuffleVector( 2724 BlockInMaskPart, 2725 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2726 "interleaved.mask"); 2727 NewStoreInstr = Builder.CreateMaskedStore( 2728 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2729 } 2730 else 2731 NewStoreInstr = 2732 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2733 2734 Group->addMetadata(NewStoreInstr); 2735 } 2736 } 2737 2738 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2739 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2740 VPValue *StoredValue, VPValue *BlockInMask) { 2741 // Attempt to issue a wide load. 2742 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2743 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2744 2745 assert((LI || SI) && "Invalid Load/Store instruction"); 2746 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2747 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2748 2749 LoopVectorizationCostModel::InstWidening Decision = 2750 Cost->getWideningDecision(Instr, VF); 2751 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2752 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2753 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2754 "CM decision is not to widen the memory instruction"); 2755 2756 Type *ScalarDataTy = getMemInstValueType(Instr); 2757 2758 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2759 const Align Alignment = getLoadStoreAlignment(Instr); 2760 2761 // Determine if the pointer operand of the access is either consecutive or 2762 // reverse consecutive. 2763 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2764 bool ConsecutiveStride = 2765 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2766 bool CreateGatherScatter = 2767 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2768 2769 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2770 // gather/scatter. Otherwise Decision should have been to Scalarize. 2771 assert((ConsecutiveStride || CreateGatherScatter) && 2772 "The instruction should be scalarized"); 2773 (void)ConsecutiveStride; 2774 2775 VectorParts BlockInMaskParts(UF); 2776 bool isMaskRequired = BlockInMask; 2777 if (isMaskRequired) 2778 for (unsigned Part = 0; Part < UF; ++Part) 2779 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2780 2781 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2782 // Calculate the pointer for the specific unroll-part. 2783 GetElementPtrInst *PartPtr = nullptr; 2784 2785 bool InBounds = false; 2786 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2787 InBounds = gep->isInBounds(); 2788 2789 if (Reverse) { 2790 assert(!VF.isScalable() && 2791 "Reversing vectors is not yet supported for scalable vectors."); 2792 2793 // If the address is consecutive but reversed, then the 2794 // wide store needs to start at the last vector element. 
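      // A small worked example (hypothetical values, VF = 4): for Part 1 the
      // two GEPs below offset the pointer by -1 * 4 = -4 and then by
      // 1 - 4 = -3 elements, so the wide access covers elements [-7, -4] of
      // the original pointer, i.e. the next four elements walking backwards
      // through memory.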
2795 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2796 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2797 PartPtr->setIsInBounds(InBounds); 2798 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2799 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2800 PartPtr->setIsInBounds(InBounds); 2801 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2802 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2803 } else { 2804 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2805 PartPtr = cast<GetElementPtrInst>( 2806 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2807 PartPtr->setIsInBounds(InBounds); 2808 } 2809 2810 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2811 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2812 }; 2813 2814 // Handle Stores: 2815 if (SI) { 2816 setDebugLocFromInst(Builder, SI); 2817 2818 for (unsigned Part = 0; Part < UF; ++Part) { 2819 Instruction *NewSI = nullptr; 2820 Value *StoredVal = State.get(StoredValue, Part); 2821 if (CreateGatherScatter) { 2822 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2823 Value *VectorGep = State.get(Addr, Part); 2824 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2825 MaskPart); 2826 } else { 2827 if (Reverse) { 2828 // If we store to reverse consecutive memory locations, then we need 2829 // to reverse the order of elements in the stored value. 2830 StoredVal = reverseVector(StoredVal); 2831 // We don't want to update the value in the map as it might be used in 2832 // another expression. So don't call resetVectorValue(StoredVal). 2833 } 2834 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2835 if (isMaskRequired) 2836 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2837 BlockInMaskParts[Part]); 2838 else 2839 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2840 } 2841 addMetadata(NewSI, SI); 2842 } 2843 return; 2844 } 2845 2846 // Handle loads. 2847 assert(LI && "Must have a load instruction"); 2848 setDebugLocFromInst(Builder, LI); 2849 for (unsigned Part = 0; Part < UF; ++Part) { 2850 Value *NewLI; 2851 if (CreateGatherScatter) { 2852 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2853 Value *VectorGep = State.get(Addr, Part); 2854 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2855 nullptr, "wide.masked.gather"); 2856 addMetadata(NewLI, LI); 2857 } else { 2858 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2859 if (isMaskRequired) 2860 NewLI = Builder.CreateMaskedLoad( 2861 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2862 "wide.masked.load"); 2863 else 2864 NewLI = 2865 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2866 2867 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2868 addMetadata(NewLI, LI); 2869 if (Reverse) 2870 NewLI = reverseVector(NewLI); 2871 } 2872 2873 State.set(Def, Instr, NewLI, Part); 2874 } 2875 } 2876 2877 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2878 const VPIteration &Instance, 2879 bool IfPredicateInstr, 2880 VPTransformState &State) { 2881 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2882 2883 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2884 // the first lane and part. 
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (Instance.Lane != 0 || Instance.Part != 0)
      return;

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = 0;
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
  // representing scalar values in VPTransformState. Add the cloned scalar to
  // the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single-block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
2968 ScalarEvolution *SE = PSE.getSE(); 2969 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2970 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2971 "Invalid loop count"); 2972 2973 Type *IdxTy = Legal->getWidestInductionType(); 2974 assert(IdxTy && "No type for induction"); 2975 2976 // The exit count might have the type of i64 while the phi is i32. This can 2977 // happen if we have an induction variable that is sign extended before the 2978 // compare. The only way that we get a backedge taken count is that the 2979 // induction variable was signed and as such will not overflow. In such a case 2980 // truncation is legal. 2981 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2982 IdxTy->getPrimitiveSizeInBits()) 2983 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2984 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2985 2986 // Get the total trip count from the count by adding 1. 2987 const SCEV *ExitCount = SE->getAddExpr( 2988 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2989 2990 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2991 2992 // Expand the trip count and place the new instructions in the preheader. 2993 // Notice that the pre-header does not change, only the loop body. 2994 SCEVExpander Exp(*SE, DL, "induction"); 2995 2996 // Count holds the overall loop count (N). 2997 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2998 L->getLoopPreheader()->getTerminator()); 2999 3000 if (TripCount->getType()->isPointerTy()) 3001 TripCount = 3002 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3003 L->getLoopPreheader()->getTerminator()); 3004 3005 return TripCount; 3006 } 3007 3008 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3009 if (VectorTripCount) 3010 return VectorTripCount; 3011 3012 Value *TC = getOrCreateTripCount(L); 3013 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3014 3015 Type *Ty = TC->getType(); 3016 // This is where we can make the step a runtime constant. 3017 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3018 3019 // If the tail is to be folded by masking, round the number of iterations N 3020 // up to a multiple of Step instead of rounding down. This is done by first 3021 // adding Step-1 and then rounding down. Note that it's ok if this addition 3022 // overflows: the vector induction variable will eventually wrap to zero given 3023 // that it starts at zero and its Step is a power of two; the loop will then 3024 // exit, with the last early-exit vector comparison also producing all-true. 3025 if (Cost->foldTailByMasking()) { 3026 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3027 "VF*UF must be a power of 2 when folding tail by masking"); 3028 assert(!VF.isScalable() && 3029 "Tail folding not yet supported for scalable vectors"); 3030 TC = Builder.CreateAdd( 3031 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3032 } 3033 3034 // Now we need to generate the expression for the part of the loop that the 3035 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3036 // iterations are not required for correctness, or N - Step, otherwise. Step 3037 // is equal to the vectorization factor (number of SIMD elements) times the 3038 // unroll factor (number of SIMD instructions). 
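  // As a worked example (the numbers are illustrative only): with N = 1003,
  // VF = 4 and UF = 2, Step is 8, so the vector loop covers
  // 1003 - (1003 % 8) = 1000 scalar iterations (125 wide iterations) and 3
  // iterations are left for the scalar loop. If N were 1000 and a scalar
  // epilogue is required, the select below bumps the remainder from 0 up to a
  // full Step (8), leaving a vector trip count of 992.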
3039 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3040 3041 // There are two cases where we need to ensure (at least) the last iteration 3042 // runs in the scalar remainder loop. Thus, if the step evenly divides 3043 // the trip count, we set the remainder to be equal to the step. If the step 3044 // does not evenly divide the trip count, no adjustment is necessary since 3045 // there will already be scalar iterations. Note that the minimum iterations 3046 // check ensures that N >= Step. The cases are: 3047 // 1) If there is a non-reversed interleaved group that may speculatively 3048 // access memory out-of-bounds. 3049 // 2) If any instruction may follow a conditionally taken exit. That is, if 3050 // the loop contains multiple exiting blocks, or a single exiting block 3051 // which is not the latch. 3052 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3053 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3054 R = Builder.CreateSelect(IsZero, Step, R); 3055 } 3056 3057 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3058 3059 return VectorTripCount; 3060 } 3061 3062 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3063 const DataLayout &DL) { 3064 // Verify that V is a vector type with same number of elements as DstVTy. 3065 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3066 unsigned VF = DstFVTy->getNumElements(); 3067 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3068 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3069 Type *SrcElemTy = SrcVecTy->getElementType(); 3070 Type *DstElemTy = DstFVTy->getElementType(); 3071 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3072 "Vector elements must have same size"); 3073 3074 // Do a direct cast if element types are castable. 3075 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3076 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3077 } 3078 // V cannot be directly casted to desired vector type. 3079 // May happen when V is a floating point vector but DstVTy is a vector of 3080 // pointers or vice-versa. Handle this using a two-step bitcast using an 3081 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3082 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3083 "Only one type should be a pointer type"); 3084 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3085 "Only one type should be a floating point type"); 3086 Type *IntTy = 3087 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3088 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3089 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3090 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3091 } 3092 3093 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3094 BasicBlock *Bypass) { 3095 Value *Count = getOrCreateTripCount(L); 3096 // Reuse existing vector loop preheader for TC checks. 3097 // Note that new preheader block is generated for vector loop. 3098 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3099 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3100 3101 // Generate code to check if the loop's trip count is less than VF * UF, or 3102 // equal to it in case a scalar epilogue is required; this implies that the 3103 // vector trip count is zero. 
This check also covers the case where adding one 3104 // to the backedge-taken count overflowed leading to an incorrect trip count 3105 // of zero. In this case we will also jump to the scalar loop. 3106 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3107 : ICmpInst::ICMP_ULT; 3108 3109 // If tail is to be folded, vector loop takes care of all iterations. 3110 Value *CheckMinIters = Builder.getFalse(); 3111 if (!Cost->foldTailByMasking()) { 3112 Value *Step = 3113 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3114 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3115 } 3116 // Create new preheader for vector loop. 3117 LoopVectorPreHeader = 3118 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3119 "vector.ph"); 3120 3121 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3122 DT->getNode(Bypass)->getIDom()) && 3123 "TC check is expected to dominate Bypass"); 3124 3125 // Update dominator for Bypass & LoopExit. 3126 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3127 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3128 3129 ReplaceInstWithInst( 3130 TCCheckBlock->getTerminator(), 3131 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3132 LoopBypassBlocks.push_back(TCCheckBlock); 3133 } 3134 3135 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3136 // Reuse existing vector loop preheader for SCEV checks. 3137 // Note that new preheader block is generated for vector loop. 3138 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3139 3140 // Generate the code to check that the SCEV assumptions that we made. 3141 // We want the new basic block to start at the first instruction in a 3142 // sequence of instructions that form a check. 3143 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3144 "scev.check"); 3145 Value *SCEVCheck = Exp.expandCodeForPredicate( 3146 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3147 3148 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3149 if (C->isZero()) 3150 return; 3151 3152 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3153 (OptForSizeBasedOnProfile && 3154 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3155 "Cannot SCEV check stride or overflow when optimizing for size"); 3156 3157 SCEVCheckBlock->setName("vector.scevcheck"); 3158 // Create new preheader for vector loop. 3159 LoopVectorPreHeader = 3160 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3161 nullptr, "vector.ph"); 3162 3163 // Update dominator only if this is first RT check. 3164 if (LoopBypassBlocks.empty()) { 3165 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3166 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3167 } 3168 3169 ReplaceInstWithInst( 3170 SCEVCheckBlock->getTerminator(), 3171 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3172 LoopBypassBlocks.push_back(SCEVCheckBlock); 3173 AddedSafetyChecks = true; 3174 } 3175 3176 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3177 // VPlan-native path does not do any analysis for runtime checks currently. 3178 if (EnableVPlanNativePath) 3179 return; 3180 3181 // Reuse existing vector loop preheader for runtime memory checks. 3182 // Note that new preheader block is generated for vector loop. 3183 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3184 3185 // Generate the code that checks in runtime if arrays overlap. 
We put the 3186 // checks into a separate block to make the more common case of few elements 3187 // faster. 3188 auto *LAI = Legal->getLAI(); 3189 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3190 if (!RtPtrChecking.Need) 3191 return; 3192 3193 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3194 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3195 "Cannot emit memory checks when optimizing for size, unless forced " 3196 "to vectorize."); 3197 ORE->emit([&]() { 3198 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3199 L->getStartLoc(), L->getHeader()) 3200 << "Code-size may be reduced by not forcing " 3201 "vectorization, or by source-code modifications " 3202 "eliminating the need for runtime checks " 3203 "(e.g., adding 'restrict')."; 3204 }); 3205 } 3206 3207 MemCheckBlock->setName("vector.memcheck"); 3208 // Create new preheader for vector loop. 3209 LoopVectorPreHeader = 3210 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3211 "vector.ph"); 3212 3213 auto *CondBranch = cast<BranchInst>( 3214 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3215 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3216 LoopBypassBlocks.push_back(MemCheckBlock); 3217 AddedSafetyChecks = true; 3218 3219 // Update dominator only if this is first RT check. 3220 if (LoopBypassBlocks.empty()) { 3221 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3222 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3223 } 3224 3225 Instruction *FirstCheckInst; 3226 Instruction *MemRuntimeCheck; 3227 std::tie(FirstCheckInst, MemRuntimeCheck) = 3228 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3229 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3230 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3231 "claimed checks are required"); 3232 CondBranch->setCondition(MemRuntimeCheck); 3233 3234 // We currently don't use LoopVersioning for the actual loop cloning but we 3235 // still use it to add the noalias metadata. 3236 LVer = std::make_unique<LoopVersioning>( 3237 *Legal->getLAI(), 3238 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3239 DT, PSE.getSE()); 3240 LVer->prepareNoAliasMetadata(); 3241 } 3242 3243 Value *InnerLoopVectorizer::emitTransformedIndex( 3244 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3245 const InductionDescriptor &ID) const { 3246 3247 SCEVExpander Exp(*SE, DL, "induction"); 3248 auto Step = ID.getStep(); 3249 auto StartValue = ID.getStartValue(); 3250 assert(Index->getType() == Step->getType() && 3251 "Index type does not match StepValue type"); 3252 3253 // Note: the IR at this point is broken. We cannot use SE to create any new 3254 // SCEV and then expand it, hoping that SCEV's simplification will give us 3255 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3256 // lead to various SCEV crashes. So all we can do is to use builder and rely 3257 // on InstCombine for future simplifications. Here we handle some trivial 3258 // cases only. 
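  // (E.g. the two lambdas below only fold the obvious identities X + 0 -> X
  // and X * 1 -> X; anything beyond that is deliberately left for InstCombine
  // to clean up later.)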
3259 auto CreateAdd = [&B](Value *X, Value *Y) { 3260 assert(X->getType() == Y->getType() && "Types don't match!"); 3261 if (auto *CX = dyn_cast<ConstantInt>(X)) 3262 if (CX->isZero()) 3263 return Y; 3264 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3265 if (CY->isZero()) 3266 return X; 3267 return B.CreateAdd(X, Y); 3268 }; 3269 3270 auto CreateMul = [&B](Value *X, Value *Y) { 3271 assert(X->getType() == Y->getType() && "Types don't match!"); 3272 if (auto *CX = dyn_cast<ConstantInt>(X)) 3273 if (CX->isOne()) 3274 return Y; 3275 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3276 if (CY->isOne()) 3277 return X; 3278 return B.CreateMul(X, Y); 3279 }; 3280 3281 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3282 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3283 // the DomTree is not kept up-to-date for additional blocks generated in the 3284 // vector loop. By using the header as insertion point, we guarantee that the 3285 // expanded instructions dominate all their uses. 3286 auto GetInsertPoint = [this, &B]() { 3287 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3288 if (InsertBB != LoopVectorBody && 3289 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3290 return LoopVectorBody->getTerminator(); 3291 return &*B.GetInsertPoint(); 3292 }; 3293 switch (ID.getKind()) { 3294 case InductionDescriptor::IK_IntInduction: { 3295 assert(Index->getType() == StartValue->getType() && 3296 "Index type does not match StartValue type"); 3297 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3298 return B.CreateSub(StartValue, Index); 3299 auto *Offset = CreateMul( 3300 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3301 return CreateAdd(StartValue, Offset); 3302 } 3303 case InductionDescriptor::IK_PtrInduction: { 3304 assert(isa<SCEVConstant>(Step) && 3305 "Expected constant step for pointer induction"); 3306 return B.CreateGEP( 3307 StartValue->getType()->getPointerElementType(), StartValue, 3308 CreateMul(Index, 3309 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3310 } 3311 case InductionDescriptor::IK_FpInduction: { 3312 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3313 auto InductionBinOp = ID.getInductionBinOp(); 3314 assert(InductionBinOp && 3315 (InductionBinOp->getOpcode() == Instruction::FAdd || 3316 InductionBinOp->getOpcode() == Instruction::FSub) && 3317 "Original bin op should be defined for FP induction"); 3318 3319 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3320 3321 // Floating point operations had to be 'fast' to enable the induction. 3322 FastMathFlags Flags; 3323 Flags.setFast(); 3324 3325 Value *MulExp = B.CreateFMul(StepValue, Index); 3326 if (isa<Instruction>(MulExp)) 3327 // We have to check, the MulExp may be a constant. 
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  LoopExitBlock = OrigLoop->getUniqueExitBlock();
  assert(LoopExitBlock && "Must have an exit block");
  assert(LoopVectorPreHeader && "Invalid loop structure");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  // Set up the branch from the middle block to the exit and scalar preheader
  // blocks. completeLoopSkeleton will update the condition to use an
  // iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update the dominator for the loop exit.
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of the PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
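  // Roughly, and with illustrative value names rather than the exact IR that
  // is emitted, each induction gets a merge phi in the scalar preheader:
  //   %bc.resume.val = phi [ %ind.end, %middle.block ],
  //                        [ %start,   %<bypass block> ]  (one per bypass)
  // so the scalar loop resumes either after the iterations handled by the
  // vector loop or, when the vector loop was bypassed, from the original
  // start value.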
3405 for (auto &InductionEntry : Legal->getInductionVars()) { 3406 PHINode *OrigPhi = InductionEntry.first; 3407 InductionDescriptor II = InductionEntry.second; 3408 3409 // Create phi nodes to merge from the backedge-taken check block. 3410 PHINode *BCResumeVal = 3411 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3412 LoopScalarPreHeader->getTerminator()); 3413 // Copy original phi DL over to the new one. 3414 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3415 Value *&EndValue = IVEndValues[OrigPhi]; 3416 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3417 if (OrigPhi == OldInduction) { 3418 // We know what the end value is. 3419 EndValue = VectorTripCount; 3420 } else { 3421 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3422 Type *StepType = II.getStep()->getType(); 3423 Instruction::CastOps CastOp = 3424 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3425 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3426 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3427 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3428 EndValue->setName("ind.end"); 3429 3430 // Compute the end value for the additional bypass (if applicable). 3431 if (AdditionalBypass.first) { 3432 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3433 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3434 StepType, true); 3435 CRD = 3436 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3437 EndValueFromAdditionalBypass = 3438 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3439 EndValueFromAdditionalBypass->setName("ind.end"); 3440 } 3441 } 3442 // The new PHI merges the original incoming value, in case of a bypass, 3443 // or the value at the end of the vectorized loop. 3444 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3445 3446 // Fix the scalar body counter (PHI node). 3447 // The old induction's phi node in the scalar body needs the truncated 3448 // value. 3449 for (BasicBlock *BB : LoopBypassBlocks) 3450 BCResumeVal->addIncoming(II.getStartValue(), BB); 3451 3452 if (AdditionalBypass.first) 3453 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3454 EndValueFromAdditionalBypass); 3455 3456 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3457 } 3458 } 3459 3460 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3461 MDNode *OrigLoopID) { 3462 assert(L && "Expected valid loop."); 3463 3464 // The trip counts should be cached by now. 3465 Value *Count = getOrCreateTripCount(L); 3466 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3467 3468 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3469 3470 // Add a check in the middle block to see if we have completed 3471 // all of the iterations in the first vector loop. 3472 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3473 // If tail is to be folded, we know we don't need to run the remainder. 3474 if (!Cost->foldTailByMasking()) { 3475 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3476 Count, VectorTripCount, "cmp.n", 3477 LoopMiddleBlock->getTerminator()); 3478 3479 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3480 // of the corresponding compare because they may have ended up with 3481 // different line numbers and we want to avoid awkward line stepping while 3482 // debugging. Eg. if the compare has got a line number inside the loop. 
3483 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3484 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3485 } 3486 3487 // Get ready to start creating new instructions into the vectorized body. 3488 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3489 "Inconsistent vector loop preheader"); 3490 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3491 3492 Optional<MDNode *> VectorizedLoopID = 3493 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3494 LLVMLoopVectorizeFollowupVectorized}); 3495 if (VectorizedLoopID.hasValue()) { 3496 L->setLoopID(VectorizedLoopID.getValue()); 3497 3498 // Do not setAlreadyVectorized if loop attributes have been defined 3499 // explicitly. 3500 return LoopVectorPreHeader; 3501 } 3502 3503 // Keep all loop hints from the original loop on the vector loop (we'll 3504 // replace the vectorizer-specific hints below). 3505 if (MDNode *LID = OrigLoop->getLoopID()) 3506 L->setLoopID(LID); 3507 3508 LoopVectorizeHints Hints(L, true, *ORE); 3509 Hints.setAlreadyVectorized(); 3510 3511 #ifdef EXPENSIVE_CHECKS 3512 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3513 LI->verify(*DT); 3514 #endif 3515 3516 return LoopVectorPreHeader; 3517 } 3518 3519 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3520 /* 3521 In this function we generate a new loop. The new loop will contain 3522 the vectorized instructions while the old loop will continue to run the 3523 scalar remainder. 3524 3525 [ ] <-- loop iteration number check. 3526 / | 3527 / v 3528 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3529 | / | 3530 | / v 3531 || [ ] <-- vector pre header. 3532 |/ | 3533 | v 3534 | [ ] \ 3535 | [ ]_| <-- vector loop. 3536 | | 3537 | v 3538 | -[ ] <--- middle-block. 3539 | / | 3540 | / v 3541 -|- >[ ] <--- new preheader. 3542 | | 3543 | v 3544 | [ ] \ 3545 | [ ]_| <-- old scalar loop to handle remainder. 3546 \ | 3547 \ v 3548 >[ ] <-- exit block. 3549 ... 3550 */ 3551 3552 // Get the metadata of the original loop before it gets modified. 3553 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3554 3555 // Create an empty vector loop, and prepare basic blocks for the runtime 3556 // checks. 3557 Loop *Lp = createVectorLoopSkeleton(""); 3558 3559 // Now, compare the new count to zero. If it is zero skip the vector loop and 3560 // jump to the scalar loop. This check also covers the case where the 3561 // backedge-taken count is uint##_max: adding one to it will overflow leading 3562 // to an incorrect trip count of zero. In this (rare) case we will also jump 3563 // to the scalar loop. 3564 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3565 3566 // Generate the code to check any assumptions that we've made for SCEV 3567 // expressions. 3568 emitSCEVChecks(Lp, LoopScalarPreHeader); 3569 3570 // Generate the code that checks in runtime if arrays overlap. We put the 3571 // checks into a separate block to make the more common case of few elements 3572 // faster. 3573 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3574 3575 // Some loops have a single integer induction variable, while other loops 3576 // don't. One example is c++ iterators that often have multiple pointer 3577 // induction variables. In the code below we also support a case where we 3578 // don't have a single induction variable. 3579 // 3580 // We try to obtain an induction variable from the original loop as hard 3581 // as possible. 
However if we don't find one that: 3582 // - is an integer 3583 // - counts from zero, stepping by one 3584 // - is the size of the widest induction variable type 3585 // then we create a new one. 3586 OldInduction = Legal->getPrimaryInduction(); 3587 Type *IdxTy = Legal->getWidestInductionType(); 3588 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3589 // The loop step is equal to the vectorization factor (num of SIMD elements) 3590 // times the unroll factor (num of SIMD instructions). 3591 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3592 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3593 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3594 Induction = 3595 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3596 getDebugLocFromInstOrOperands(OldInduction)); 3597 3598 // Emit phis for the new starting index of the scalar loop. 3599 createInductionResumeValues(Lp, CountRoundDown); 3600 3601 return completeLoopSkeleton(Lp, OrigLoopID); 3602 } 3603 3604 // Fix up external users of the induction variable. At this point, we are 3605 // in LCSSA form, with all external PHIs that use the IV having one input value, 3606 // coming from the remainder loop. We need those PHIs to also have a correct 3607 // value for the IV when arriving directly from the middle block. 3608 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3609 const InductionDescriptor &II, 3610 Value *CountRoundDown, Value *EndValue, 3611 BasicBlock *MiddleBlock) { 3612 // There are two kinds of external IV usages - those that use the value 3613 // computed in the last iteration (the PHI) and those that use the penultimate 3614 // value (the value that feeds into the phi from the loop latch). 3615 // We allow both, but they, obviously, have different values. 3616 3617 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3618 3619 DenseMap<Value *, Value *> MissingVals; 3620 3621 // An external user of the last iteration's value should see the value that 3622 // the remainder loop uses to initialize its own IV. 3623 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3624 for (User *U : PostInc->users()) { 3625 Instruction *UI = cast<Instruction>(U); 3626 if (!OrigLoop->contains(UI)) { 3627 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3628 MissingVals[UI] = EndValue; 3629 } 3630 } 3631 3632 // An external user of the penultimate value need to see EndValue - Step. 3633 // The simplest way to get this is to recompute it from the constituent SCEVs, 3634 // that is Start + (Step * (CRD - 1)). 3635 for (User *U : OrigPhi->users()) { 3636 auto *UI = cast<Instruction>(U); 3637 if (!OrigLoop->contains(UI)) { 3638 const DataLayout &DL = 3639 OrigLoop->getHeader()->getModule()->getDataLayout(); 3640 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3641 3642 IRBuilder<> B(MiddleBlock->getTerminator()); 3643 Value *CountMinusOne = B.CreateSub( 3644 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3645 Value *CMO = 3646 !II.getStep()->getType()->isIntegerTy() 3647 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3648 II.getStep()->getType()) 3649 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3650 CMO->setName("cast.cmo"); 3651 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3652 Escape->setName("ind.escape"); 3653 MissingVals[UI] = Escape; 3654 } 3655 } 3656 3657 for (auto &I : MissingVals) { 3658 PHINode *PHI = cast<PHINode>(I.first); 3659 // One corner case we have to handle is two IVs "chasing" each-other, 3660 // that is %IV2 = phi [...], [ %IV1, %latch ] 3661 // In this case, if IV1 has an external use, we need to avoid adding both 3662 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3663 // don't already have an incoming value for the middle block. 3664 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3665 PHI->addIncoming(I.second, MiddleBlock); 3666 } 3667 } 3668 3669 namespace { 3670 3671 struct CSEDenseMapInfo { 3672 static bool canHandle(const Instruction *I) { 3673 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3674 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3675 } 3676 3677 static inline Instruction *getEmptyKey() { 3678 return DenseMapInfo<Instruction *>::getEmptyKey(); 3679 } 3680 3681 static inline Instruction *getTombstoneKey() { 3682 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3683 } 3684 3685 static unsigned getHashValue(const Instruction *I) { 3686 assert(canHandle(I) && "Unknown instruction!"); 3687 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3688 I->value_op_end())); 3689 } 3690 3691 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3692 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3693 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3694 return LHS == RHS; 3695 return LHS->isIdenticalTo(RHS); 3696 } 3697 }; 3698 3699 } // end anonymous namespace 3700 3701 ///Perform cse of induction variable instructions. 3702 static void cse(BasicBlock *BB) { 3703 // Perform simple cse. 3704 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3705 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3706 Instruction *In = &*I++; 3707 3708 if (!CSEDenseMapInfo::canHandle(In)) 3709 continue; 3710 3711 // Check if we can replace this instruction with any of the 3712 // visited instructions. 3713 if (Instruction *V = CSEMap.lookup(In)) { 3714 In->replaceAllUsesWith(V); 3715 In->eraseFromParent(); 3716 continue; 3717 } 3718 3719 CSEMap[In] = In; 3720 } 3721 } 3722 3723 InstructionCost 3724 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3725 bool &NeedToScalarize) { 3726 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3727 Function *F = CI->getCalledFunction(); 3728 Type *ScalarRetTy = CI->getType(); 3729 SmallVector<Type *, 4> Tys, ScalarTys; 3730 for (auto &ArgOp : CI->arg_operands()) 3731 ScalarTys.push_back(ArgOp->getType()); 3732 3733 // Estimate cost of scalarized vector call. The source operands are assumed 3734 // to be vectors, so we need to extract individual elements from there, 3735 // execute VF scalar calls, and then gather the result into the vector return 3736 // value. 3737 InstructionCost ScalarCallCost = 3738 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3739 if (VF.isScalar()) 3740 return ScalarCallCost; 3741 3742 // Compute corresponding vector type for return value and arguments. 
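  // (Illustrative numbers only: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized estimate computed below is
  // 4 * 10 + 6 = 46. If the VFDatabase provides a vector variant costing,
  // say, 20, that is cheaper, so NeedToScalarize is cleared and 20 is
  // returned instead.)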
3743 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3744 for (Type *ScalarTy : ScalarTys) 3745 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3746 3747 // Compute costs of unpacking argument values for the scalar calls and 3748 // packing the return values to a vector. 3749 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3750 3751 InstructionCost Cost = 3752 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3753 3754 // If we can't emit a vector call for this function, then the currently found 3755 // cost is the cost we need to return. 3756 NeedToScalarize = true; 3757 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3758 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3759 3760 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3761 return Cost; 3762 3763 // If the corresponding vector cost is cheaper, return its cost. 3764 InstructionCost VectorCallCost = 3765 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3766 if (VectorCallCost < Cost) { 3767 NeedToScalarize = false; 3768 Cost = VectorCallCost; 3769 } 3770 return Cost; 3771 } 3772 3773 InstructionCost 3774 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3775 ElementCount VF) { 3776 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3777 assert(ID && "Expected intrinsic call!"); 3778 3779 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3780 return TTI.getIntrinsicInstrCost(CostAttrs, 3781 TargetTransformInfo::TCK_RecipThroughput); 3782 } 3783 3784 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3785 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3786 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3787 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3788 } 3789 3790 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3791 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3792 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3793 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3794 } 3795 3796 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3797 // For every instruction `I` in MinBWs, truncate the operands, create a 3798 // truncated version of `I` and reextend its result. InstCombine runs 3799 // later and will remove any ext/trunc pairs. 3800 SmallPtrSet<Value *, 4> Erased; 3801 for (const auto &KV : Cost->getMinimalBitwidths()) { 3802 // If the value wasn't vectorized, we must maintain the original scalar 3803 // type. The absence of the value from VectorLoopValueMap indicates that it 3804 // wasn't vectorized. 
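    // As a rough illustration (the types here are hypothetical): if the cost
    // model recorded that an i32 add only ever needs 8 bits, the loop below
    // truncates its operands to <VF x i8>, performs the add at i8, and
    // zero-extends the result back to <VF x i32>; InstCombine later removes
    // any ext/trunc pairs that turn out to be redundant.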
3805 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3806 continue; 3807 for (unsigned Part = 0; Part < UF; ++Part) { 3808 Value *I = getOrCreateVectorValue(KV.first, Part); 3809 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3810 continue; 3811 Type *OriginalTy = I->getType(); 3812 Type *ScalarTruncatedTy = 3813 IntegerType::get(OriginalTy->getContext(), KV.second); 3814 auto *TruncatedTy = FixedVectorType::get( 3815 ScalarTruncatedTy, 3816 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3817 if (TruncatedTy == OriginalTy) 3818 continue; 3819 3820 IRBuilder<> B(cast<Instruction>(I)); 3821 auto ShrinkOperand = [&](Value *V) -> Value * { 3822 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3823 if (ZI->getSrcTy() == TruncatedTy) 3824 return ZI->getOperand(0); 3825 return B.CreateZExtOrTrunc(V, TruncatedTy); 3826 }; 3827 3828 // The actual instruction modification depends on the instruction type, 3829 // unfortunately. 3830 Value *NewI = nullptr; 3831 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3832 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3833 ShrinkOperand(BO->getOperand(1))); 3834 3835 // Any wrapping introduced by shrinking this operation shouldn't be 3836 // considered undefined behavior. So, we can't unconditionally copy 3837 // arithmetic wrapping flags to NewI. 3838 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3839 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3840 NewI = 3841 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3842 ShrinkOperand(CI->getOperand(1))); 3843 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3844 NewI = B.CreateSelect(SI->getCondition(), 3845 ShrinkOperand(SI->getTrueValue()), 3846 ShrinkOperand(SI->getFalseValue())); 3847 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3848 switch (CI->getOpcode()) { 3849 default: 3850 llvm_unreachable("Unhandled cast!"); 3851 case Instruction::Trunc: 3852 NewI = ShrinkOperand(CI->getOperand(0)); 3853 break; 3854 case Instruction::SExt: 3855 NewI = B.CreateSExtOrTrunc( 3856 CI->getOperand(0), 3857 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3858 break; 3859 case Instruction::ZExt: 3860 NewI = B.CreateZExtOrTrunc( 3861 CI->getOperand(0), 3862 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3863 break; 3864 } 3865 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3866 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3867 ->getNumElements(); 3868 auto *O0 = B.CreateZExtOrTrunc( 3869 SI->getOperand(0), 3870 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3871 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3872 ->getNumElements(); 3873 auto *O1 = B.CreateZExtOrTrunc( 3874 SI->getOperand(1), 3875 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3876 3877 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3878 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3879 // Don't do anything with the operands, just extend the result. 
3880 continue; 3881 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3882 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3883 ->getNumElements(); 3884 auto *O0 = B.CreateZExtOrTrunc( 3885 IE->getOperand(0), 3886 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3887 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3888 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3889 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3890 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3891 ->getNumElements(); 3892 auto *O0 = B.CreateZExtOrTrunc( 3893 EE->getOperand(0), 3894 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3895 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3896 } else { 3897 // If we don't know what to do, be conservative and don't do anything. 3898 continue; 3899 } 3900 3901 // Lastly, extend the result. 3902 NewI->takeName(cast<Instruction>(I)); 3903 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3904 I->replaceAllUsesWith(Res); 3905 cast<Instruction>(I)->eraseFromParent(); 3906 Erased.insert(I); 3907 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3908 } 3909 } 3910 3911 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3912 for (const auto &KV : Cost->getMinimalBitwidths()) { 3913 // If the value wasn't vectorized, we must maintain the original scalar 3914 // type. The absence of the value from VectorLoopValueMap indicates that it 3915 // wasn't vectorized. 3916 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3917 continue; 3918 for (unsigned Part = 0; Part < UF; ++Part) { 3919 Value *I = getOrCreateVectorValue(KV.first, Part); 3920 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3921 if (Inst && Inst->use_empty()) { 3922 Value *NewI = Inst->getOperand(0); 3923 Inst->eraseFromParent(); 3924 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3925 } 3926 } 3927 } 3928 } 3929 3930 void InnerLoopVectorizer::fixVectorizedLoop() { 3931 // Insert truncates and extends for any truncated instructions as hints to 3932 // InstCombine. 3933 if (VF.isVector()) 3934 truncateToMinimalBitwidths(); 3935 3936 // Fix widened non-induction PHIs by setting up the PHI operands. 3937 if (OrigPHIsToFix.size()) { 3938 assert(EnableVPlanNativePath && 3939 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3940 fixNonInductionPHIs(); 3941 } 3942 3943 // At this point every instruction in the original loop is widened to a 3944 // vector form. Now we need to fix the recurrences in the loop. These PHI 3945 // nodes are currently empty because we did not want to introduce cycles. 3946 // This is the second stage of vectorizing recurrences. 3947 fixCrossIterationPHIs(); 3948 3949 // Forget the original basic block. 3950 PSE.getSE()->forgetLoop(OrigLoop); 3951 3952 // Fix-up external users of the induction variables. 3953 for (auto &Entry : Legal->getInductionVars()) 3954 fixupIVUsers(Entry.first, Entry.second, 3955 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3956 IVEndValues[Entry.first], LoopMiddleBlock); 3957 3958 fixLCSSAPHIs(); 3959 for (Instruction *PI : PredicatedInstructions) 3960 sinkScalarOperands(&*PI); 3961 3962 // Remove redundant induction instructions. 3963 cse(LoopVectorBody); 3964 3965 // Set/update profile weights for the vector and remainder loops as original 3966 // loop iterations are now distributed among them. Note that original loop 3967 // represented by LoopScalarBody becomes remainder loop after vectorization. 
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so we instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
4050 auto *Preheader = OrigLoop->getLoopPreheader(); 4051 auto *Latch = OrigLoop->getLoopLatch(); 4052 4053 // Get the initial and previous values of the scalar recurrence. 4054 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4055 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4056 4057 // Create a vector from the initial value. 4058 auto *VectorInit = ScalarInit; 4059 if (VF.isVector()) { 4060 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4061 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4062 VectorInit = Builder.CreateInsertElement( 4063 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4064 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4065 } 4066 4067 // We constructed a temporary phi node in the first phase of vectorization. 4068 // This phi node will eventually be deleted. 4069 Builder.SetInsertPoint( 4070 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4071 4072 // Create a phi node for the new recurrence. The current value will either be 4073 // the initial value inserted into a vector or loop-varying vector value. 4074 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4075 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4076 4077 // Get the vectorized previous value of the last part UF - 1. It appears last 4078 // among all unrolled iterations, due to the order of their construction. 4079 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4080 4081 // Find and set the insertion point after the previous value if it is an 4082 // instruction. 4083 BasicBlock::iterator InsertPt; 4084 // Note that the previous value may have been constant-folded so it is not 4085 // guaranteed to be an instruction in the vector loop. 4086 // FIXME: Loop invariant values do not form recurrences. We should deal with 4087 // them earlier. 4088 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4089 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4090 else { 4091 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4092 if (isa<PHINode>(PreviousLastPart)) 4093 // If the previous value is a phi node, we should insert after all the phi 4094 // nodes in the block containing the PHI to avoid breaking basic block 4095 // verification. Note that the basic block may be different to 4096 // LoopVectorBody, in case we predicate the loop. 4097 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4098 else 4099 InsertPt = ++PreviousInst->getIterator(); 4100 } 4101 Builder.SetInsertPoint(&*InsertPt); 4102 4103 // We will construct a vector for the recurrence by combining the values for 4104 // the current and previous iterations. This is the required shuffle mask. 4105 assert(!VF.isScalable()); 4106 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4107 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4108 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4109 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4110 4111 // The vector from which to take the initial value for the current iteration 4112 // (actual or unrolled). Initially, this is the vector phi node. 4113 Value *Incoming = VecPhi; 4114 4115 // Shuffle the current and previous vector and update the vector parts. 4116 for (unsigned Part = 0; Part < UF; ++Part) { 4117 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4118 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4119 auto *Shuffle = 4120 VF.isVector() 4121 ? 
Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4122 : Incoming; 4123 PhiPart->replaceAllUsesWith(Shuffle); 4124 cast<Instruction>(PhiPart)->eraseFromParent(); 4125 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4126 Incoming = PreviousPart; 4127 } 4128 4129 // Fix the latch value of the new recurrence in the vector loop. 4130 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4131 4132 // Extract the last vector element in the middle block. This will be the 4133 // initial value for the recurrence when jumping to the scalar loop. 4134 auto *ExtractForScalar = Incoming; 4135 if (VF.isVector()) { 4136 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4137 ExtractForScalar = Builder.CreateExtractElement( 4138 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4139 "vector.recur.extract"); 4140 } 4141 // Extract the second last element in the middle block if the 4142 // Phi is used outside the loop. We need to extract the phi itself 4143 // and not the last element (the phi update in the current iteration). This 4144 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4145 // when the scalar loop is not run at all. 4146 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4147 if (VF.isVector()) 4148 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4149 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4150 "vector.recur.extract.for.phi"); 4151 // When the loop is unrolled without vectorizing, initialize 4152 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of 4153 // `Incoming`. This is analogous to the vectorized case above: extracting the 4154 // second last element when VF > 1. 4155 else if (UF > 1) 4156 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4157 4158 // Fix the initial value of the original recurrence in the scalar loop. 4159 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4160 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4161 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4162 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4163 Start->addIncoming(Incoming, BB); 4164 } 4165 4166 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4167 Phi->setName("scalar.recur"); 4168 4169 // Finally, fix users of the recurrence outside the loop. The users will need 4170 // either the last value of the scalar recurrence or the last value of the 4171 // vector recurrence we extracted in the middle block. Since the loop is in 4172 // LCSSA form, we just need to find all the phi nodes for the original scalar 4173 // recurrence in the exit block, and then add an edge for the middle block. 4174 // Note that LCSSA does not imply single entry when the original scalar loop 4175 // had multiple exiting edges (as we always run the last iteration in the 4176 // scalar epilogue); in that case, the exiting path through middle will be 4177 // dynamically dead and the value picked for the phi doesn't matter. 4178 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4179 if (any_of(LCSSAPhi.incoming_values(), 4180 [Phi](Value *V) { return V == Phi; })) 4181 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4182 } 4183 4184 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4185 // Get its reduction variable descriptor.
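// The descriptor records the recurrence kind (add, mul, min/max, ...), the start value flowing in from the preheader, and the instruction whose value leaves the loop; all three are used below.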
4186 assert(Legal->isReductionVariable(Phi) && 4187 "Unable to find the reduction variable"); 4188 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4189 4190 RecurKind RK = RdxDesc.getRecurrenceKind(); 4191 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4192 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4193 setDebugLocFromInst(Builder, ReductionStartValue); 4194 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4195 4196 // This is the vector-clone of the value that leaves the loop. 4197 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4198 4199 // Wrap flags are in general invalid after vectorization, clear them. 4200 clearReductionWrapFlags(RdxDesc); 4201 4202 // Fix the vector-loop phi. 4203 4204 // Reductions do not have to start at zero. They can start with 4205 // any loop invariant values. 4206 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4207 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4208 4209 for (unsigned Part = 0; Part < UF; ++Part) { 4210 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4211 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4212 cast<PHINode>(VecRdxPhi) 4213 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4214 } 4215 4216 // Before each round, move the insertion point right between 4217 // the PHIs and the values we are going to write. 4218 // This allows us to write both PHINodes and the extractelement 4219 // instructions. 4220 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4221 4222 setDebugLocFromInst(Builder, LoopExitInst); 4223 4224 // If tail is folded by masking, the vector value to leave the loop should be 4225 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4226 // instead of the former. For an inloop reduction the reduction will already 4227 // be predicated, and does not need to be handled here. 4228 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4229 for (unsigned Part = 0; Part < UF; ++Part) { 4230 Value *VecLoopExitInst = 4231 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4232 Value *Sel = nullptr; 4233 for (User *U : VecLoopExitInst->users()) { 4234 if (isa<SelectInst>(U)) { 4235 assert(!Sel && "Reduction exit feeding two selects"); 4236 Sel = U; 4237 } else 4238 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4239 } 4240 assert(Sel && "Reduction exit feeds no select"); 4241 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4242 4243 // If the target can create a predicated operator for the reduction at no 4244 // extra cost in the loop (for example a predicated vadd), it can be 4245 // cheaper for the select to remain in the loop than be sunk out of it, 4246 // and so use the select value for the phi instead of the old 4247 // LoopExitValue. 4248 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4249 if (PreferPredicatedReductionSelect || 4250 TTI->preferPredicatedReductionSelect( 4251 RdxDesc.getOpcode(), Phi->getType(), 4252 TargetTransformInfo::ReductionFlags())) { 4253 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4254 VecRdxPhi->setIncomingValueForBlock( 4255 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4256 } 4257 } 4258 } 4259 4260 // If the vector reduction can be performed in a smaller type, we truncate 4261 // then extend the loop exit value to enable InstCombine to evaluate the 4262 // entire expression in the smaller type. 
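// For example, an i32 add reduction whose values are known to fit in 8 bits can have its loop-exit vectors truncated to <VF x i8> and extended back, giving InstCombine the opportunity to evaluate the whole chain in i8.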
4263 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4264 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4265 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4266 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4267 Builder.SetInsertPoint( 4268 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4269 VectorParts RdxParts(UF); 4270 for (unsigned Part = 0; Part < UF; ++Part) { 4271 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4272 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4273 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4274 : Builder.CreateZExt(Trunc, VecTy); 4275 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4276 UI != RdxParts[Part]->user_end();) 4277 if (*UI != Trunc) { 4278 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4279 RdxParts[Part] = Extnd; 4280 } else { 4281 ++UI; 4282 } 4283 } 4284 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4285 for (unsigned Part = 0; Part < UF; ++Part) { 4286 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4287 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4288 } 4289 } 4290 4291 // Reduce all of the unrolled parts into a single vector. 4292 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4293 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4294 4295 // The middle block terminator has already been assigned a DebugLoc here (the 4296 // OrigLoop's single latch terminator). We want the whole middle block to 4297 // appear to execute on this line because: (a) it is all compiler generated, 4298 // (b) these instructions are always executed after evaluating the latch 4299 // conditional branch, and (c) other passes may add new predecessors which 4300 // terminate on this line. This is the easiest way to ensure we don't 4301 // accidentally cause an extra step back into the loop while debugging. 4302 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4303 for (unsigned Part = 1; Part < UF; ++Part) { 4304 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4305 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4306 // Floating point operations had to be 'fast' to enable the reduction. 4307 ReducedPartRdx = addFastMathFlag( 4308 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4309 ReducedPartRdx, "bin.rdx"), 4310 RdxDesc.getFastMathFlags()); 4311 else 4312 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4313 } 4314 4315 // Create the reduction after the loop. Note that inloop reductions create the 4316 // target reduction in the loop using a Reduction recipe. 4317 if (VF.isVector() && !IsInLoopReductionPhi) { 4318 ReducedPartRdx = 4319 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4320 // If the reduction can be performed in a smaller type, we need to extend 4321 // the reduction to the wider type before we branch to the original loop. 4322 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4323 ReducedPartRdx = 4324 RdxDesc.isSigned() 4325 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4326 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4327 } 4328 4329 // Create a phi node that merges control-flow from the backedge-taken check 4330 // block and the middle block. 
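// Each loop-bypass block (taken when the vector loop is skipped entirely) contributes the original start value, while the middle block contributes the reduced value computed above.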
4331 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4332 LoopScalarPreHeader->getTerminator()); 4333 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4334 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4335 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4336 4337 // Now, we need to fix the users of the reduction variable 4338 // inside and outside of the scalar remainder loop. 4339 4340 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4341 // in the exit blocks. See comment on analogous loop in 4342 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4343 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4344 if (any_of(LCSSAPhi.incoming_values(), 4345 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4346 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4347 4348 // Fix the scalar loop reduction variable with the incoming reduction sum 4349 // from the vector body and from the backedge value. 4350 int IncomingEdgeBlockIdx = 4351 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4352 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4353 // Pick the other block. 4354 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4355 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4356 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4357 } 4358 4359 void InnerLoopVectorizer::clearReductionWrapFlags( 4360 RecurrenceDescriptor &RdxDesc) { 4361 RecurKind RK = RdxDesc.getRecurrenceKind(); 4362 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4363 return; 4364 4365 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4366 assert(LoopExitInstr && "null loop exit instruction"); 4367 SmallVector<Instruction *, 8> Worklist; 4368 SmallPtrSet<Instruction *, 8> Visited; 4369 Worklist.push_back(LoopExitInstr); 4370 Visited.insert(LoopExitInstr); 4371 4372 while (!Worklist.empty()) { 4373 Instruction *Cur = Worklist.pop_back_val(); 4374 if (isa<OverflowingBinaryOperator>(Cur)) 4375 for (unsigned Part = 0; Part < UF; ++Part) { 4376 Value *V = getOrCreateVectorValue(Cur, Part); 4377 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4378 } 4379 4380 for (User *U : Cur->users()) { 4381 Instruction *UI = cast<Instruction>(U); 4382 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4383 Visited.insert(UI).second) 4384 Worklist.push_back(UI); 4385 } 4386 } 4387 } 4388 4389 void InnerLoopVectorizer::fixLCSSAPHIs() { 4390 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4391 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4392 // Some phis were already hand updated by the reduction and recurrence 4393 // code above, leave them alone. 4394 continue; 4395 4396 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4397 // Non-instruction incoming values will have only one value. 4398 unsigned LastLane = 0; 4399 if (isa<Instruction>(IncomingValue)) 4400 LastLane = Cost->isUniformAfterVectorization( 4401 cast<Instruction>(IncomingValue), VF) 4402 ? 0 4403 : VF.getKnownMinValue() - 1; 4404 assert((!VF.isScalable() || LastLane == 0) && 4405 "scalable vectors dont support non-uniform scalars yet"); 4406 // Can be a loop invariant incoming value or the last scalar value to be 4407 // extracted from the vectorized loop. 
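// Asking for part UF - 1 and lane LastLane yields the scalar value produced by the last iteration covered by the vector loop, which is what the LCSSA phi must receive when the scalar epilogue is not taken.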
4408 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4409 Value *lastIncomingValue = 4410 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4411 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4412 } 4413 } 4414 4415 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4416 // The basic block and loop containing the predicated instruction. 4417 auto *PredBB = PredInst->getParent(); 4418 auto *VectorLoop = LI->getLoopFor(PredBB); 4419 4420 // Initialize a worklist with the operands of the predicated instruction. 4421 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4422 4423 // Holds instructions that we need to analyze again. An instruction may be 4424 // reanalyzed if we don't yet know if we can sink it or not. 4425 SmallVector<Instruction *, 8> InstsToReanalyze; 4426 4427 // Returns true if a given use occurs in the predicated block. Phi nodes use 4428 // their operands in their corresponding predecessor blocks. 4429 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4430 auto *I = cast<Instruction>(U.getUser()); 4431 BasicBlock *BB = I->getParent(); 4432 if (auto *Phi = dyn_cast<PHINode>(I)) 4433 BB = Phi->getIncomingBlock( 4434 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4435 return BB == PredBB; 4436 }; 4437 4438 // Iteratively sink the scalarized operands of the predicated instruction 4439 // into the block we created for it. When an instruction is sunk, it's 4440 // operands are then added to the worklist. The algorithm ends after one pass 4441 // through the worklist doesn't sink a single instruction. 4442 bool Changed; 4443 do { 4444 // Add the instructions that need to be reanalyzed to the worklist, and 4445 // reset the changed indicator. 4446 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4447 InstsToReanalyze.clear(); 4448 Changed = false; 4449 4450 while (!Worklist.empty()) { 4451 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4452 4453 // We can't sink an instruction if it is a phi node, is already in the 4454 // predicated block, is not in the loop, or may have side effects. 4455 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4456 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4457 continue; 4458 4459 // It's legal to sink the instruction if all its uses occur in the 4460 // predicated block. Otherwise, there's nothing to do yet, and we may 4461 // need to reanalyze the instruction. 4462 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4463 InstsToReanalyze.push_back(I); 4464 continue; 4465 } 4466 4467 // Move the instruction to the beginning of the predicated block, and add 4468 // it's operands to the worklist. 4469 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4470 Worklist.insert(I->op_begin(), I->op_end()); 4471 4472 // The sinking may have enabled other instructions to be sunk, so we will 4473 // need to iterate. 
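// Setting Changed causes the outer do-while loop to make another pass over the instructions queued for reanalysis.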
4474 Changed = true; 4475 } 4476 } while (Changed); 4477 } 4478 4479 void InnerLoopVectorizer::fixNonInductionPHIs() { 4480 for (PHINode *OrigPhi : OrigPHIsToFix) { 4481 PHINode *NewPhi = 4482 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4483 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4484 4485 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4486 predecessors(OrigPhi->getParent())); 4487 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4488 predecessors(NewPhi->getParent())); 4489 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4490 "Scalar and Vector BB should have the same number of predecessors"); 4491 4492 // The insertion point in Builder may be invalidated by the time we get 4493 // here. Force the Builder insertion point to something valid so that we do 4494 // not run into issues during insertion point restore in 4495 // getOrCreateVectorValue calls below. 4496 Builder.SetInsertPoint(NewPhi); 4497 4498 // The predecessor order is preserved and we can rely on mapping between 4499 // scalar and vector block predecessors. 4500 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4501 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4502 4503 // When looking up the new scalar/vector values to fix up, use incoming 4504 // values from original phi. 4505 Value *ScIncV = 4506 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4507 4508 // Scalar incoming value may need a broadcast 4509 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4510 NewPhi->addIncoming(NewIncV, NewPredBB); 4511 } 4512 } 4513 } 4514 4515 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4516 VPUser &Operands, unsigned UF, 4517 ElementCount VF, bool IsPtrLoopInvariant, 4518 SmallBitVector &IsIndexLoopInvariant, 4519 VPTransformState &State) { 4520 // Construct a vector GEP by widening the operands of the scalar GEP as 4521 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4522 // results in a vector of pointers when at least one operand of the GEP 4523 // is vector-typed. Thus, to keep the representation compact, we only use 4524 // vector-typed operands for loop-varying values. 4525 4526 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4527 // If we are vectorizing, but the GEP has only loop-invariant operands, 4528 // the GEP we build (by only using vector-typed operands for 4529 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4530 // produce a vector of pointers, we need to either arbitrarily pick an 4531 // operand to broadcast, or broadcast a clone of the original GEP. 4532 // Here, we broadcast a clone of the original. 4533 // 4534 // TODO: If at some point we decide to scalarize instructions having 4535 // loop-invariant operands, this special case will no longer be 4536 // required. We would add the scalarization decision to 4537 // collectLoopScalars() and teach getVectorValue() to broadcast 4538 // the lane-zero scalar value. 4539 auto *Clone = Builder.Insert(GEP->clone()); 4540 for (unsigned Part = 0; Part < UF; ++Part) { 4541 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4542 State.set(VPDef, GEP, EntryPart, Part); 4543 addMetadata(EntryPart, GEP); 4544 } 4545 } else { 4546 // If the GEP has at least one loop-varying operand, we are sure to 4547 // produce a vector of pointers. But if we are only unrolling, we want 4548 // to produce a scalar GEP for each unroll part. 
Thus, the GEP we 4549 // produce with the code below will be scalar (if VF == 1) or vector 4550 // (otherwise). Note that for the unroll-only case, we still maintain 4551 // values in the vector mapping with initVector, as we do for other 4552 // instructions. 4553 for (unsigned Part = 0; Part < UF; ++Part) { 4554 // The pointer operand of the new GEP. If it's loop-invariant, we 4555 // won't broadcast it. 4556 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4557 : State.get(Operands.getOperand(0), Part); 4558 4559 // Collect all the indices for the new GEP. If any index is 4560 // loop-invariant, we won't broadcast it. 4561 SmallVector<Value *, 4> Indices; 4562 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4563 VPValue *Operand = Operands.getOperand(I); 4564 if (IsIndexLoopInvariant[I - 1]) 4565 Indices.push_back(State.get(Operand, {0, 0})); 4566 else 4567 Indices.push_back(State.get(Operand, Part)); 4568 } 4569 4570 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4571 // but it should be a vector otherwise. 4572 auto *NewGEP = 4573 GEP->isInBounds() 4574 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4575 Indices) 4576 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4577 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4578 "NewGEP is not a pointer vector"); 4579 State.set(VPDef, GEP, NewGEP, Part); 4580 addMetadata(NewGEP, GEP); 4581 } 4582 } 4583 } 4584 4585 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4586 RecurrenceDescriptor *RdxDesc, 4587 Value *StartV, unsigned UF, 4588 ElementCount VF) { 4589 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4590 PHINode *P = cast<PHINode>(PN); 4591 if (EnableVPlanNativePath) { 4592 // Currently we enter here in the VPlan-native path for non-induction 4593 // PHIs where all control flow is uniform. We simply widen these PHIs. 4594 // Create a vector phi with no operands - the vector phi operands will be 4595 // set at the end of vector code generation. 4596 Type *VecTy = 4597 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); 4598 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4599 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4600 OrigPHIsToFix.push_back(P); 4601 4602 return; 4603 } 4604 4605 assert(PN->getParent() == OrigLoop->getHeader() && 4606 "Non-header phis should have been handled elsewhere"); 4607 4608 // In order to support recurrences we need to be able to vectorize Phi nodes. 4609 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4610 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4611 // this value when we vectorize all of the instructions that use the PHI. 4612 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4613 Value *Iden = nullptr; 4614 bool ScalarPHI = 4615 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4616 Type *VecTy = 4617 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4618 4619 if (RdxDesc) { 4620 assert(Legal->isReductionVariable(P) && StartV && 4621 "RdxDesc should only be set for reduction variables; in that case " 4622 "a StartV is also required"); 4623 RecurKind RK = RdxDesc->getRecurrenceKind(); 4624 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4625 // MinMax reductions have the start value as their identity.
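// That is, rather than seeding the vector phi with a neutral constant (as is done for add/mul below), every lane starts at the incoming start value, e.g. a splat of the start value for an smax reduction.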
4626 if (ScalarPHI) { 4627 Iden = StartV; 4628 } else { 4629 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4630 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4631 StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); 4632 } 4633 } else { 4634 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4635 RK, VecTy->getScalarType()); 4636 Iden = IdenC; 4637 4638 if (!ScalarPHI) { 4639 Iden = ConstantVector::getSplat(VF, IdenC); 4640 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4641 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4642 Constant *Zero = Builder.getInt32(0); 4643 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4644 } 4645 } 4646 } 4647 4648 for (unsigned Part = 0; Part < UF; ++Part) { 4649 // This is phase one of vectorizing PHIs. 4650 Value *EntryPart = PHINode::Create( 4651 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4652 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4653 if (StartV) { 4654 // Make sure to add the reduction start value only to the 4655 // first unroll part. 4656 Value *StartVal = (Part == 0) ? StartV : Iden; 4657 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4658 } 4659 } 4660 return; 4661 } 4662 4663 assert(!Legal->isReductionVariable(P) && 4664 "reductions should be handled above"); 4665 4666 setDebugLocFromInst(Builder, P); 4667 4668 // This PHINode must be an induction variable. 4669 // Make sure that we know about it. 4670 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4671 4672 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4673 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4674 4675 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4676 // which can be found from the original scalar operations. 4677 switch (II.getKind()) { 4678 case InductionDescriptor::IK_NoInduction: 4679 llvm_unreachable("Unknown induction"); 4680 case InductionDescriptor::IK_IntInduction: 4681 case InductionDescriptor::IK_FpInduction: 4682 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4683 case InductionDescriptor::IK_PtrInduction: { 4684 // Handle the pointer induction variable case. 4685 assert(P->getType()->isPointerTy() && "Unexpected type."); 4686 4687 if (Cost->isScalarAfterVectorization(P, VF)) { 4688 // This is the normalized GEP that starts counting at zero. 4689 Value *PtrInd = 4690 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4691 // Determine the number of scalars we need to generate for each unroll 4692 // iteration. If the instruction is uniform, we only need to generate the 4693 // first lane. Otherwise, we generate all VF values. 4694 unsigned Lanes = 4695 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4696 for (unsigned Part = 0; Part < UF; ++Part) { 4697 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4698 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4699 Lane + Part * VF.getKnownMinValue()); 4700 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4701 Value *SclrGep = 4702 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4703 SclrGep->setName("next.gep"); 4704 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4705 } 4706 } 4707 return; 4708 } 4709 assert(isa<SCEVConstant>(II.getStep()) && 4710 "Induction step not a SCEV constant!"); 4711 Type *PhiType = II.getStep()->getType(); 4712 4713 // Build a pointer phi 4714 Value *ScalarStartValue = II.getStartValue(); 4715 Type *ScStValueType = ScalarStartValue->getType(); 4716 PHINode *NewPointerPhi = 4717 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4718 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4719 4720 // A pointer induction, performed by using a gep 4721 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4722 Instruction *InductionLoc = LoopLatch->getTerminator(); 4723 const SCEV *ScalarStep = II.getStep(); 4724 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4725 Value *ScalarStepValue = 4726 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4727 Value *InductionGEP = GetElementPtrInst::Create( 4728 ScStValueType->getPointerElementType(), NewPointerPhi, 4729 Builder.CreateMul( 4730 ScalarStepValue, 4731 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4732 "ptr.ind", InductionLoc); 4733 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4734 4735 // Create UF many actual address geps that use the pointer 4736 // phi as base and a vectorized version of the step value 4737 // (<step*0, ..., step*N>) as offset. 4738 for (unsigned Part = 0; Part < UF; ++Part) { 4739 SmallVector<Constant *, 8> Indices; 4740 // Create a vector of consecutive numbers from zero to VF. 4741 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4742 Indices.push_back( 4743 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4744 Constant *StartOffset = ConstantVector::get(Indices); 4745 4746 Value *GEP = Builder.CreateGEP( 4747 ScStValueType->getPointerElementType(), NewPointerPhi, 4748 Builder.CreateMul( 4749 StartOffset, 4750 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4751 "vector.gep")); 4752 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4753 } 4754 } 4755 } 4756 } 4757 4758 /// A helper function for checking whether an integer division-related 4759 /// instruction may divide by zero (in which case it must be predicated if 4760 /// executed conditionally in the scalar code). 4761 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4762 /// Non-zero divisors that are non compile-time constants will not be 4763 /// converted into multiplication, so we will still end up scalarizing 4764 /// the division, but can do so w/o predication. 
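/// For example, a udiv whose divisor is a loop-varying load may be zero in lanes where the original scalar code would never have executed the division, so it must remain predicated.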
4765 static bool mayDivideByZero(Instruction &I) { 4766 assert((I.getOpcode() == Instruction::UDiv || 4767 I.getOpcode() == Instruction::SDiv || 4768 I.getOpcode() == Instruction::URem || 4769 I.getOpcode() == Instruction::SRem) && 4770 "Unexpected instruction"); 4771 Value *Divisor = I.getOperand(1); 4772 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4773 return !CInt || CInt->isZero(); 4774 } 4775 4776 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4777 VPUser &User, 4778 VPTransformState &State) { 4779 switch (I.getOpcode()) { 4780 case Instruction::Call: 4781 case Instruction::Br: 4782 case Instruction::PHI: 4783 case Instruction::GetElementPtr: 4784 case Instruction::Select: 4785 llvm_unreachable("This instruction is handled by a different recipe."); 4786 case Instruction::UDiv: 4787 case Instruction::SDiv: 4788 case Instruction::SRem: 4789 case Instruction::URem: 4790 case Instruction::Add: 4791 case Instruction::FAdd: 4792 case Instruction::Sub: 4793 case Instruction::FSub: 4794 case Instruction::FNeg: 4795 case Instruction::Mul: 4796 case Instruction::FMul: 4797 case Instruction::FDiv: 4798 case Instruction::FRem: 4799 case Instruction::Shl: 4800 case Instruction::LShr: 4801 case Instruction::AShr: 4802 case Instruction::And: 4803 case Instruction::Or: 4804 case Instruction::Xor: { 4805 // Just widen unops and binops. 4806 setDebugLocFromInst(Builder, &I); 4807 4808 for (unsigned Part = 0; Part < UF; ++Part) { 4809 SmallVector<Value *, 2> Ops; 4810 for (VPValue *VPOp : User.operands()) 4811 Ops.push_back(State.get(VPOp, Part)); 4812 4813 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4814 4815 if (auto *VecOp = dyn_cast<Instruction>(V)) 4816 VecOp->copyIRFlags(&I); 4817 4818 // Use this vector value for all users of the original instruction. 4819 State.set(Def, &I, V, Part); 4820 addMetadata(V, &I); 4821 } 4822 4823 break; 4824 } 4825 case Instruction::ICmp: 4826 case Instruction::FCmp: { 4827 // Widen compares. Generate vector compares. 4828 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4829 auto *Cmp = cast<CmpInst>(&I); 4830 setDebugLocFromInst(Builder, Cmp); 4831 for (unsigned Part = 0; Part < UF; ++Part) { 4832 Value *A = State.get(User.getOperand(0), Part); 4833 Value *B = State.get(User.getOperand(1), Part); 4834 Value *C = nullptr; 4835 if (FCmp) { 4836 // Propagate fast math flags. 4837 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4838 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4839 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4840 } else { 4841 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4842 } 4843 State.set(Def, &I, C, Part); 4844 addMetadata(C, &I); 4845 } 4846 4847 break; 4848 } 4849 4850 case Instruction::ZExt: 4851 case Instruction::SExt: 4852 case Instruction::FPToUI: 4853 case Instruction::FPToSI: 4854 case Instruction::FPExt: 4855 case Instruction::PtrToInt: 4856 case Instruction::IntToPtr: 4857 case Instruction::SIToFP: 4858 case Instruction::UIToFP: 4859 case Instruction::Trunc: 4860 case Instruction::FPTrunc: 4861 case Instruction::BitCast: { 4862 auto *CI = cast<CastInst>(&I); 4863 setDebugLocFromInst(Builder, CI); 4864 4865 /// Vectorize casts. 4866 Type *DestTy = 4867 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4868 4869 for (unsigned Part = 0; Part < UF; ++Part) { 4870 Value *A = State.get(User.getOperand(0), Part); 4871 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4872 State.set(Def, &I, Cast, Part); 4873 addMetadata(Cast, &I); 4874 } 4875 break; 4876 } 4877 default: 4878 // This instruction is not vectorized by simple widening. 4879 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4880 llvm_unreachable("Unhandled instruction!"); 4881 } // end of switch. 4882 } 4883 4884 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4885 VPUser &ArgOperands, 4886 VPTransformState &State) { 4887 assert(!isa<DbgInfoIntrinsic>(I) && 4888 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4889 setDebugLocFromInst(Builder, &I); 4890 4891 Module *M = I.getParent()->getParent()->getParent(); 4892 auto *CI = cast<CallInst>(&I); 4893 4894 SmallVector<Type *, 4> Tys; 4895 for (Value *ArgOperand : CI->arg_operands()) 4896 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4897 4898 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4899 4900 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4901 // version of the instruction. 4902 // Is it beneficial to perform intrinsic call compared to lib call? 4903 bool NeedToScalarize = false; 4904 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4905 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4906 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4907 assert((UseVectorIntrinsic || !NeedToScalarize) && 4908 "Instruction should be scalarized elsewhere."); 4909 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4910 "Cannot have invalid costs while widening"); 4911 4912 for (unsigned Part = 0; Part < UF; ++Part) { 4913 SmallVector<Value *, 4> Args; 4914 for (auto &I : enumerate(ArgOperands.operands())) { 4915 // Some intrinsics have a scalar argument - don't replace it with a 4916 // vector. 4917 Value *Arg; 4918 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4919 Arg = State.get(I.value(), Part); 4920 else 4921 Arg = State.get(I.value(), {0, 0}); 4922 Args.push_back(Arg); 4923 } 4924 4925 Function *VectorF; 4926 if (UseVectorIntrinsic) { 4927 // Use vector version of the intrinsic. 4928 Type *TysForDecl[] = {CI->getType()}; 4929 if (VF.isVector()) { 4930 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4931 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4932 } 4933 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4934 assert(VectorF && "Can't retrieve vector intrinsic."); 4935 } else { 4936 // Use vector version of the function call. 
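// The VFDatabase lookup below returns a vector variant of the callee whose shape matches the requested VF, e.g. a mapping recorded on the call site by InjectTLIMappings.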
4937 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4938 #ifndef NDEBUG 4939 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4940 "Can't create vector function."); 4941 #endif 4942 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4943 } 4944 SmallVector<OperandBundleDef, 1> OpBundles; 4945 CI->getOperandBundlesAsDefs(OpBundles); 4946 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4947 4948 if (isa<FPMathOperator>(V)) 4949 V->copyFastMathFlags(CI); 4950 4951 State.set(Def, &I, V, Part); 4952 addMetadata(V, &I); 4953 } 4954 } 4955 4956 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4957 VPUser &Operands, 4958 bool InvariantCond, 4959 VPTransformState &State) { 4960 setDebugLocFromInst(Builder, &I); 4961 4962 // The condition can be loop invariant but still defined inside the 4963 // loop. This means that we can't just use the original 'cond' value. 4964 // We have to take the 'vectorized' value and pick the first lane. 4965 // Instcombine will make this a no-op. 4966 auto *InvarCond = 4967 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4968 4969 for (unsigned Part = 0; Part < UF; ++Part) { 4970 Value *Cond = 4971 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4972 Value *Op0 = State.get(Operands.getOperand(1), Part); 4973 Value *Op1 = State.get(Operands.getOperand(2), Part); 4974 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4975 State.set(VPDef, &I, Sel, Part); 4976 addMetadata(Sel, &I); 4977 } 4978 } 4979 4980 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4981 // We should not collect Scalars more than once per VF. Right now, this 4982 // function is called from collectUniformsAndScalars(), which already does 4983 // this check. Collecting Scalars for VF=1 does not make any sense. 4984 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4985 "This function should not be visited twice for the same VF"); 4986 4987 SmallSetVector<Instruction *, 8> Worklist; 4988 4989 // These sets are used to seed the analysis with pointers used by memory 4990 // accesses that will remain scalar. 4991 SmallSetVector<Instruction *, 8> ScalarPtrs; 4992 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4993 auto *Latch = TheLoop->getLoopLatch(); 4994 4995 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4996 // The pointer operands of loads and stores will be scalar as long as the 4997 // memory access is not a gather or scatter operation. The value operand of a 4998 // store will remain scalar if the store is scalarized. 4999 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5000 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5001 assert(WideningDecision != CM_Unknown && 5002 "Widening decision should be ready at this moment"); 5003 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5004 if (Ptr == Store->getValueOperand()) 5005 return WideningDecision == CM_Scalarize; 5006 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5007 "Ptr is neither a value or pointer operand"); 5008 return WideningDecision != CM_GatherScatter; 5009 }; 5010 5011 // A helper that returns true if the given value is a bitcast or 5012 // getelementptr instruction contained in the loop. 
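// Note that, as the lambda's name indicates, the value must also be loop-varying; loop-invariant bitcasts and GEPs are not candidates here.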
5013 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5014 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5015 isa<GetElementPtrInst>(V)) && 5016 !TheLoop->isLoopInvariant(V); 5017 }; 5018 5019 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5020 if (!isa<PHINode>(Ptr) || 5021 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5022 return false; 5023 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5024 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5025 return false; 5026 return isScalarUse(MemAccess, Ptr); 5027 }; 5028 5029 // A helper that evaluates a memory access's use of a pointer. If the 5030 // pointer is actually the pointer induction of a loop, it is 5031 // inserted into Worklist. If the use will be a scalar use, and the 5032 // pointer is only used by memory accesses, we place the pointer in 5033 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5034 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5035 if (isScalarPtrInduction(MemAccess, Ptr)) { 5036 Worklist.insert(cast<Instruction>(Ptr)); 5037 Instruction *Update = cast<Instruction>( 5038 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5039 Worklist.insert(Update); 5040 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5041 << "\n"); 5042 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5043 << "\n"); 5044 return; 5045 } 5046 // We only care about bitcast and getelementptr instructions contained in 5047 // the loop. 5048 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5049 return; 5050 5051 // If the pointer has already been identified as scalar (e.g., if it was 5052 // also identified as uniform), there's nothing to do. 5053 auto *I = cast<Instruction>(Ptr); 5054 if (Worklist.count(I)) 5055 return; 5056 5057 // If the use of the pointer will be a scalar use, and all users of the 5058 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5059 // place the pointer in PossibleNonScalarPtrs. 5060 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5061 return isa<LoadInst>(U) || isa<StoreInst>(U); 5062 })) 5063 ScalarPtrs.insert(I); 5064 else 5065 PossibleNonScalarPtrs.insert(I); 5066 }; 5067 5068 // We seed the scalars analysis with two classes of instructions: (1) 5069 // instructions marked uniform-after-vectorization and (2) bitcast, 5070 // getelementptr and (pointer) phi instructions used by memory accesses 5071 // requiring a scalar use. 5072 // 5073 // (1) Add to the worklist all instructions that have been identified as 5074 // uniform-after-vectorization. 5075 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5076 5077 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5078 // memory accesses requiring a scalar use. The pointer operands of loads and 5079 // stores will be scalar as long as the memory access is not a gather or 5080 // scatter operation. The value operand of a store will remain scalar if the 5081 // store is scalarized.
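// For example, the address feeding a consecutive (widened or interleaved) access can stay scalar, whereas a gather/scatter needs a vector of pointers, so its address is recorded as possibly non-scalar.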
5082 for (auto *BB : TheLoop->blocks()) 5083 for (auto &I : *BB) { 5084 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5085 evaluatePtrUse(Load, Load->getPointerOperand()); 5086 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5087 evaluatePtrUse(Store, Store->getPointerOperand()); 5088 evaluatePtrUse(Store, Store->getValueOperand()); 5089 } 5090 } 5091 for (auto *I : ScalarPtrs) 5092 if (!PossibleNonScalarPtrs.count(I)) { 5093 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5094 Worklist.insert(I); 5095 } 5096 5097 // Insert the forced scalars. 5098 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5099 // induction variable when the PHI user is scalarized. 5100 auto ForcedScalar = ForcedScalars.find(VF); 5101 if (ForcedScalar != ForcedScalars.end()) 5102 for (auto *I : ForcedScalar->second) 5103 Worklist.insert(I); 5104 5105 // Expand the worklist by looking through any bitcasts and getelementptr 5106 // instructions we've already identified as scalar. This is similar to the 5107 // expansion step in collectLoopUniforms(); however, here we're only 5108 // expanding to include additional bitcasts and getelementptr instructions. 5109 unsigned Idx = 0; 5110 while (Idx != Worklist.size()) { 5111 Instruction *Dst = Worklist[Idx++]; 5112 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5113 continue; 5114 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5115 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5116 auto *J = cast<Instruction>(U); 5117 return !TheLoop->contains(J) || Worklist.count(J) || 5118 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5119 isScalarUse(J, Src)); 5120 })) { 5121 Worklist.insert(Src); 5122 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5123 } 5124 } 5125 5126 // An induction variable will remain scalar if all users of the induction 5127 // variable and induction variable update remain scalar. 5128 for (auto &Induction : Legal->getInductionVars()) { 5129 auto *Ind = Induction.first; 5130 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5131 5132 // If tail-folding is applied, the primary induction variable will be used 5133 // to feed a vector compare. 5134 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5135 continue; 5136 5137 // Determine if all users of the induction variable are scalar after 5138 // vectorization. 5139 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5140 auto *I = cast<Instruction>(U); 5141 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5142 }); 5143 if (!ScalarInd) 5144 continue; 5145 5146 // Determine if all users of the induction variable update instruction are 5147 // scalar after vectorization. 5148 auto ScalarIndUpdate = 5149 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5150 auto *I = cast<Instruction>(U); 5151 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5152 }); 5153 if (!ScalarIndUpdate) 5154 continue; 5155 5156 // The induction variable and its update instruction will remain scalar. 
5157 Worklist.insert(Ind); 5158 Worklist.insert(IndUpdate); 5159 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5160 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5161 << "\n"); 5162 } 5163 5164 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5165 } 5166 5167 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5168 ElementCount VF) { 5169 if (!blockNeedsPredication(I->getParent())) 5170 return false; 5171 switch(I->getOpcode()) { 5172 default: 5173 break; 5174 case Instruction::Load: 5175 case Instruction::Store: { 5176 if (!Legal->isMaskRequired(I)) 5177 return false; 5178 auto *Ptr = getLoadStorePointerOperand(I); 5179 auto *Ty = getMemInstValueType(I); 5180 // We have already decided how to vectorize this instruction, get that 5181 // result. 5182 if (VF.isVector()) { 5183 InstWidening WideningDecision = getWideningDecision(I, VF); 5184 assert(WideningDecision != CM_Unknown && 5185 "Widening decision should be ready at this moment"); 5186 return WideningDecision == CM_Scalarize; 5187 } 5188 const Align Alignment = getLoadStoreAlignment(I); 5189 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5190 isLegalMaskedGather(Ty, Alignment)) 5191 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5192 isLegalMaskedScatter(Ty, Alignment)); 5193 } 5194 case Instruction::UDiv: 5195 case Instruction::SDiv: 5196 case Instruction::SRem: 5197 case Instruction::URem: 5198 return mayDivideByZero(*I); 5199 } 5200 return false; 5201 } 5202 5203 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5204 Instruction *I, ElementCount VF) { 5205 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5206 assert(getWideningDecision(I, VF) == CM_Unknown && 5207 "Decision should not be set yet."); 5208 auto *Group = getInterleavedAccessGroup(I); 5209 assert(Group && "Must have a group."); 5210 5211 // If the instruction's allocated size doesn't equal it's type size, it 5212 // requires padding and will be scalarized. 5213 auto &DL = I->getModule()->getDataLayout(); 5214 auto *ScalarTy = getMemInstValueType(I); 5215 if (hasIrregularType(ScalarTy, DL, VF)) 5216 return false; 5217 5218 // Check if masking is required. 5219 // A Group may need masking for one of two reasons: it resides in a block that 5220 // needs predication, or it was decided to use masking to deal with gaps. 5221 bool PredicatedAccessRequiresMasking = 5222 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5223 bool AccessWithGapsRequiresMasking = 5224 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5225 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5226 return true; 5227 5228 // If masked interleaving is required, we expect that the user/target had 5229 // enabled it, because otherwise it either wouldn't have been created or 5230 // it should have been invalidated by the CostModel. 5231 assert(useMaskedInterleavedAccesses(TTI) && 5232 "Masked interleave-groups for predicated accesses are not enabled."); 5233 5234 auto *Ty = getMemInstValueType(I); 5235 const Align Alignment = getLoadStoreAlignment(I); 5236 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5237 : TTI.isLegalMaskedStore(Ty, Alignment); 5238 } 5239 5240 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5241 Instruction *I, ElementCount VF) { 5242 // Get and ensure we have a valid memory instruction. 
5243 LoadInst *LI = dyn_cast<LoadInst>(I); 5244 StoreInst *SI = dyn_cast<StoreInst>(I); 5245 assert((LI || SI) && "Invalid memory instruction"); 5246 5247 auto *Ptr = getLoadStorePointerOperand(I); 5248 5249 // In order to be widened, the pointer should be consecutive, first of all. 5250 if (!Legal->isConsecutivePtr(Ptr)) 5251 return false; 5252 5253 // If the instruction is a store located in a predicated block, it will be 5254 // scalarized. 5255 if (isScalarWithPredication(I)) 5256 return false; 5257 5258 // If the instruction's allocated size doesn't equal it's type size, it 5259 // requires padding and will be scalarized. 5260 auto &DL = I->getModule()->getDataLayout(); 5261 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5262 if (hasIrregularType(ScalarTy, DL, VF)) 5263 return false; 5264 5265 return true; 5266 } 5267 5268 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5269 // We should not collect Uniforms more than once per VF. Right now, 5270 // this function is called from collectUniformsAndScalars(), which 5271 // already does this check. Collecting Uniforms for VF=1 does not make any 5272 // sense. 5273 5274 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5275 "This function should not be visited twice for the same VF"); 5276 5277 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5278 // not analyze again. Uniforms.count(VF) will return 1. 5279 Uniforms[VF].clear(); 5280 5281 // We now know that the loop is vectorizable! 5282 // Collect instructions inside the loop that will remain uniform after 5283 // vectorization. 5284 5285 // Global values, params and instructions outside of current loop are out of 5286 // scope. 5287 auto isOutOfScope = [&](Value *V) -> bool { 5288 Instruction *I = dyn_cast<Instruction>(V); 5289 return (!I || !TheLoop->contains(I)); 5290 }; 5291 5292 SetVector<Instruction *> Worklist; 5293 BasicBlock *Latch = TheLoop->getLoopLatch(); 5294 5295 // Instructions that are scalar with predication must not be considered 5296 // uniform after vectorization, because that would create an erroneous 5297 // replicating region where only a single instance out of VF should be formed. 5298 // TODO: optimize such seldom cases if found important, see PR40816. 5299 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5300 if (isOutOfScope(I)) { 5301 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5302 << *I << "\n"); 5303 return; 5304 } 5305 if (isScalarWithPredication(I, VF)) { 5306 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5307 << *I << "\n"); 5308 return; 5309 } 5310 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5311 Worklist.insert(I); 5312 }; 5313 5314 // Start with the conditional branch. If the branch condition is an 5315 // instruction contained in the loop that is only used by the branch, it is 5316 // uniform. 5317 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5318 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5319 addToWorklistIfAllowed(Cmp); 5320 5321 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5322 InstWidening WideningDecision = getWideningDecision(I, VF); 5323 assert(WideningDecision != CM_Unknown && 5324 "Widening decision should be ready at this moment"); 5325 5326 // A uniform memory op is itself uniform. We exclude uniform stores 5327 // here as they demand the last lane, not the first one. 
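// For example, a load from a loop-invariant address produces the same value in every iteration, so only lane 0 is needed; a store to a loop-invariant address instead needs the value from the last iteration.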
5328 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5329 assert(WideningDecision == CM_Scalarize); 5330 return true; 5331 } 5332 5333 return (WideningDecision == CM_Widen || 5334 WideningDecision == CM_Widen_Reverse || 5335 WideningDecision == CM_Interleave); 5336 }; 5337 5338 5339 // Returns true if Ptr is the pointer operand of a memory access instruction 5340 // I, and I is known to not require scalarization. 5341 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5342 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5343 }; 5344 5345 // Holds a list of values which are known to have at least one uniform use. 5346 // Note that there may be other uses which aren't uniform. A "uniform use" 5347 // here is something which only demands lane 0 of the unrolled iterations; 5348 // it does not imply that all lanes produce the same value (e.g. this is not 5349 // the usual meaning of uniform) 5350 SmallPtrSet<Value *, 8> HasUniformUse; 5351 5352 // Scan the loop for instructions which are either a) known to have only 5353 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5354 for (auto *BB : TheLoop->blocks()) 5355 for (auto &I : *BB) { 5356 // If there's no pointer operand, there's nothing to do. 5357 auto *Ptr = getLoadStorePointerOperand(&I); 5358 if (!Ptr) 5359 continue; 5360 5361 // A uniform memory op is itself uniform. We exclude uniform stores 5362 // here as they demand the last lane, not the first one. 5363 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5364 addToWorklistIfAllowed(&I); 5365 5366 if (isUniformDecision(&I, VF)) { 5367 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5368 HasUniformUse.insert(Ptr); 5369 } 5370 } 5371 5372 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5373 // demanding) users. Since loops are assumed to be in LCSSA form, this 5374 // disallows uses outside the loop as well. 5375 for (auto *V : HasUniformUse) { 5376 if (isOutOfScope(V)) 5377 continue; 5378 auto *I = cast<Instruction>(V); 5379 auto UsersAreMemAccesses = 5380 llvm::all_of(I->users(), [&](User *U) -> bool { 5381 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5382 }); 5383 if (UsersAreMemAccesses) 5384 addToWorklistIfAllowed(I); 5385 } 5386 5387 // Expand Worklist in topological order: whenever a new instruction 5388 // is added , its users should be already inside Worklist. It ensures 5389 // a uniform instruction will only be used by uniform instructions. 5390 unsigned idx = 0; 5391 while (idx != Worklist.size()) { 5392 Instruction *I = Worklist[idx++]; 5393 5394 for (auto OV : I->operand_values()) { 5395 // isOutOfScope operands cannot be uniform instructions. 5396 if (isOutOfScope(OV)) 5397 continue; 5398 // First order recurrence Phi's should typically be considered 5399 // non-uniform. 5400 auto *OP = dyn_cast<PHINode>(OV); 5401 if (OP && Legal->isFirstOrderRecurrence(OP)) 5402 continue; 5403 // If all the users of the operand are uniform, then add the 5404 // operand into the uniform worklist. 5405 auto *OI = cast<Instruction>(OV); 5406 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5407 auto *J = cast<Instruction>(U); 5408 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5409 })) 5410 addToWorklistIfAllowed(OI); 5411 } 5412 } 5413 5414 // For an instruction to be added into Worklist above, all its users inside 5415 // the loop should also be in Worklist. 
  // However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return MaxVF;
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return MaxVF;
    }
    return None;
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
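  // For example, with MaxVF = 8 and UserIC = 2, a loop whose trip count is
  // known to be 64 has 64 % (8 * 2) == 0, so no scalar tail remains and tail
  // folding is unnecessary; a trip count of 60 would leave a remainder and
  // fall through to the tail-folding logic below.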
5574 ScalarEvolution *SE = PSE.getSE(); 5575 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5576 const SCEV *ExitCount = SE->getAddExpr( 5577 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5578 const SCEV *Rem = SE->getURemExpr( 5579 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5580 if (Rem->isZero()) { 5581 // Accept MaxVF if we do not have a tail. 5582 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5583 return MaxVF; 5584 } 5585 5586 // If we don't know the precise trip count, or if the trip count that we 5587 // found modulo the vectorization factor is not zero, try to fold the tail 5588 // by masking. 5589 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5590 if (Legal->prepareToFoldTailByMasking()) { 5591 FoldTailByMasking = true; 5592 return MaxVF; 5593 } 5594 5595 // If there was a tail-folding hint/switch, but we can't fold the tail by 5596 // masking, fallback to a vectorization with a scalar epilogue. 5597 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5598 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5599 "scalar epilogue instead.\n"); 5600 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5601 return MaxVF; 5602 } 5603 5604 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5605 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5606 return None; 5607 } 5608 5609 if (TC == 0) { 5610 reportVectorizationFailure( 5611 "Unable to calculate the loop count due to complex control flow", 5612 "unable to calculate the loop count due to complex control flow", 5613 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5614 return None; 5615 } 5616 5617 reportVectorizationFailure( 5618 "Cannot optimize for size and vectorize at the same time.", 5619 "cannot optimize for size and vectorize at the same time. " 5620 "Enable vectorization of this loop with '#pragma clang loop " 5621 "vectorize(enable)' when compiling with -Os/-Oz", 5622 "NoTailLoopWithOptForSize", ORE, TheLoop); 5623 return None; 5624 } 5625 5626 ElementCount 5627 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5628 ElementCount UserVF) { 5629 bool IgnoreScalableUserVF = UserVF.isScalable() && 5630 !TTI.supportsScalableVectors() && 5631 !ForceTargetSupportsScalableVectors; 5632 if (IgnoreScalableUserVF) { 5633 LLVM_DEBUG( 5634 dbgs() << "LV: Ignoring VF=" << UserVF 5635 << " because target does not support scalable vectors.\n"); 5636 ORE->emit([&]() { 5637 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5638 TheLoop->getStartLoc(), 5639 TheLoop->getHeader()) 5640 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5641 << " because target does not support scalable vectors."; 5642 }); 5643 } 5644 5645 // Beyond this point two scenarios are handled. If UserVF isn't specified 5646 // then a suitable VF is chosen. If UserVF is specified and there are 5647 // dependencies, check if it's legal. However, if a UserVF is specified and 5648 // there are no dependencies, then there's nothing to do. 
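  // For example, a loop with no memory dependence between iterations simply
  // returns a user-requested VF of 8 unchanged below, whereas a loop that is
  // only safe for a narrower width falls through to the clamping logic that
  // follows.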
5649 if (UserVF.isNonZero() && !IgnoreScalableUserVF && 5650 Legal->isSafeForAnyVectorWidth()) 5651 return UserVF; 5652 5653 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5654 unsigned SmallestType, WidestType; 5655 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5656 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5657 5658 // Get the maximum safe dependence distance in bits computed by LAA. 5659 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5660 // the memory accesses that is most restrictive (involved in the smallest 5661 // dependence distance). 5662 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5663 5664 // If the user vectorization factor is legally unsafe, clamp it to a safe 5665 // value. Otherwise, return as is. 5666 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5667 unsigned MaxSafeElements = 5668 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5669 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5670 5671 if (UserVF.isScalable()) { 5672 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5673 5674 // Scale VF by vscale before checking if it's safe. 5675 MaxSafeVF = ElementCount::getScalable( 5676 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5677 5678 if (MaxSafeVF.isZero()) { 5679 // The dependence distance is too small to use scalable vectors, 5680 // fallback on fixed. 5681 LLVM_DEBUG( 5682 dbgs() 5683 << "LV: Max legal vector width too small, scalable vectorization " 5684 "unfeasible. Using fixed-width vectorization instead.\n"); 5685 ORE->emit([&]() { 5686 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5687 TheLoop->getStartLoc(), 5688 TheLoop->getHeader()) 5689 << "Max legal vector width too small, scalable vectorization " 5690 << "unfeasible. Using fixed-width vectorization instead."; 5691 }); 5692 return computeFeasibleMaxVF( 5693 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5694 } 5695 } 5696 5697 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5698 5699 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5700 return UserVF; 5701 5702 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5703 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5704 << ".\n"); 5705 ORE->emit([&]() { 5706 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5707 TheLoop->getStartLoc(), 5708 TheLoop->getHeader()) 5709 << "User-specified vectorization factor " 5710 << ore::NV("UserVectorizationFactor", UserVF) 5711 << " is unsafe, clamping to maximum safe vectorization factor " 5712 << ore::NV("VectorizationFactor", MaxSafeVF); 5713 }); 5714 return MaxSafeVF; 5715 } 5716 5717 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5718 5719 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5720 // Note that both WidestRegister and WidestType may not be a powers of 2. 
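  // For example, a widest safe register width of 96 bits with a widest scalar
  // type of 32 bits gives 96 / 32 = 3 elements, which PowerOf2Floor rounds
  // down to a MaxVectorSize of 2.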
5721 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5722 5723 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5724 << " / " << WidestType << " bits.\n"); 5725 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5726 << WidestRegister << " bits.\n"); 5727 5728 assert(MaxVectorSize <= WidestRegister && 5729 "Did not expect to pack so many elements" 5730 " into one vector!"); 5731 if (MaxVectorSize == 0) { 5732 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5733 MaxVectorSize = 1; 5734 return ElementCount::getFixed(MaxVectorSize); 5735 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5736 isPowerOf2_32(ConstTripCount)) { 5737 // We need to clamp the VF to be the ConstTripCount. There is no point in 5738 // choosing a higher viable VF as done in the loop below. 5739 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5740 << ConstTripCount << "\n"); 5741 MaxVectorSize = ConstTripCount; 5742 return ElementCount::getFixed(MaxVectorSize); 5743 } 5744 5745 unsigned MaxVF = MaxVectorSize; 5746 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5747 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5748 // Collect all viable vectorization factors larger than the default MaxVF 5749 // (i.e. MaxVectorSize). 5750 SmallVector<ElementCount, 8> VFs; 5751 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5752 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5753 VFs.push_back(ElementCount::getFixed(VS)); 5754 5755 // For each VF calculate its register usage. 5756 auto RUs = calculateRegisterUsage(VFs); 5757 5758 // Select the largest VF which doesn't require more registers than existing 5759 // ones. 5760 for (int i = RUs.size() - 1; i >= 0; --i) { 5761 bool Selected = true; 5762 for (auto& pair : RUs[i].MaxLocalUsers) { 5763 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5764 if (pair.second > TargetNumRegisters) 5765 Selected = false; 5766 } 5767 if (Selected) { 5768 MaxVF = VFs[i].getKnownMinValue(); 5769 break; 5770 } 5771 } 5772 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5773 if (MaxVF < MinVF) { 5774 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5775 << ") with target's minimum: " << MinVF << '\n'); 5776 MaxVF = MinVF; 5777 } 5778 } 5779 } 5780 return ElementCount::getFixed(MaxVF); 5781 } 5782 5783 VectorizationFactor 5784 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5785 // FIXME: This can be fixed for scalable vectors later, because at this stage 5786 // the LoopVectorizer will only consider vectorizing a loop with scalable 5787 // vectors when the loop has a hint to enable vectorization for a given VF. 5788 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5789 5790 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5791 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5792 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5793 5794 unsigned Width = 1; 5795 const float ScalarCost = *ExpectedCost.getValue(); 5796 float Cost = ScalarCost; 5797 5798 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5799 if (ForceVectorization && MaxVF.isVector()) { 5800 // Ignore scalar width, because the user explicitly wants vectorization. 5801 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5802 // evaluation. 
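    // Seeding Cost with the maximum float value means the first vector
    // candidate examined below (VF = 2) always compares as cheaper, so a
    // forced vectorization never falls back to VF = 1 purely on cost grounds.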
5803 Cost = std::numeric_limits<float>::max(); 5804 } 5805 5806 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5807 // Notice that the vector loop needs to be executed less times, so 5808 // we need to divide the cost of the vector loops by the width of 5809 // the vector elements. 5810 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5811 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5812 float VectorCost = *C.first.getValue() / (float)i; 5813 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5814 << " costs: " << (int)VectorCost << ".\n"); 5815 if (!C.second && !ForceVectorization) { 5816 LLVM_DEBUG( 5817 dbgs() << "LV: Not considering vector loop of width " << i 5818 << " because it will not generate any vector instructions.\n"); 5819 continue; 5820 } 5821 5822 // If profitable add it to ProfitableVF list. 5823 if (VectorCost < ScalarCost) { 5824 ProfitableVFs.push_back(VectorizationFactor( 5825 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5826 } 5827 5828 if (VectorCost < Cost) { 5829 Cost = VectorCost; 5830 Width = i; 5831 } 5832 } 5833 5834 if (!EnableCondStoresVectorization && NumPredStores) { 5835 reportVectorizationFailure("There are conditional stores.", 5836 "store that is conditionally executed prevents vectorization", 5837 "ConditionalStore", ORE, TheLoop); 5838 Width = 1; 5839 Cost = ScalarCost; 5840 } 5841 5842 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5843 << "LV: Vectorization seems to be not beneficial, " 5844 << "but was forced by a user.\n"); 5845 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5846 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5847 (unsigned)(Width * Cost)}; 5848 return Factor; 5849 } 5850 5851 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5852 const Loop &L, ElementCount VF) const { 5853 // Cross iteration phis such as reductions need special handling and are 5854 // currently unsupported. 5855 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5856 return Legal->isFirstOrderRecurrence(&Phi) || 5857 Legal->isReductionVariable(&Phi); 5858 })) 5859 return false; 5860 5861 // Phis with uses outside of the loop require special handling and are 5862 // currently unsupported. 5863 for (auto &Entry : Legal->getInductionVars()) { 5864 // Look for uses of the value of the induction at the last iteration. 5865 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5866 for (User *U : PostInc->users()) 5867 if (!L.contains(cast<Instruction>(U))) 5868 return false; 5869 // Look for uses of penultimate value of the induction. 5870 for (User *U : Entry.first->users()) 5871 if (!L.contains(cast<Instruction>(U))) 5872 return false; 5873 } 5874 5875 // Induction variables that are widened require special handling that is 5876 // currently not supported. 5877 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5878 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5879 this->isProfitableToScalarize(Entry.first, VF)); 5880 })) 5881 return false; 5882 5883 return true; 5884 } 5885 5886 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5887 const ElementCount VF) const { 5888 // FIXME: We need a much better cost-model to take different parameters such 5889 // as register pressure, code size increase and cost of extra branches into 5890 // account. 
For now we apply a very crude heuristic and only consider loops 5891 // with vectorization factors larger than a certain value. 5892 // We also consider epilogue vectorization unprofitable for targets that don't 5893 // consider interleaving beneficial (eg. MVE). 5894 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5895 return false; 5896 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5897 return true; 5898 return false; 5899 } 5900 5901 VectorizationFactor 5902 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5903 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5904 VectorizationFactor Result = VectorizationFactor::Disabled(); 5905 if (!EnableEpilogueVectorization) { 5906 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5907 return Result; 5908 } 5909 5910 if (!isScalarEpilogueAllowed()) { 5911 LLVM_DEBUG( 5912 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5913 "allowed.\n";); 5914 return Result; 5915 } 5916 5917 // FIXME: This can be fixed for scalable vectors later, because at this stage 5918 // the LoopVectorizer will only consider vectorizing a loop with scalable 5919 // vectors when the loop has a hint to enable vectorization for a given VF. 5920 if (MainLoopVF.isScalable()) { 5921 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5922 "yet supported.\n"); 5923 return Result; 5924 } 5925 5926 // Not really a cost consideration, but check for unsupported cases here to 5927 // simplify the logic. 5928 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5929 LLVM_DEBUG( 5930 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5931 "not a supported candidate.\n";); 5932 return Result; 5933 } 5934 5935 if (EpilogueVectorizationForceVF > 1) { 5936 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5937 if (LVP.hasPlanWithVFs( 5938 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5939 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5940 else { 5941 LLVM_DEBUG( 5942 dbgs() 5943 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5944 return Result; 5945 } 5946 } 5947 5948 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5949 TheLoop->getHeader()->getParent()->hasMinSize()) { 5950 LLVM_DEBUG( 5951 dbgs() 5952 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5953 return Result; 5954 } 5955 5956 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5957 return Result; 5958 5959 for (auto &NextVF : ProfitableVFs) 5960 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5961 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5962 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5963 Result = NextVF; 5964 5965 if (Result != VectorizationFactor::Disabled()) 5966 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5967 << Result.Width.getFixedValue() << "\n";); 5968 return Result; 5969 } 5970 5971 std::pair<unsigned, unsigned> 5972 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5973 unsigned MinWidth = -1U; 5974 unsigned MaxWidth = 8; 5975 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5976 5977 // For each block. 5978 for (BasicBlock *BB : TheLoop->blocks()) { 5979 // For each instruction in the loop. 5980 for (Instruction &I : BB->instructionsWithoutDebug()) { 5981 Type *T = I.getType(); 5982 5983 // Skip ignored values. 
5984 if (ValuesToIgnore.count(&I)) 5985 continue; 5986 5987 // Only examine Loads, Stores and PHINodes. 5988 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5989 continue; 5990 5991 // Examine PHI nodes that are reduction variables. Update the type to 5992 // account for the recurrence type. 5993 if (auto *PN = dyn_cast<PHINode>(&I)) { 5994 if (!Legal->isReductionVariable(PN)) 5995 continue; 5996 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5997 if (PreferInLoopReductions || 5998 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5999 RdxDesc.getRecurrenceType(), 6000 TargetTransformInfo::ReductionFlags())) 6001 continue; 6002 T = RdxDesc.getRecurrenceType(); 6003 } 6004 6005 // Examine the stored values. 6006 if (auto *ST = dyn_cast<StoreInst>(&I)) 6007 T = ST->getValueOperand()->getType(); 6008 6009 // Ignore loaded pointer types and stored pointer types that are not 6010 // vectorizable. 6011 // 6012 // FIXME: The check here attempts to predict whether a load or store will 6013 // be vectorized. We only know this for certain after a VF has 6014 // been selected. Here, we assume that if an access can be 6015 // vectorized, it will be. We should also look at extending this 6016 // optimization to non-pointer types. 6017 // 6018 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6019 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6020 continue; 6021 6022 MinWidth = std::min(MinWidth, 6023 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6024 MaxWidth = std::max(MaxWidth, 6025 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6026 } 6027 } 6028 6029 return {MinWidth, MaxWidth}; 6030 } 6031 6032 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6033 unsigned LoopCost) { 6034 // -- The interleave heuristics -- 6035 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6036 // There are many micro-architectural considerations that we can't predict 6037 // at this level. For example, frontend pressure (on decode or fetch) due to 6038 // code size, or the number and capabilities of the execution ports. 6039 // 6040 // We use the following heuristics to select the interleave count: 6041 // 1. If the code has reductions, then we interleave to break the cross 6042 // iteration dependency. 6043 // 2. If the loop is really small, then we interleave to reduce the loop 6044 // overhead. 6045 // 3. We don't interleave if we think that we will spill registers to memory 6046 // due to the increased register pressure. 6047 6048 if (!isScalarEpilogueAllowed()) 6049 return 1; 6050 6051 // We used the distance for the interleave count. 6052 if (Legal->getMaxSafeDepDistBytes() != -1U) 6053 return 1; 6054 6055 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6056 const bool HasReductions = !Legal->getReductionVars().empty(); 6057 // Do not interleave loops with a relatively small known or estimated trip 6058 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6059 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6060 // because with the above conditions interleaving can expose ILP and break 6061 // cross iteration dependences for reductions. 
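  // For example, a small loop computing a scalar reduction such as
  // "sum += a[i]" can still be interleaved here: each interleaved copy keeps
  // its own partial sum, so the serial dependence on "sum" no longer limits
  // instruction-level parallelism.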
6062 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6063 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6064 return 1; 6065 6066 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6067 // We divide by these constants so assume that we have at least one 6068 // instruction that uses at least one register. 6069 for (auto& pair : R.MaxLocalUsers) { 6070 pair.second = std::max(pair.second, 1U); 6071 } 6072 6073 // We calculate the interleave count using the following formula. 6074 // Subtract the number of loop invariants from the number of available 6075 // registers. These registers are used by all of the interleaved instances. 6076 // Next, divide the remaining registers by the number of registers that is 6077 // required by the loop, in order to estimate how many parallel instances 6078 // fit without causing spills. All of this is rounded down if necessary to be 6079 // a power of two. We want power of two interleave count to simplify any 6080 // addressing operations or alignment considerations. 6081 // We also want power of two interleave counts to ensure that the induction 6082 // variable of the vector loop wraps to zero, when tail is folded by masking; 6083 // this currently happens when OptForSize, in which case IC is set to 1 above. 6084 unsigned IC = UINT_MAX; 6085 6086 for (auto& pair : R.MaxLocalUsers) { 6087 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6088 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6089 << " registers of " 6090 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6091 if (VF.isScalar()) { 6092 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6093 TargetNumRegisters = ForceTargetNumScalarRegs; 6094 } else { 6095 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6096 TargetNumRegisters = ForceTargetNumVectorRegs; 6097 } 6098 unsigned MaxLocalUsers = pair.second; 6099 unsigned LoopInvariantRegs = 0; 6100 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6101 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6102 6103 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6104 // Don't count the induction variable as interleaved. 6105 if (EnableIndVarRegisterHeur) { 6106 TmpIC = 6107 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6108 std::max(1U, (MaxLocalUsers - 1))); 6109 } 6110 6111 IC = std::min(IC, TmpIC); 6112 } 6113 6114 // Clamp the interleave ranges to reasonable counts. 6115 unsigned MaxInterleaveCount = 6116 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6117 6118 // Check if the user has overridden the max. 6119 if (VF.isScalar()) { 6120 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6121 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6122 } else { 6123 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6124 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6125 } 6126 6127 // If trip count is known or estimated compile time constant, limit the 6128 // interleave count to be less than the trip count divided by VF, provided it 6129 // is at least 1. 6130 // 6131 // For scalable vectors we can't know if interleaving is beneficial. It may 6132 // not be beneficial for small loops if none of the lanes in the second vector 6133 // iterations is enabled. However, for larger loops, there is likely to be a 6134 // similar benefit as for fixed-width vectors. 
For now, we choose to leave 6135 // the InterleaveCount as if vscale is '1', although if some information about 6136 // the vector is known (e.g. min vector size), we can make a better decision. 6137 if (BestKnownTC) { 6138 MaxInterleaveCount = 6139 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6140 // Make sure MaxInterleaveCount is greater than 0. 6141 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6142 } 6143 6144 assert(MaxInterleaveCount > 0 && 6145 "Maximum interleave count must be greater than 0"); 6146 6147 // Clamp the calculated IC to be between the 1 and the max interleave count 6148 // that the target and trip count allows. 6149 if (IC > MaxInterleaveCount) 6150 IC = MaxInterleaveCount; 6151 else 6152 // Make sure IC is greater than 0. 6153 IC = std::max(1u, IC); 6154 6155 assert(IC > 0 && "Interleave count must be greater than 0."); 6156 6157 // If we did not calculate the cost for VF (because the user selected the VF) 6158 // then we calculate the cost of VF here. 6159 if (LoopCost == 0) { 6160 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6161 LoopCost = *expectedCost(VF).first.getValue(); 6162 } 6163 6164 assert(LoopCost && "Non-zero loop cost expected"); 6165 6166 // Interleave if we vectorized this loop and there is a reduction that could 6167 // benefit from interleaving. 6168 if (VF.isVector() && HasReductions) { 6169 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6170 return IC; 6171 } 6172 6173 // Note that if we've already vectorized the loop we will have done the 6174 // runtime check and so interleaving won't require further checks. 6175 bool InterleavingRequiresRuntimePointerCheck = 6176 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6177 6178 // We want to interleave small loops in order to reduce the loop overhead and 6179 // potentially expose ILP opportunities. 6180 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6181 << "LV: IC is " << IC << '\n' 6182 << "LV: VF is " << VF << '\n'); 6183 const bool AggressivelyInterleaveReductions = 6184 TTI.enableAggressiveInterleaving(HasReductions); 6185 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6186 // We assume that the cost overhead is 1 and we use the cost model 6187 // to estimate the cost of the loop and interleave until the cost of the 6188 // loop overhead is about 5% of the cost of the loop. 6189 unsigned SmallIC = 6190 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6191 6192 // Interleave until store/load ports (estimated by max interleave count) are 6193 // saturated. 6194 unsigned NumStores = Legal->getNumStores(); 6195 unsigned NumLoads = Legal->getNumLoads(); 6196 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6197 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6198 6199 // If we have a scalar reduction (vector reductions are already dealt with 6200 // by this point), we can increase the critical path length if the loop 6201 // we're interleaving is inside another loop. Limit, by default to 2, so the 6202 // critical path only gets increased by one reduction operation. 
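    // For example, with the default limit of 2 an inner-loop scalar reduction
    // is split into two partial accumulators that are combined once after the
    // loop, so only a single extra reduction operation is added to the
    // critical path of the enclosing loop.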
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
6276 SmallPtrSet<Instruction *, 8> Ends; 6277 // Saves the list of values that are used in the loop but are 6278 // defined outside the loop, such as arguments and constants. 6279 SmallPtrSet<Value *, 8> LoopInvariants; 6280 6281 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6282 for (Instruction &I : BB->instructionsWithoutDebug()) { 6283 IdxToInstr.push_back(&I); 6284 6285 // Save the end location of each USE. 6286 for (Value *U : I.operands()) { 6287 auto *Instr = dyn_cast<Instruction>(U); 6288 6289 // Ignore non-instruction values such as arguments, constants, etc. 6290 if (!Instr) 6291 continue; 6292 6293 // If this instruction is outside the loop then record it and continue. 6294 if (!TheLoop->contains(Instr)) { 6295 LoopInvariants.insert(Instr); 6296 continue; 6297 } 6298 6299 // Overwrite previous end points. 6300 EndPoint[Instr] = IdxToInstr.size(); 6301 Ends.insert(Instr); 6302 } 6303 } 6304 } 6305 6306 // Saves the list of intervals that end with the index in 'key'. 6307 using InstrList = SmallVector<Instruction *, 2>; 6308 DenseMap<unsigned, InstrList> TransposeEnds; 6309 6310 // Transpose the EndPoints to a list of values that end at each index. 6311 for (auto &Interval : EndPoint) 6312 TransposeEnds[Interval.second].push_back(Interval.first); 6313 6314 SmallPtrSet<Instruction *, 8> OpenIntervals; 6315 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6316 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6317 6318 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6319 6320 // A lambda that gets the register usage for the given type and VF. 6321 const auto &TTICapture = TTI; 6322 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6323 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6324 return 0U; 6325 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6326 }; 6327 6328 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6329 Instruction *I = IdxToInstr[i]; 6330 6331 // Remove all of the instructions that end at this location. 6332 InstrList &List = TransposeEnds[i]; 6333 for (Instruction *ToRemove : List) 6334 OpenIntervals.erase(ToRemove); 6335 6336 // Ignore instructions that are never used within the loop. 6337 if (!Ends.count(I)) 6338 continue; 6339 6340 // Skip ignored values. 6341 if (ValuesToIgnore.count(I)) 6342 continue; 6343 6344 // For each VF find the maximum usage of registers. 6345 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6346 // Count the number of live intervals. 6347 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6348 6349 if (VFs[j].isScalar()) { 6350 for (auto Inst : OpenIntervals) { 6351 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6352 if (RegUsage.find(ClassID) == RegUsage.end()) 6353 RegUsage[ClassID] = 1; 6354 else 6355 RegUsage[ClassID] += 1; 6356 } 6357 } else { 6358 collectUniformsAndScalars(VFs[j]); 6359 for (auto Inst : OpenIntervals) { 6360 // Skip ignored values for VF > 1. 
6361 if (VecValuesToIgnore.count(Inst)) 6362 continue; 6363 if (isScalarAfterVectorization(Inst, VFs[j])) { 6364 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6365 if (RegUsage.find(ClassID) == RegUsage.end()) 6366 RegUsage[ClassID] = 1; 6367 else 6368 RegUsage[ClassID] += 1; 6369 } else { 6370 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6371 if (RegUsage.find(ClassID) == RegUsage.end()) 6372 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6373 else 6374 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6375 } 6376 } 6377 } 6378 6379 for (auto& pair : RegUsage) { 6380 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6381 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6382 else 6383 MaxUsages[j][pair.first] = pair.second; 6384 } 6385 } 6386 6387 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6388 << OpenIntervals.size() << '\n'); 6389 6390 // Add the current instruction to the list of open intervals. 6391 OpenIntervals.insert(I); 6392 } 6393 6394 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6395 SmallMapVector<unsigned, unsigned, 4> Invariant; 6396 6397 for (auto Inst : LoopInvariants) { 6398 unsigned Usage = 6399 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6400 unsigned ClassID = 6401 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6402 if (Invariant.find(ClassID) == Invariant.end()) 6403 Invariant[ClassID] = Usage; 6404 else 6405 Invariant[ClassID] += Usage; 6406 } 6407 6408 LLVM_DEBUG({ 6409 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6410 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6411 << " item\n"; 6412 for (const auto &pair : MaxUsages[i]) { 6413 dbgs() << "LV(REG): RegisterClass: " 6414 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6415 << " registers\n"; 6416 } 6417 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6418 << " item\n"; 6419 for (const auto &pair : Invariant) { 6420 dbgs() << "LV(REG): RegisterClass: " 6421 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6422 << " registers\n"; 6423 } 6424 }); 6425 6426 RU.LoopInvariantRegs = Invariant; 6427 RU.MaxLocalUsers = MaxUsages[i]; 6428 RUs[i] = RU; 6429 } 6430 6431 return RUs; 6432 } 6433 6434 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6435 // TODO: Cost model for emulated masked load/store is completely 6436 // broken. This hack guides the cost model to use an artificially 6437 // high enough value to practically disable vectorization with such 6438 // operations, except where previously deployed legality hack allowed 6439 // using very low cost values. This is to avoid regressions coming simply 6440 // from moving "masked load/store" check from legality to cost model. 6441 // Masked Load/Gather emulation was previously never allowed. 6442 // Limited number of Masked Store/Scatter emulation was allowed. 6443 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6444 return isa<LoadInst>(I) || 6445 (isa<StoreInst>(I) && 6446 NumPredStores > NumberOfStoresToPredicate); 6447 } 6448 6449 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6450 // If we aren't vectorizing the loop, or if we've already collected the 6451 // instructions to scalarize, there's nothing to do. Collection may already 6452 // have occurred if we have a user-selected VF and are now computing the 6453 // expected cost for interleaving. 
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
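    // For example, if an address computation is uniform after vectorization,
    // only its lane-0 value will exist; scalarizing one of its users for all
    // VF lanes would then reference per-lane values that are never created.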
6523 for (Use &U : I->operands()) 6524 if (auto *J = dyn_cast<Instruction>(U.get())) 6525 if (isUniformAfterVectorization(J, VF)) 6526 return false; 6527 6528 // Otherwise, we can scalarize the instruction. 6529 return true; 6530 }; 6531 6532 // Compute the expected cost discount from scalarizing the entire expression 6533 // feeding the predicated instruction. We currently only consider expressions 6534 // that are single-use instruction chains. 6535 Worklist.push_back(PredInst); 6536 while (!Worklist.empty()) { 6537 Instruction *I = Worklist.pop_back_val(); 6538 6539 // If we've already analyzed the instruction, there's nothing to do. 6540 if (ScalarCosts.find(I) != ScalarCosts.end()) 6541 continue; 6542 6543 // Compute the cost of the vector instruction. Note that this cost already 6544 // includes the scalarization overhead of the predicated instruction. 6545 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6546 6547 // Compute the cost of the scalarized instruction. This cost is the cost of 6548 // the instruction as if it wasn't if-converted and instead remained in the 6549 // predicated block. We will scale this cost by block probability after 6550 // computing the scalarization overhead. 6551 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6552 InstructionCost ScalarCost = 6553 VF.getKnownMinValue() * 6554 getInstructionCost(I, ElementCount::getFixed(1)).first; 6555 6556 // Compute the scalarization overhead of needed insertelement instructions 6557 // and phi nodes. 6558 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6559 ScalarCost += TTI.getScalarizationOverhead( 6560 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6561 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6562 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6563 ScalarCost += 6564 VF.getKnownMinValue() * 6565 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6566 } 6567 6568 // Compute the scalarization overhead of needed extractelement 6569 // instructions. For each of the instruction's operands, if the operand can 6570 // be scalarized, add it to the worklist; otherwise, account for the 6571 // overhead. 6572 for (Use &U : I->operands()) 6573 if (auto *J = dyn_cast<Instruction>(U.get())) { 6574 assert(VectorType::isValidElementType(J->getType()) && 6575 "Instruction has non-scalar type"); 6576 if (canBeScalarized(J)) 6577 Worklist.push_back(J); 6578 else if (needsExtract(J, VF)) { 6579 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6580 ScalarCost += TTI.getScalarizationOverhead( 6581 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6582 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6583 } 6584 } 6585 6586 // Scale the total scalar cost by block probability. 6587 ScalarCost /= getReciprocalPredBlockProb(); 6588 6589 // Compute the discount. A non-negative discount means the vector version 6590 // of the instruction costs more, and scalarizing would be beneficial. 6591 Discount += VectorCost - ScalarCost; 6592 ScalarCosts[I] = ScalarCost; 6593 } 6594 6595 return *Discount.getValue(); 6596 } 6597 6598 LoopVectorizationCostModel::VectorizationCostTy 6599 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6600 VectorizationCostTy Cost; 6601 6602 // For each block. 6603 for (BasicBlock *BB : TheLoop->blocks()) { 6604 VectorizationCostTy BlockCost; 6605 6606 // For each instruction in the old loop. 6607 for (Instruction &I : BB->instructionsWithoutDebug()) { 6608 // Skip ignored values. 
6609 if (ValuesToIgnore.count(&I) || 6610 (VF.isVector() && VecValuesToIgnore.count(&I))) 6611 continue; 6612 6613 VectorizationCostTy C = getInstructionCost(&I, VF); 6614 6615 // Check if we should override the cost. 6616 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6617 C.first = InstructionCost(ForceTargetInstructionCost); 6618 6619 BlockCost.first += C.first; 6620 BlockCost.second |= C.second; 6621 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6622 << " for VF " << VF << " For instruction: " << I 6623 << '\n'); 6624 } 6625 6626 // If we are vectorizing a predicated block, it will have been 6627 // if-converted. This means that the block's instructions (aside from 6628 // stores and instructions that may divide by zero) will now be 6629 // unconditionally executed. For the scalar case, we may not always execute 6630 // the predicated block, if it is an if-else block. Thus, scale the block's 6631 // cost by the probability of executing it. blockNeedsPredication from 6632 // Legal is used so as to not include all blocks in tail folded loops. 6633 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6634 BlockCost.first /= getReciprocalPredBlockProb(); 6635 6636 Cost.first += BlockCost.first; 6637 Cost.second |= BlockCost.second; 6638 } 6639 6640 return Cost; 6641 } 6642 6643 /// Gets Address Access SCEV after verifying that the access pattern 6644 /// is loop invariant except the induction variable dependence. 6645 /// 6646 /// This SCEV can be sent to the Target in order to estimate the address 6647 /// calculation cost. 6648 static const SCEV *getAddressAccessSCEV( 6649 Value *Ptr, 6650 LoopVectorizationLegality *Legal, 6651 PredicatedScalarEvolution &PSE, 6652 const Loop *TheLoop) { 6653 6654 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6655 if (!Gep) 6656 return nullptr; 6657 6658 // We are looking for a gep with all loop invariant indices except for one 6659 // which should be an induction variable. 6660 auto SE = PSE.getSE(); 6661 unsigned NumOperands = Gep->getNumOperands(); 6662 for (unsigned i = 1; i < NumOperands; ++i) { 6663 Value *Opd = Gep->getOperand(i); 6664 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6665 !Legal->isInductionVariable(Opd)) 6666 return nullptr; 6667 } 6668 6669 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6670 return PSE.getSCEV(Ptr); 6671 } 6672 6673 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6674 return Legal->hasStride(I->getOperand(0)) || 6675 Legal->hasStride(I->getOperand(1)); 6676 } 6677 6678 InstructionCost 6679 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6680 ElementCount VF) { 6681 assert(VF.isVector() && 6682 "Scalarization cost of instruction implies vectorization."); 6683 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6684 Type *ValTy = getMemInstValueType(I); 6685 auto SE = PSE.getSE(); 6686 6687 unsigned AS = getLoadStoreAddressSpace(I); 6688 Value *Ptr = getLoadStorePointerOperand(I); 6689 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6690 6691 // Figure out whether the access is strided and get the stride value 6692 // if it's known in compile time 6693 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6694 6695 // Get the cost of the scalar memory instruction and address computation. 
6696 InstructionCost Cost = 6697 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6698 6699 // Don't pass *I here, since it is scalar but will actually be part of a 6700 // vectorized loop where the user of it is a vectorized instruction. 6701 const Align Alignment = getLoadStoreAlignment(I); 6702 Cost += VF.getKnownMinValue() * 6703 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6704 AS, TTI::TCK_RecipThroughput); 6705 6706 // Get the overhead of the extractelement and insertelement instructions 6707 // we might create due to scalarization. 6708 Cost += getScalarizationOverhead(I, VF); 6709 6710 // If we have a predicated store, it may not be executed for each vector 6711 // lane. Scale the cost by the probability of executing the predicated 6712 // block. 6713 if (isPredicatedInst(I)) { 6714 Cost /= getReciprocalPredBlockProb(); 6715 6716 if (useEmulatedMaskMemRefHack(I)) 6717 // Artificially setting to a high enough value to practically disable 6718 // vectorization with such operations. 6719 Cost = 3000000; 6720 } 6721 6722 return Cost; 6723 } 6724 6725 InstructionCost 6726 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6727 ElementCount VF) { 6728 Type *ValTy = getMemInstValueType(I); 6729 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6730 Value *Ptr = getLoadStorePointerOperand(I); 6731 unsigned AS = getLoadStoreAddressSpace(I); 6732 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6733 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6734 6735 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6736 "Stride should be 1 or -1 for consecutive memory access"); 6737 const Align Alignment = getLoadStoreAlignment(I); 6738 InstructionCost Cost = 0; 6739 if (Legal->isMaskRequired(I)) 6740 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6741 CostKind); 6742 else 6743 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6744 CostKind, I); 6745 6746 bool Reverse = ConsecutiveStride < 0; 6747 if (Reverse) 6748 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6749 return Cost; 6750 } 6751 6752 InstructionCost 6753 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6754 ElementCount VF) { 6755 assert(Legal->isUniformMemOp(*I)); 6756 6757 Type *ValTy = getMemInstValueType(I); 6758 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6759 const Align Alignment = getLoadStoreAlignment(I); 6760 unsigned AS = getLoadStoreAddressSpace(I); 6761 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6762 if (isa<LoadInst>(I)) { 6763 return TTI.getAddressComputationCost(ValTy) + 6764 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6765 CostKind) + 6766 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6767 } 6768 StoreInst *SI = cast<StoreInst>(I); 6769 6770 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6771 return TTI.getAddressComputationCost(ValTy) + 6772 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6773 CostKind) + 6774 (isLoopInvariantStoreValue 6775 ? 
              0
            : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                     VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}

InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit for no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return InstructionCost::getInvalid();
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, and for its minimal
  // acceptable cost:
  // reduce(mul(ext(A), ext(B))) or
  // reduce(mul(A, B)) or
  // reduce(ext(A)) or
  // reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying that the original cost
  // method should be used.
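  // For example, for reduce(mul(sext(A), sext(B))) the add reduction is the
  // root: if the target reports a combined extending multiply-accumulate
  // reduction as cheaper than the separate sext, mul and reduction costs, that
  // combined cost is attributed to the root and the sext/mul instructions in
  // the pattern report a cost of 0.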
6853 Instruction *RetI = I; 6854 if ((RetI->getOpcode() == Instruction::SExt || 6855 RetI->getOpcode() == Instruction::ZExt)) { 6856 if (!RetI->hasOneUser()) 6857 return InstructionCost::getInvalid(); 6858 RetI = RetI->user_back(); 6859 } 6860 if (RetI->getOpcode() == Instruction::Mul && 6861 RetI->user_back()->getOpcode() == Instruction::Add) { 6862 if (!RetI->hasOneUser()) 6863 return InstructionCost::getInvalid(); 6864 RetI = RetI->user_back(); 6865 } 6866 6867 // Test if the found instruction is a reduction, and if not return an invalid 6868 // cost specifying the parent to use the original cost modelling. 6869 if (!InLoopReductionImmediateChains.count(RetI)) 6870 return InstructionCost::getInvalid(); 6871 6872 // Find the reduction this chain is a part of and calculate the basic cost of 6873 // the reduction on its own. 6874 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6875 Instruction *ReductionPhi = LastChain; 6876 while (!isa<PHINode>(ReductionPhi)) 6877 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6878 6879 RecurrenceDescriptor RdxDesc = 6880 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6881 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6882 VectorTy, false, CostKind); 6883 6884 // Get the operand that was not the reduction chain and match it to one of the 6885 // patterns, returning the better cost if it is found. 6886 Instruction *RedOp = RetI->getOperand(1) == LastChain 6887 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6888 : dyn_cast<Instruction>(RetI->getOperand(1)); 6889 6890 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6891 6892 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6893 !TheLoop->isLoopInvariant(RedOp)) { 6894 bool IsUnsigned = isa<ZExtInst>(RedOp); 6895 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6896 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6897 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6898 CostKind); 6899 6900 unsigned ExtCost = 6901 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6902 TTI::CastContextHint::None, CostKind, RedOp); 6903 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6904 return I == RetI ? *RedCost.getValue() : 0; 6905 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6906 Instruction *Mul = RedOp; 6907 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6908 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6909 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6910 Op0->getOpcode() == Op1->getOpcode() && 6911 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6912 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6913 bool IsUnsigned = isa<ZExtInst>(Op0); 6914 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6915 // reduce(mul(ext, ext)) 6916 unsigned ExtCost = 6917 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6918 TTI::CastContextHint::None, CostKind, Op0); 6919 unsigned MulCost = 6920 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6921 6922 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6923 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6924 CostKind); 6925 6926 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6927 return I == RetI ? 
*RedCost.getValue() : 0; 6928 } else { 6929 unsigned MulCost = 6930 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6931 6932 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6933 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6934 CostKind); 6935 6936 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6937 return I == RetI ? *RedCost.getValue() : 0; 6938 } 6939 } 6940 6941 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 6942 } 6943 6944 InstructionCost 6945 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6946 ElementCount VF) { 6947 // Calculate scalar cost only. Vectorization cost should be ready at this 6948 // moment. 6949 if (VF.isScalar()) { 6950 Type *ValTy = getMemInstValueType(I); 6951 const Align Alignment = getLoadStoreAlignment(I); 6952 unsigned AS = getLoadStoreAddressSpace(I); 6953 6954 return TTI.getAddressComputationCost(ValTy) + 6955 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6956 TTI::TCK_RecipThroughput, I); 6957 } 6958 return getWideningCost(I, VF); 6959 } 6960 6961 LoopVectorizationCostModel::VectorizationCostTy 6962 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6963 ElementCount VF) { 6964 // If we know that this instruction will remain uniform, check the cost of 6965 // the scalar version. 6966 if (isUniformAfterVectorization(I, VF)) 6967 VF = ElementCount::getFixed(1); 6968 6969 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6970 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6971 6972 // Forced scalars do not have any scalarization overhead. 6973 auto ForcedScalar = ForcedScalars.find(VF); 6974 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6975 auto InstSet = ForcedScalar->second; 6976 if (InstSet.count(I)) 6977 return VectorizationCostTy( 6978 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6979 VF.getKnownMinValue()), 6980 false); 6981 } 6982 6983 Type *VectorTy; 6984 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6985 6986 bool TypeNotScalarized = 6987 VF.isVector() && VectorTy->isVectorTy() && 6988 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6989 return VectorizationCostTy(C, TypeNotScalarized); 6990 } 6991 6992 InstructionCost 6993 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6994 ElementCount VF) { 6995 6996 assert(!VF.isScalable() && 6997 "cannot compute scalarization overhead for scalable vectorization"); 6998 if (VF.isScalar()) 6999 return 0; 7000 7001 InstructionCost Cost = 0; 7002 Type *RetTy = ToVectorTy(I->getType(), VF); 7003 if (!RetTy->isVoidTy() && 7004 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7005 Cost += TTI.getScalarizationOverhead( 7006 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7007 true, false); 7008 7009 // Some targets keep addresses scalar. 7010 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7011 return Cost; 7012 7013 // Some targets support efficient element stores. 7014 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7015 return Cost; 7016 7017 // Collect operands to consider. 7018 CallInst *CI = dyn_cast<CallInst>(I); 7019 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7020 7021 // Skip operands that do not require extraction/scalarization and do not incur 7022 // any overhead. 
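// Only the operands kept by filterExtractingOperands are passed to the
// overhead query below; the operands skipped above contribute no extraction
// cost.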
7023 return Cost + TTI.getOperandsScalarizationOverhead( 7024 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 7025 } 7026 7027 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7028 if (VF.isScalar()) 7029 return; 7030 NumPredStores = 0; 7031 for (BasicBlock *BB : TheLoop->blocks()) { 7032 // For each instruction in the old loop. 7033 for (Instruction &I : *BB) { 7034 Value *Ptr = getLoadStorePointerOperand(&I); 7035 if (!Ptr) 7036 continue; 7037 7038 // TODO: We should generate better code and update the cost model for 7039 // predicated uniform stores. Today they are treated as any other 7040 // predicated store (see added test cases in 7041 // invariant-store-vectorization.ll). 7042 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7043 NumPredStores++; 7044 7045 if (Legal->isUniformMemOp(I)) { 7046 // TODO: Avoid replicating loads and stores instead of 7047 // relying on instcombine to remove them. 7048 // Load: Scalar load + broadcast 7049 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7050 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7051 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7052 continue; 7053 } 7054 7055 // We assume that widening is the best solution when possible. 7056 if (memoryInstructionCanBeWidened(&I, VF)) { 7057 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7058 int ConsecutiveStride = 7059 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7060 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7061 "Expected consecutive stride."); 7062 InstWidening Decision = 7063 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7064 setWideningDecision(&I, VF, Decision, Cost); 7065 continue; 7066 } 7067 7068 // Choose between Interleaving, Gather/Scatter or Scalarization. 7069 InstructionCost InterleaveCost = std::numeric_limits<int>::max(); 7070 unsigned NumAccesses = 1; 7071 if (isAccessInterleaved(&I)) { 7072 auto Group = getInterleavedAccessGroup(&I); 7073 assert(Group && "Fail to get an interleaved access group."); 7074 7075 // Make one decision for the whole group. 7076 if (getWideningDecision(&I, VF) != CM_Unknown) 7077 continue; 7078 7079 NumAccesses = Group->getNumMembers(); 7080 if (interleavedAccessCanBeWidened(&I, VF)) 7081 InterleaveCost = getInterleaveGroupCost(&I, VF); 7082 } 7083 7084 InstructionCost GatherScatterCost = 7085 isLegalGatherOrScatter(&I) 7086 ? getGatherScatterCost(&I, VF) * NumAccesses 7087 : std::numeric_limits<int>::max(); 7088 7089 InstructionCost ScalarizationCost = 7090 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7091 7092 // Choose better solution for the current VF, 7093 // write down this decision and use it during vectorization. 7094 InstructionCost Cost; 7095 InstWidening Decision; 7096 if (InterleaveCost <= GatherScatterCost && 7097 InterleaveCost < ScalarizationCost) { 7098 Decision = CM_Interleave; 7099 Cost = InterleaveCost; 7100 } else if (GatherScatterCost < ScalarizationCost) { 7101 Decision = CM_GatherScatter; 7102 Cost = GatherScatterCost; 7103 } else { 7104 Decision = CM_Scalarize; 7105 Cost = ScalarizationCost; 7106 } 7107 // If the instructions belongs to an interleave group, the whole group 7108 // receives the same decision. The whole group receives the cost, but 7109 // the cost will actually be assigned to one instruction. 
7110 if (auto Group = getInterleavedAccessGroup(&I)) 7111 setWideningDecision(Group, VF, Decision, Cost); 7112 else 7113 setWideningDecision(&I, VF, Decision, Cost); 7114 } 7115 } 7116 7117 // Make sure that any load of address and any other address computation 7118 // remains scalar unless there is gather/scatter support. This avoids 7119 // inevitable extracts into address registers, and also has the benefit of 7120 // activating LSR more, since that pass can't optimize vectorized 7121 // addresses. 7122 if (TTI.prefersVectorizedAddressing()) 7123 return; 7124 7125 // Start with all scalar pointer uses. 7126 SmallPtrSet<Instruction *, 8> AddrDefs; 7127 for (BasicBlock *BB : TheLoop->blocks()) 7128 for (Instruction &I : *BB) { 7129 Instruction *PtrDef = 7130 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7131 if (PtrDef && TheLoop->contains(PtrDef) && 7132 getWideningDecision(&I, VF) != CM_GatherScatter) 7133 AddrDefs.insert(PtrDef); 7134 } 7135 7136 // Add all instructions used to generate the addresses. 7137 SmallVector<Instruction *, 4> Worklist; 7138 append_range(Worklist, AddrDefs); 7139 while (!Worklist.empty()) { 7140 Instruction *I = Worklist.pop_back_val(); 7141 for (auto &Op : I->operands()) 7142 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7143 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7144 AddrDefs.insert(InstOp).second) 7145 Worklist.push_back(InstOp); 7146 } 7147 7148 for (auto *I : AddrDefs) { 7149 if (isa<LoadInst>(I)) { 7150 // Setting the desired widening decision should ideally be handled in 7151 // by cost functions, but since this involves the task of finding out 7152 // if the loaded register is involved in an address computation, it is 7153 // instead changed here when we know this is the case. 7154 InstWidening Decision = getWideningDecision(I, VF); 7155 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7156 // Scalarize a widened load of address. 7157 setWideningDecision( 7158 I, VF, CM_Scalarize, 7159 (VF.getKnownMinValue() * 7160 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7161 else if (auto Group = getInterleavedAccessGroup(I)) { 7162 // Scalarize an interleave group of address loads. 7163 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7164 if (Instruction *Member = Group->getMember(I)) 7165 setWideningDecision( 7166 Member, VF, CM_Scalarize, 7167 (VF.getKnownMinValue() * 7168 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7169 } 7170 } 7171 } else 7172 // Make sure I gets scalarized and a cost estimate without 7173 // scalarization overhead. 7174 ForcedScalars[VF].insert(I); 7175 } 7176 } 7177 7178 InstructionCost 7179 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7180 Type *&VectorTy) { 7181 Type *RetTy = I->getType(); 7182 if (canTruncateToMinimalBitwidth(I, VF)) 7183 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7184 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7185 auto SE = PSE.getSE(); 7186 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7187 7188 // TODO: We need to estimate the cost of intrinsic calls. 7189 switch (I->getOpcode()) { 7190 case Instruction::GetElementPtr: 7191 // We mark this instruction as zero-cost because the cost of GEPs in 7192 // vectorized code depends on whether the corresponding memory instruction 7193 // is scalarized or not. Therefore, we handle GEPs with the memory 7194 // instruction cost. 
7195 return 0; 7196 case Instruction::Br: { 7197 // In cases of scalarized and predicated instructions, there will be VF 7198 // predicated blocks in the vectorized loop. Each branch around these 7199 // blocks requires also an extract of its vector compare i1 element. 7200 bool ScalarPredicatedBB = false; 7201 BranchInst *BI = cast<BranchInst>(I); 7202 if (VF.isVector() && BI->isConditional() && 7203 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7204 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7205 ScalarPredicatedBB = true; 7206 7207 if (ScalarPredicatedBB) { 7208 // Return cost for branches around scalarized and predicated blocks. 7209 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7210 auto *Vec_i1Ty = 7211 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7212 return (TTI.getScalarizationOverhead( 7213 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7214 false, true) + 7215 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7216 VF.getKnownMinValue())); 7217 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7218 // The back-edge branch will remain, as will all scalar branches. 7219 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7220 else 7221 // This branch will be eliminated by if-conversion. 7222 return 0; 7223 // Note: We currently assume zero cost for an unconditional branch inside 7224 // a predicated block since it will become a fall-through, although we 7225 // may decide in the future to call TTI for all branches. 7226 } 7227 case Instruction::PHI: { 7228 auto *Phi = cast<PHINode>(I); 7229 7230 // First-order recurrences are replaced by vector shuffles inside the loop. 7231 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7232 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7233 return TTI.getShuffleCost( 7234 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7235 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7236 7237 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7238 // converted into select instructions. We require N - 1 selects per phi 7239 // node, where N is the number of incoming values. 7240 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7241 return (Phi->getNumIncomingValues() - 1) * 7242 TTI.getCmpSelInstrCost( 7243 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7244 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7245 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7246 7247 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7248 } 7249 case Instruction::UDiv: 7250 case Instruction::SDiv: 7251 case Instruction::URem: 7252 case Instruction::SRem: 7253 // If we have a predicated instruction, it may not be executed for each 7254 // vector lane. Get the scalarization cost and scale this amount by the 7255 // probability of executing the predicated block. If the instruction is not 7256 // predicated, we fall through to the next case. 7257 if (VF.isVector() && isScalarWithPredication(I)) { 7258 InstructionCost Cost = 0; 7259 7260 // These instructions have a non-void type, so account for the phi nodes 7261 // that we will create. This cost is likely to be zero. The phi node 7262 // cost, if any, should be scaled by the block probability because it 7263 // models a copy at the end of each predicated block. 7264 Cost += VF.getKnownMinValue() * 7265 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7266 7267 // The cost of the non-predicated instruction. 
7268 Cost += VF.getKnownMinValue() * 7269 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7270 7271 // The cost of insertelement and extractelement instructions needed for 7272 // scalarization. 7273 Cost += getScalarizationOverhead(I, VF); 7274 7275 // Scale the cost by the probability of executing the predicated blocks. 7276 // This assumes the predicated block for each vector lane is equally 7277 // likely. 7278 return Cost / getReciprocalPredBlockProb(); 7279 } 7280 LLVM_FALLTHROUGH; 7281 case Instruction::Add: 7282 case Instruction::FAdd: 7283 case Instruction::Sub: 7284 case Instruction::FSub: 7285 case Instruction::Mul: 7286 case Instruction::FMul: 7287 case Instruction::FDiv: 7288 case Instruction::FRem: 7289 case Instruction::Shl: 7290 case Instruction::LShr: 7291 case Instruction::AShr: 7292 case Instruction::And: 7293 case Instruction::Or: 7294 case Instruction::Xor: { 7295 // Since we will replace the stride by 1 the multiplication should go away. 7296 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7297 return 0; 7298 7299 // Detect reduction patterns 7300 InstructionCost RedCost; 7301 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7302 .isValid()) 7303 return RedCost; 7304 7305 // Certain instructions can be cheaper to vectorize if they have a constant 7306 // second vector operand. One example of this are shifts on x86. 7307 Value *Op2 = I->getOperand(1); 7308 TargetTransformInfo::OperandValueProperties Op2VP; 7309 TargetTransformInfo::OperandValueKind Op2VK = 7310 TTI.getOperandInfo(Op2, Op2VP); 7311 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7312 Op2VK = TargetTransformInfo::OK_UniformValue; 7313 7314 SmallVector<const Value *, 4> Operands(I->operand_values()); 7315 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7316 return N * TTI.getArithmeticInstrCost( 7317 I->getOpcode(), VectorTy, CostKind, 7318 TargetTransformInfo::OK_AnyValue, 7319 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7320 } 7321 case Instruction::FNeg: { 7322 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7323 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7324 return N * TTI.getArithmeticInstrCost( 7325 I->getOpcode(), VectorTy, CostKind, 7326 TargetTransformInfo::OK_AnyValue, 7327 TargetTransformInfo::OK_AnyValue, 7328 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7329 I->getOperand(0), I); 7330 } 7331 case Instruction::Select: { 7332 SelectInst *SI = cast<SelectInst>(I); 7333 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7334 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7335 Type *CondTy = SI->getCondition()->getType(); 7336 if (!ScalarCond) 7337 CondTy = VectorType::get(CondTy, VF); 7338 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7339 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7340 } 7341 case Instruction::ICmp: 7342 case Instruction::FCmp: { 7343 Type *ValTy = I->getOperand(0)->getType(); 7344 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7345 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7346 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7347 VectorTy = ToVectorTy(ValTy, VF); 7348 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7349 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7350 } 7351 case Instruction::Store: 7352 case Instruction::Load: { 7353 ElementCount Width = VF; 7354 if (Width.isVector()) { 7355 InstWidening Decision = getWideningDecision(I, Width); 7356 assert(Decision != CM_Unknown && 7357 "CM decision should be taken at this point"); 7358 if (Decision == CM_Scalarize) 7359 Width = ElementCount::getFixed(1); 7360 } 7361 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7362 return getMemoryInstructionCost(I, VF); 7363 } 7364 case Instruction::ZExt: 7365 case Instruction::SExt: 7366 case Instruction::FPToUI: 7367 case Instruction::FPToSI: 7368 case Instruction::FPExt: 7369 case Instruction::PtrToInt: 7370 case Instruction::IntToPtr: 7371 case Instruction::SIToFP: 7372 case Instruction::UIToFP: 7373 case Instruction::Trunc: 7374 case Instruction::FPTrunc: 7375 case Instruction::BitCast: { 7376 // Computes the CastContextHint from a Load/Store instruction. 7377 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7378 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7379 "Expected a load or a store!"); 7380 7381 if (VF.isScalar() || !TheLoop->contains(I)) 7382 return TTI::CastContextHint::Normal; 7383 7384 switch (getWideningDecision(I, VF)) { 7385 case LoopVectorizationCostModel::CM_GatherScatter: 7386 return TTI::CastContextHint::GatherScatter; 7387 case LoopVectorizationCostModel::CM_Interleave: 7388 return TTI::CastContextHint::Interleave; 7389 case LoopVectorizationCostModel::CM_Scalarize: 7390 case LoopVectorizationCostModel::CM_Widen: 7391 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7392 : TTI::CastContextHint::Normal; 7393 case LoopVectorizationCostModel::CM_Widen_Reverse: 7394 return TTI::CastContextHint::Reversed; 7395 case LoopVectorizationCostModel::CM_Unknown: 7396 llvm_unreachable("Instr did not go through cost modelling?"); 7397 } 7398 7399 llvm_unreachable("Unhandled case!"); 7400 }; 7401 7402 unsigned Opcode = I->getOpcode(); 7403 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7404 // For Trunc, the context is the only user, which must be a StoreInst. 
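// For example (illustrative IR):
//   %t = trunc i32 %v to i8
//   store i8 %t, i8* %p
// Here the widening decision taken for the store (widened, reversed,
// gather/scatter, interleaved, ...) supplies the CastContextHint for the
// trunc via ComputeCCH.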
7405 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7406 if (I->hasOneUse()) 7407 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7408 CCH = ComputeCCH(Store); 7409 } 7410 // For Z/Sext, the context is the operand, which must be a LoadInst. 7411 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7412 Opcode == Instruction::FPExt) { 7413 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7414 CCH = ComputeCCH(Load); 7415 } 7416 7417 // We optimize the truncation of induction variables having constant 7418 // integer steps. The cost of these truncations is the same as the scalar 7419 // operation. 7420 if (isOptimizableIVTruncate(I, VF)) { 7421 auto *Trunc = cast<TruncInst>(I); 7422 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7423 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7424 } 7425 7426 // Detect reduction patterns 7427 InstructionCost RedCost; 7428 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7429 .isValid()) 7430 return RedCost; 7431 7432 Type *SrcScalarTy = I->getOperand(0)->getType(); 7433 Type *SrcVecTy = 7434 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7435 if (canTruncateToMinimalBitwidth(I, VF)) { 7436 // This cast is going to be shrunk. This may remove the cast or it might 7437 // turn it into slightly different cast. For example, if MinBW == 16, 7438 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7439 // 7440 // Calculate the modified src and dest types. 7441 Type *MinVecTy = VectorTy; 7442 if (Opcode == Instruction::Trunc) { 7443 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7444 VectorTy = 7445 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7446 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7447 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7448 VectorTy = 7449 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7450 } 7451 } 7452 7453 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7454 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7455 return N * 7456 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7457 } 7458 case Instruction::Call: { 7459 bool NeedToScalarize; 7460 CallInst *CI = cast<CallInst>(I); 7461 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7462 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7463 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7464 return std::min(CallCost, IntrinsicCost); 7465 } 7466 return CallCost; 7467 } 7468 case Instruction::ExtractValue: 7469 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7470 default: 7471 // The cost of executing VF copies of the scalar instruction. This opcode 7472 // is unknown. Assume that it is the same as 'mul'. 7473 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7474 Instruction::Mul, VectorTy, CostKind) + 7475 getScalarizationOverhead(I, VF); 7476 } // end of switch. 
7477 } 7478 7479 char LoopVectorize::ID = 0; 7480 7481 static const char lv_name[] = "Loop Vectorization"; 7482 7483 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7484 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7485 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7486 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7487 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7488 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7489 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7490 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7491 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7492 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7493 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7494 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7495 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7496 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7497 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7498 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7499 7500 namespace llvm { 7501 7502 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7503 7504 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7505 bool VectorizeOnlyWhenForced) { 7506 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7507 } 7508 7509 } // end namespace llvm 7510 7511 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7512 // Check if the pointer operand of a load or store instruction is 7513 // consecutive. 7514 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7515 return Legal->isConsecutivePtr(Ptr); 7516 return false; 7517 } 7518 7519 void LoopVectorizationCostModel::collectValuesToIgnore() { 7520 // Ignore ephemeral values. 7521 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7522 7523 // Ignore type-promoting instructions we identified during reduction 7524 // detection. 7525 for (auto &Reduction : Legal->getReductionVars()) { 7526 RecurrenceDescriptor &RedDes = Reduction.second; 7527 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7528 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7529 } 7530 // Ignore type-casting instructions we identified during induction 7531 // detection. 7532 for (auto &Induction : Legal->getInductionVars()) { 7533 InductionDescriptor &IndDes = Induction.second; 7534 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7535 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7536 } 7537 } 7538 7539 void LoopVectorizationCostModel::collectInLoopReductions() { 7540 for (auto &Reduction : Legal->getReductionVars()) { 7541 PHINode *Phi = Reduction.first; 7542 RecurrenceDescriptor &RdxDesc = Reduction.second; 7543 7544 // We don't collect reductions that are type promoted (yet). 7545 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7546 continue; 7547 7548 // If the target would prefer this reduction to happen "in-loop", then we 7549 // want to record it as such. 7550 unsigned Opcode = RdxDesc.getOpcode(); 7551 if (!PreferInLoopReductions && 7552 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7553 TargetTransformInfo::ReductionFlags())) 7554 continue; 7555 7556 // Check that we can correctly put the reductions into the loop, by 7557 // finding the chain of operations that leads from the phi to the loop 7558 // exit value. 
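// For example, for a simple integer reduction 'sum += a[i]' the chain is just
// the single in-loop add linking the phi to the exit value; if no such chain
// can be found, the reduction is left as an out-of-loop reduction.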
7559 SmallVector<Instruction *, 4> ReductionOperations = 7560 RdxDesc.getReductionOpChain(Phi, TheLoop); 7561 bool InLoop = !ReductionOperations.empty(); 7562 if (InLoop) { 7563 InLoopReductionChains[Phi] = ReductionOperations; 7564 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7565 Instruction *LastChain = Phi; 7566 for (auto *I : ReductionOperations) { 7567 InLoopReductionImmediateChains[I] = LastChain; 7568 LastChain = I; 7569 } 7570 } 7571 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7572 << " reduction for phi: " << *Phi << "\n"); 7573 } 7574 } 7575 7576 // TODO: we could return a pair of values that specify the max VF and 7577 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7578 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7579 // doesn't have a cost model that can choose which plan to execute if 7580 // more than one is generated. 7581 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7582 LoopVectorizationCostModel &CM) { 7583 unsigned WidestType; 7584 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7585 return WidestVectorRegBits / WidestType; 7586 } 7587 7588 VectorizationFactor 7589 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7590 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7591 ElementCount VF = UserVF; 7592 // Outer loop handling: They may require CFG and instruction level 7593 // transformations before even evaluating whether vectorization is profitable. 7594 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7595 // the vectorization pipeline. 7596 if (!OrigLoop->isInnermost()) { 7597 // If the user doesn't provide a vectorization factor, determine a 7598 // reasonable one. 7599 if (UserVF.isZero()) { 7600 VF = ElementCount::getFixed( 7601 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7602 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7603 7604 // Make sure we have a VF > 1 for stress testing. 7605 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7606 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7607 << "overriding computed VF.\n"); 7608 VF = ElementCount::getFixed(4); 7609 } 7610 } 7611 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7612 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7613 "VF needs to be a power of two"); 7614 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7615 << "VF " << VF << " to build VPlans.\n"); 7616 buildVPlans(VF, VF); 7617 7618 // For VPlan build stress testing, we bail out after VPlan construction. 7619 if (VPlanBuildStressTest) 7620 return VectorizationFactor::Disabled(); 7621 7622 return {VF, 0 /*Cost*/}; 7623 } 7624 7625 LLVM_DEBUG( 7626 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7627 "VPlan-native path.\n"); 7628 return VectorizationFactor::Disabled(); 7629 } 7630 7631 Optional<VectorizationFactor> 7632 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7633 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7634 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7635 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7636 return None; 7637 7638 // Invalidate interleave groups if all blocks of loop will be predicated. 
7639 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7640 !useMaskedInterleavedAccesses(*TTI)) { 7641 LLVM_DEBUG( 7642 dbgs() 7643 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7644 "which requires masked-interleaved support.\n"); 7645 if (CM.InterleaveInfo.invalidateGroups()) 7646 // Invalidating interleave groups also requires invalidating all decisions 7647 // based on them, which includes widening decisions and uniform and scalar 7648 // values. 7649 CM.invalidateCostModelingDecisions(); 7650 } 7651 7652 ElementCount MaxVF = MaybeMaxVF.getValue(); 7653 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7654 7655 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7656 if (!UserVF.isZero() && 7657 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7658 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7659 // VFs here, this should be reverted to only use legal UserVFs once the 7660 // loop below supports scalable VFs. 7661 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7662 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7663 << " VF " << VF << ".\n"); 7664 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7665 "VF needs to be a power of two"); 7666 // Collect the instructions (and their associated costs) that will be more 7667 // profitable to scalarize. 7668 CM.selectUserVectorizationFactor(VF); 7669 CM.collectInLoopReductions(); 7670 buildVPlansWithVPRecipes(VF, VF); 7671 LLVM_DEBUG(printPlans(dbgs())); 7672 return {{VF, 0}}; 7673 } 7674 7675 assert(!MaxVF.isScalable() && 7676 "Scalable vectors not yet supported beyond this point"); 7677 7678 for (ElementCount VF = ElementCount::getFixed(1); 7679 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7680 // Collect Uniform and Scalar instructions after vectorization with VF. 7681 CM.collectUniformsAndScalars(VF); 7682 7683 // Collect the instructions (and their associated costs) that will be more 7684 // profitable to scalarize. 7685 if (VF.isVector()) 7686 CM.collectInstsToScalarize(VF); 7687 } 7688 7689 CM.collectInLoopReductions(); 7690 7691 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7692 LLVM_DEBUG(printPlans(dbgs())); 7693 if (MaxVF.isScalar()) 7694 return VectorizationFactor::Disabled(); 7695 7696 // Select the optimal vectorization factor. 7697 return CM.selectVectorizationFactor(MaxVF); 7698 } 7699 7700 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7701 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7702 << '\n'); 7703 BestVF = VF; 7704 BestUF = UF; 7705 7706 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7707 return !Plan->hasVF(VF); 7708 }); 7709 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7710 } 7711 7712 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7713 DominatorTree *DT) { 7714 // Perform the actual loop transformation. 7715 7716 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7717 VPCallbackILV CallbackILV(ILV); 7718 7719 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7720 7721 VPTransformState State{*BestVF, 7722 BestUF, 7723 OrigLoop, 7724 LI, 7725 DT, 7726 ILV.Builder, 7727 ILV.VectorLoopValueMap, 7728 &ILV, 7729 CallbackILV}; 7730 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7731 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7732 State.CanonicalIV = ILV.Induction; 7733 7734 ILV.printDebugTracesAtStart(); 7735 7736 //===------------------------------------------------===// 7737 // 7738 // Notice: any optimization or new instruction that go 7739 // into the code below should also be implemented in 7740 // the cost-model. 7741 // 7742 //===------------------------------------------------===// 7743 7744 // 2. Copy and widen instructions from the old loop into the new loop. 7745 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7746 VPlans.front()->execute(&State); 7747 7748 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7749 // predication, updating analyses. 7750 ILV.fixVectorizedLoop(); 7751 7752 ILV.printDebugTracesAtEnd(); 7753 } 7754 7755 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7756 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7757 7758 // We create new control-flow for the vectorized loop, so the original exit 7759 // conditions will be dead after vectorization if it's only used by the 7760 // terminator 7761 SmallVector<BasicBlock*> ExitingBlocks; 7762 OrigLoop->getExitingBlocks(ExitingBlocks); 7763 for (auto *BB : ExitingBlocks) { 7764 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7765 if (!Cmp || !Cmp->hasOneUse()) 7766 continue; 7767 7768 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7769 if (!DeadInstructions.insert(Cmp).second) 7770 continue; 7771 7772 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7773 // TODO: can recurse through operands in general 7774 for (Value *Op : Cmp->operands()) { 7775 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7776 DeadInstructions.insert(cast<Instruction>(Op)); 7777 } 7778 } 7779 7780 // We create new "steps" for induction variable updates to which the original 7781 // induction variables map. An original update instruction will be dead if 7782 // all its users except the induction variable are dead. 7783 auto *Latch = OrigLoop->getLoopLatch(); 7784 for (auto &Induction : Legal->getInductionVars()) { 7785 PHINode *Ind = Induction.first; 7786 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7787 7788 // If the tail is to be folded by masking, the primary induction variable, 7789 // if exists, isn't dead: it will be used for masking. Don't kill it. 7790 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7791 continue; 7792 7793 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7794 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7795 })) 7796 DeadInstructions.insert(IndUpdate); 7797 7798 // We record as "Dead" also the type-casting instructions we had identified 7799 // during induction analysis. We don't need any handling for them in the 7800 // vectorized loop because we have proven that, under a proper runtime 7801 // test guarding the vectorized loop, the value of the phi, and the casted 7802 // value of the phi, are the same. 
The last instruction in this casting chain 7803 // will get its scalar/vector/widened def from the scalar/vector/widened def 7804 // of the respective phi node. Any other casts in the induction def-use chain 7805 // have no other uses outside the phi update chain, and will be ignored. 7806 InductionDescriptor &IndDes = Induction.second; 7807 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7808 DeadInstructions.insert(Casts.begin(), Casts.end()); 7809 } 7810 } 7811 7812 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7813 7814 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7815 7816 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7817 Instruction::BinaryOps BinOp) { 7818 // When unrolling and the VF is 1, we only need to add a simple scalar. 7819 Type *Ty = Val->getType(); 7820 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7821 7822 if (Ty->isFloatingPointTy()) { 7823 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7824 7825 // Floating point operations had to be 'fast' to enable the unrolling. 7826 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7827 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7828 } 7829 Constant *C = ConstantInt::get(Ty, StartIdx); 7830 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7831 } 7832 7833 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7834 SmallVector<Metadata *, 4> MDs; 7835 // Reserve first location for self reference to the LoopID metadata node. 7836 MDs.push_back(nullptr); 7837 bool IsUnrollMetadata = false; 7838 MDNode *LoopID = L->getLoopID(); 7839 if (LoopID) { 7840 // First find existing loop unrolling disable metadata. 7841 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7842 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7843 if (MD) { 7844 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7845 IsUnrollMetadata = 7846 S && S->getString().startswith("llvm.loop.unroll.disable"); 7847 } 7848 MDs.push_back(LoopID->getOperand(i)); 7849 } 7850 } 7851 7852 if (!IsUnrollMetadata) { 7853 // Add runtime unroll disable metadata. 7854 LLVMContext &Context = L->getHeader()->getContext(); 7855 SmallVector<Metadata *, 1> DisableOperands; 7856 DisableOperands.push_back( 7857 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7858 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7859 MDs.push_back(DisableNode); 7860 MDNode *NewLoopID = MDNode::get(Context, MDs); 7861 // Set operand 0 to refer to the loop id itself. 7862 NewLoopID->replaceOperandWith(0, NewLoopID); 7863 L->setLoopID(NewLoopID); 7864 } 7865 } 7866 7867 //===--------------------------------------------------------------------===// 7868 // EpilogueVectorizerMainLoop 7869 //===--------------------------------------------------------------------===// 7870 7871 /// This function is partially responsible for generating the control flow 7872 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7873 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7874 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7875 Loop *Lp = createVectorLoopSkeleton(""); 7876 7877 // Generate the code to check the minimum iteration count of the vector 7878 // epilogue (see below). 
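// In this first pass the checks are emitted in the following order: the
// epilogue minimum-iteration check ("iter.check"), the SCEV predicate checks,
// the memory runtime checks, and finally the main-loop minimum-iteration
// check ("vector.main.loop.iter.check"). The resulting blocks are saved in
// EPI so the second (epilogue) pass can reuse them.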
7879 EPI.EpilogueIterationCountCheck = 7880 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7881 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7882 7883 // Generate the code to check any assumptions that we've made for SCEV 7884 // expressions. 7885 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7886 emitSCEVChecks(Lp, LoopScalarPreHeader); 7887 7888 // If a safety check was generated save it. 7889 if (SavedPreHeader != LoopVectorPreHeader) 7890 EPI.SCEVSafetyCheck = SavedPreHeader; 7891 7892 // Generate the code that checks at runtime if arrays overlap. We put the 7893 // checks into a separate block to make the more common case of few elements 7894 // faster. 7895 SavedPreHeader = LoopVectorPreHeader; 7896 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7897 7898 // If a safety check was generated save/overwrite it. 7899 if (SavedPreHeader != LoopVectorPreHeader) 7900 EPI.MemSafetyCheck = SavedPreHeader; 7901 7902 // Generate the iteration count check for the main loop, *after* the check 7903 // for the epilogue loop, so that the path-length is shorter for the case 7904 // that goes directly through the vector epilogue. The longer-path length for 7905 // the main loop is compensated for by the gain from vectorizing the larger 7906 // trip count. Note: the branch will get updated later on when we vectorize 7907 // the epilogue. 7908 EPI.MainLoopIterationCountCheck = 7909 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7910 7911 // Generate the induction variable. 7912 OldInduction = Legal->getPrimaryInduction(); 7913 Type *IdxTy = Legal->getWidestInductionType(); 7914 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7915 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7916 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7917 EPI.VectorTripCount = CountRoundDown; 7918 Induction = 7919 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7920 getDebugLocFromInstOrOperands(OldInduction)); 7921 7922 // Skip induction resume value creation here because they will be created in 7923 // the second pass. If we created them here, they wouldn't be used anyway, 7924 // because the vplan in the second pass still contains the inductions from the 7925 // original loop. 7926 7927 return completeLoopSkeleton(Lp, OrigLoopID); 7928 } 7929 7930 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7931 LLVM_DEBUG({ 7932 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7933 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7934 << ", Main Loop UF:" << EPI.MainLoopUF 7935 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7936 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7937 }); 7938 } 7939 7940 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7941 DEBUG_WITH_TYPE(VerboseDebug, { 7942 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7943 }); 7944 } 7945 7946 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7947 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7948 assert(L && "Expected valid Loop."); 7949 assert(Bypass && "Expected valid bypass basic block."); 7950 unsigned VFactor = 7951 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7952 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7953 Value *Count = getOrCreateTripCount(L); 7954 // Reuse existing vector loop preheader for TC checks. 7955 // Note that new preheader block is generated for vector loop.
7956 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7957 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7958 7959 // Generate code to check if the loop's trip count is less than VF * UF of the 7960 // main vector loop. 7961 auto P = 7962 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7963 7964 Value *CheckMinIters = Builder.CreateICmp( 7965 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7966 "min.iters.check"); 7967 7968 if (!ForEpilogue) 7969 TCCheckBlock->setName("vector.main.loop.iter.check"); 7970 7971 // Create new preheader for vector loop. 7972 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7973 DT, LI, nullptr, "vector.ph"); 7974 7975 if (ForEpilogue) { 7976 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7977 DT->getNode(Bypass)->getIDom()) && 7978 "TC check is expected to dominate Bypass"); 7979 7980 // Update dominator for Bypass & LoopExit. 7981 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7982 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7983 7984 LoopBypassBlocks.push_back(TCCheckBlock); 7985 7986 // Save the trip count so we don't have to regenerate it in the 7987 // vec.epilog.iter.check. This is safe to do because the trip count 7988 // generated here dominates the vector epilog iter check. 7989 EPI.TripCount = Count; 7990 } 7991 7992 ReplaceInstWithInst( 7993 TCCheckBlock->getTerminator(), 7994 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7995 7996 return TCCheckBlock; 7997 } 7998 7999 //===--------------------------------------------------------------------===// 8000 // EpilogueVectorizerEpilogueLoop 8001 //===--------------------------------------------------------------------===// 8002 8003 /// This function is partially responsible for generating the control flow 8004 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8005 BasicBlock * 8006 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8007 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8008 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8009 8010 // Now, compare the remaining count and if there aren't enough iterations to 8011 // execute the vectorized epilogue skip to the scalar part. 8012 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8013 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8014 LoopVectorPreHeader = 8015 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8016 LI, nullptr, "vec.epilog.ph"); 8017 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8018 VecEpilogueIterationCountCheck); 8019 8020 // Adjust the control flow taking the state info from the main loop 8021 // vectorization into account. 
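// Concretely: the main-loop iteration count check is rewired to branch to the
// new epilogue preheader, the saved epilogue count check and the SCEV/memory
// safety checks are redirected to the scalar preheader, and the dominator
// tree is updated for the affected blocks.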
8022 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8023 "expected this to be saved from the previous pass."); 8024 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8025 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8026 8027 DT->changeImmediateDominator(LoopVectorPreHeader, 8028 EPI.MainLoopIterationCountCheck); 8029 8030 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8031 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8032 8033 if (EPI.SCEVSafetyCheck) 8034 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8035 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8036 if (EPI.MemSafetyCheck) 8037 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8038 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8039 8040 DT->changeImmediateDominator( 8041 VecEpilogueIterationCountCheck, 8042 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8043 8044 DT->changeImmediateDominator(LoopScalarPreHeader, 8045 EPI.EpilogueIterationCountCheck); 8046 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8047 8048 // Keep track of bypass blocks, as they feed start values to the induction 8049 // phis in the scalar loop preheader. 8050 if (EPI.SCEVSafetyCheck) 8051 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8052 if (EPI.MemSafetyCheck) 8053 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8054 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8055 8056 // Generate a resume induction for the vector epilogue and put it in the 8057 // vector epilogue preheader 8058 Type *IdxTy = Legal->getWidestInductionType(); 8059 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8060 LoopVectorPreHeader->getFirstNonPHI()); 8061 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8062 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8063 EPI.MainLoopIterationCountCheck); 8064 8065 // Generate the induction variable. 8066 OldInduction = Legal->getPrimaryInduction(); 8067 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8068 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8069 Value *StartIdx = EPResumeVal; 8070 Induction = 8071 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8072 getDebugLocFromInstOrOperands(OldInduction)); 8073 8074 // Generate induction resume values. These variables save the new starting 8075 // indexes for the scalar loop. They are used to test if there are any tail 8076 // iterations left once the vector loop has completed. 8077 // Note that when the vectorized epilogue is skipped due to iteration count 8078 // check, then the resume value for the induction variable comes from 8079 // the trip count of the main vector loop, hence passing the AdditionalBypass 8080 // argument. 
8081 createInductionResumeValues(Lp, CountRoundDown, 8082 {VecEpilogueIterationCountCheck, 8083 EPI.VectorTripCount} /* AdditionalBypass */); 8084 8085 AddRuntimeUnrollDisableMetaData(Lp); 8086 return completeLoopSkeleton(Lp, OrigLoopID); 8087 } 8088 8089 BasicBlock * 8090 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8091 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8092 8093 assert(EPI.TripCount && 8094 "Expected trip count to have been saved in the first pass."); 8095 assert( 8096 (!isa<Instruction>(EPI.TripCount) || 8097 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8098 "saved trip count does not dominate insertion point."); 8099 Value *TC = EPI.TripCount; 8100 IRBuilder<> Builder(Insert->getTerminator()); 8101 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8102 8103 // Generate code to check if the loop's trip count is less than VF * UF of the 8104 // vector epilogue loop. 8105 auto P = 8106 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8107 8108 Value *CheckMinIters = Builder.CreateICmp( 8109 P, Count, 8110 ConstantInt::get(Count->getType(), 8111 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8112 "min.epilog.iters.check"); 8113 8114 ReplaceInstWithInst( 8115 Insert->getTerminator(), 8116 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8117 8118 LoopBypassBlocks.push_back(Insert); 8119 return Insert; 8120 } 8121 8122 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8123 LLVM_DEBUG({ 8124 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8125 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8126 << ", Main Loop UF:" << EPI.MainLoopUF 8127 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8128 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8129 }); 8130 } 8131 8132 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8133 DEBUG_WITH_TYPE(VerboseDebug, { 8134 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8135 }); 8136 } 8137 8138 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8139 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8140 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8141 bool PredicateAtRangeStart = Predicate(Range.Start); 8142 8143 for (ElementCount TmpVF = Range.Start * 2; 8144 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8145 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8146 Range.End = TmpVF; 8147 break; 8148 } 8149 8150 return PredicateAtRangeStart; 8151 } 8152 8153 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8154 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8155 /// of VF's starting at a given VF and extending it as much as possible. Each 8156 /// vectorization decision can potentially shorten this sub-range during 8157 /// buildVPlan(). 8158 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8159 ElementCount MaxVF) { 8160 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8161 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8162 VFRange SubRange = {VF, MaxVFPlusOne}; 8163 VPlans.push_back(buildVPlan(SubRange)); 8164 VF = SubRange.End; 8165 } 8166 } 8167 8168 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8169 VPlanPtr &Plan) { 8170 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8171 8172 // Look for cached value.
8173 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8174 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8175 if (ECEntryIt != EdgeMaskCache.end()) 8176 return ECEntryIt->second; 8177 8178 VPValue *SrcMask = createBlockInMask(Src, Plan); 8179 8180 // The terminator has to be a branch inst! 8181 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8182 assert(BI && "Unexpected terminator found"); 8183 8184 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8185 return EdgeMaskCache[Edge] = SrcMask; 8186 8187 // If source is an exiting block, we know the exit edge is dynamically dead 8188 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8189 // adding uses of an otherwise potentially dead instruction. 8190 if (OrigLoop->isLoopExiting(Src)) 8191 return EdgeMaskCache[Edge] = SrcMask; 8192 8193 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8194 assert(EdgeMask && "No Edge Mask found for condition"); 8195 8196 if (BI->getSuccessor(0) != Dst) 8197 EdgeMask = Builder.createNot(EdgeMask); 8198 8199 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 8200 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 8201 8202 return EdgeMaskCache[Edge] = EdgeMask; 8203 } 8204 8205 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8206 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8207 8208 // Look for cached value. 8209 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8210 if (BCEntryIt != BlockMaskCache.end()) 8211 return BCEntryIt->second; 8212 8213 // All-one mask is modelled as no-mask following the convention for masked 8214 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8215 VPValue *BlockMask = nullptr; 8216 8217 if (OrigLoop->getHeader() == BB) { 8218 if (!CM.blockNeedsPredication(BB)) 8219 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8220 8221 // Create the block in mask as the first non-phi instruction in the block. 8222 VPBuilder::InsertPointGuard Guard(Builder); 8223 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8224 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8225 8226 // Introduce the early-exit compare IV <= BTC to form header block mask. 8227 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8228 // Start by constructing the desired canonical IV. 8229 VPValue *IV = nullptr; 8230 if (Legal->getPrimaryInduction()) 8231 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8232 else { 8233 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8234 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8235 IV = IVRecipe->getVPValue(); 8236 } 8237 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8238 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8239 8240 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8241 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8242 // as a second argument, we only pass the IV here and extract the 8243 // tripcount from the transform state where codegen of the VP instructions 8244 // happen. 8245 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8246 } else { 8247 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8248 } 8249 return BlockMaskCache[BB] = BlockMask; 8250 } 8251 8252 // This is the block mask. We OR all incoming edges. 
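// For example, if BB has predecessors P1 and P2 with edge masks M1 and M2,
// the block mask becomes (M1 | M2). A null edge mask stands for an all-one
// mask, which makes the block mask all-one as well (also modelled as null).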
8253 for (auto *Predecessor : predecessors(BB)) { 8254 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8255 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8256 return BlockMaskCache[BB] = EdgeMask; 8257 8258 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8259 BlockMask = EdgeMask; 8260 continue; 8261 } 8262 8263 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8264 } 8265 8266 return BlockMaskCache[BB] = BlockMask; 8267 } 8268 8269 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8270 VPlanPtr &Plan) { 8271 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8272 "Must be called with either a load or store"); 8273 8274 auto willWiden = [&](ElementCount VF) -> bool { 8275 if (VF.isScalar()) 8276 return false; 8277 LoopVectorizationCostModel::InstWidening Decision = 8278 CM.getWideningDecision(I, VF); 8279 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8280 "CM decision should be taken at this point."); 8281 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8282 return true; 8283 if (CM.isScalarAfterVectorization(I, VF) || 8284 CM.isProfitableToScalarize(I, VF)) 8285 return false; 8286 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8287 }; 8288 8289 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8290 return nullptr; 8291 8292 VPValue *Mask = nullptr; 8293 if (Legal->isMaskRequired(I)) 8294 Mask = createBlockInMask(I->getParent(), Plan); 8295 8296 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8297 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8298 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8299 8300 StoreInst *Store = cast<StoreInst>(I); 8301 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8302 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8303 } 8304 8305 VPWidenIntOrFpInductionRecipe * 8306 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8307 // Check if this is an integer or fp induction. If so, build the recipe that 8308 // produces its scalar and vector values. 8309 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8310 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8311 II.getKind() == InductionDescriptor::IK_FpInduction) { 8312 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8313 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8314 } 8315 8316 return nullptr; 8317 } 8318 8319 VPWidenIntOrFpInductionRecipe * 8320 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8321 VPlan &Plan) const { 8322 // Optimize the special case where the source is a constant integer 8323 // induction variable. Notice that we can only optimize the 'trunc' case 8324 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8325 // (c) other casts depend on pointer size. 8326 8327 // Determine whether \p K is a truncation based on an induction variable that 8328 // can be optimized. 
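// Editor's note (illustrative sketch, not part of this pass): for a block with
// several predecessors, createBlockInMask() above ORs the incoming edge masks,
// and createEdgeMask() ANDs the source block's mask with the branch condition
// (or its negation). A standalone per-lane model of an if/else diamond, with
// invented names:
//
//   #include <array>
//   #include <cassert>
//
//   int main() {
//     constexpr unsigned VF = 4;
//     using Mask = std::array<bool, VF>;
//     Mask Header = {true, true, true, true};   // "no mask" / all-one
//     Mask Cond   = {true, false, true, false}; // widened branch condition
//
//     Mask ThenMask, ElseMask, JoinMask;
//     for (unsigned L = 0; L < VF; ++L) {
//       ThenMask[L] = Header[L] && Cond[L];       // edge header -> then
//       ElseMask[L] = Header[L] && !Cond[L];      // edge header -> else
//       JoinMask[L] = ThenMask[L] || ElseMask[L]; // block-in mask of the join
//     }
//     for (unsigned L = 0; L < VF; ++L)
//       assert(JoinMask[L] == Header[L]); // the join is reached on every lane
//     return 0;
//   }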
8329 auto isOptimizableIVTruncate = 8330 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8331 return [=](ElementCount VF) -> bool { 8332 return CM.isOptimizableIVTruncate(K, VF); 8333 }; 8334 }; 8335 8336 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8337 isOptimizableIVTruncate(I), Range)) { 8338 8339 InductionDescriptor II = 8340 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8341 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8342 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8343 Start, I); 8344 } 8345 return nullptr; 8346 } 8347 8348 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8349 // We know that all PHIs in non-header blocks are converted into selects, so 8350 // we don't have to worry about the insertion order and we can just use the 8351 // builder. At this point we generate the predication tree. There may be 8352 // duplications since this is a simple recursive scan, but future 8353 // optimizations will clean it up. 8354 8355 SmallVector<VPValue *, 2> Operands; 8356 unsigned NumIncoming = Phi->getNumIncomingValues(); 8357 for (unsigned In = 0; In < NumIncoming; In++) { 8358 VPValue *EdgeMask = 8359 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8360 assert((EdgeMask || NumIncoming == 1) && 8361 "Multiple predecessors with one having a full mask"); 8362 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8363 if (EdgeMask) 8364 Operands.push_back(EdgeMask); 8365 } 8366 return new VPBlendRecipe(Phi, Operands); 8367 } 8368 8369 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8370 VPlan &Plan) const { 8371 8372 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8373 [this, CI](ElementCount VF) { 8374 return CM.isScalarWithPredication(CI, VF); 8375 }, 8376 Range); 8377 8378 if (IsPredicated) 8379 return nullptr; 8380 8381 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8382 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8383 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8384 ID == Intrinsic::pseudoprobe || 8385 ID == Intrinsic::experimental_noalias_scope_decl)) 8386 return nullptr; 8387 8388 auto willWiden = [&](ElementCount VF) -> bool { 8389 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8390 // The following case may be scalarized depending on the VF. 8391 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8392 // version of the instruction. 8393 // Is it beneficial to perform intrinsic call compared to lib call? 8394 bool NeedToScalarize = false; 8395 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8396 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8397 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8398 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8399 "Cannot have invalid costs while widening"); 8400 return UseVectorIntrinsic || !NeedToScalarize; 8401 }; 8402 8403 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8404 return nullptr; 8405 8406 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8407 } 8408 8409 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8410 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8411 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8412 // Instruction should be widened, unless it is scalar after vectorization, 8413 // scalarization is profitable or it is predicated. 8414 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8415 return CM.isScalarAfterVectorization(I, VF) || 8416 CM.isProfitableToScalarize(I, VF) || 8417 CM.isScalarWithPredication(I, VF); 8418 }; 8419 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8420 Range); 8421 } 8422 8423 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8424 auto IsVectorizableOpcode = [](unsigned Opcode) { 8425 switch (Opcode) { 8426 case Instruction::Add: 8427 case Instruction::And: 8428 case Instruction::AShr: 8429 case Instruction::BitCast: 8430 case Instruction::FAdd: 8431 case Instruction::FCmp: 8432 case Instruction::FDiv: 8433 case Instruction::FMul: 8434 case Instruction::FNeg: 8435 case Instruction::FPExt: 8436 case Instruction::FPToSI: 8437 case Instruction::FPToUI: 8438 case Instruction::FPTrunc: 8439 case Instruction::FRem: 8440 case Instruction::FSub: 8441 case Instruction::ICmp: 8442 case Instruction::IntToPtr: 8443 case Instruction::LShr: 8444 case Instruction::Mul: 8445 case Instruction::Or: 8446 case Instruction::PtrToInt: 8447 case Instruction::SDiv: 8448 case Instruction::Select: 8449 case Instruction::SExt: 8450 case Instruction::Shl: 8451 case Instruction::SIToFP: 8452 case Instruction::SRem: 8453 case Instruction::Sub: 8454 case Instruction::Trunc: 8455 case Instruction::UDiv: 8456 case Instruction::UIToFP: 8457 case Instruction::URem: 8458 case Instruction::Xor: 8459 case Instruction::ZExt: 8460 return true; 8461 } 8462 return false; 8463 }; 8464 8465 if (!IsVectorizableOpcode(I->getOpcode())) 8466 return nullptr; 8467 8468 // Success: widen this instruction. 8469 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8470 } 8471 8472 VPBasicBlock *VPRecipeBuilder::handleReplication( 8473 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8474 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8475 VPlanPtr &Plan) { 8476 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8477 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8478 Range); 8479 8480 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8481 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8482 Range); 8483 8484 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8485 IsUniform, IsPredicated); 8486 setRecipe(I, Recipe); 8487 Plan->addVPValue(I, Recipe); 8488 8489 // Find if I uses a predicated instruction. If so, it will use its scalar 8490 // value. Avoid hoisting the insert-element which packs the scalar value into 8491 // a vector value, as that happens iff all users use the vector value. 
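// Editor's note (illustrative sketch, not part of this pass): the willWiden
// lambda above weighs the cost of a widened call (which may have to be
// scalarized when no vector library function exists) against the cost of a
// vector intrinsic, when one is available. A standalone model with made-up
// costs and names:
//
//   #include <cassert>
//
//   struct CallCosts {
//     unsigned CallCost;      // cost of the vector (or scalarized) call
//     bool NeedToScalarize;   // no real vector library function available
//     bool HasIntrinsic;      // a vector intrinsic exists for this call
//     unsigned IntrinsicCost; // only meaningful when HasIntrinsic is true
//   };
//
//   // Mirrors "UseVectorIntrinsic || !NeedToScalarize".
//   static bool willWidenCall(const CallCosts &C) {
//     bool UseIntrinsic = C.HasIntrinsic && C.IntrinsicCost <= C.CallCost;
//     return UseIntrinsic || !C.NeedToScalarize;
//   }
//
//   int main() {
//     // Cheap intrinsic beats a scalarized call: widen.
//     assert(willWidenCall({10, true, true, 4}));
//     // No intrinsic and no vector library function: do not widen.
//     assert(!willWidenCall({10, true, false, 0}));
//     return 0;
//   }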
8492 for (auto &Op : I->operands()) 8493 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8494 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8495 PredInst2Recipe[PredInst]->setAlsoPack(false); 8496 8497 // Finalize the recipe for Instr, first if it is not predicated. 8498 if (!IsPredicated) { 8499 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8500 VPBB->appendRecipe(Recipe); 8501 return VPBB; 8502 } 8503 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8504 assert(VPBB->getSuccessors().empty() && 8505 "VPBB has successors when handling predicated replication."); 8506 // Record predicated instructions for above packing optimizations. 8507 PredInst2Recipe[I] = Recipe; 8508 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8509 VPBlockUtils::insertBlockAfter(Region, VPBB); 8510 auto *RegSucc = new VPBasicBlock(); 8511 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8512 return RegSucc; 8513 } 8514 8515 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8516 VPRecipeBase *PredRecipe, 8517 VPlanPtr &Plan) { 8518 // Instructions marked for predication are replicated and placed under an 8519 // if-then construct to prevent side-effects. 8520 8521 // Generate recipes to compute the block mask for this region. 8522 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8523 8524 // Build the triangular if-then region. 8525 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8526 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8527 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8528 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8529 auto *PHIRecipe = Instr->getType()->isVoidTy() 8530 ? nullptr 8531 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8532 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8533 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8534 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8535 8536 // Note: first set Entry as region entry and then connect successors starting 8537 // from it in order, to propagate the "parent" of each VPBasicBlock. 8538 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8539 VPBlockUtils::connectBlocks(Pred, Exit); 8540 8541 return Region; 8542 } 8543 8544 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8545 VFRange &Range, 8546 VPlanPtr &Plan) { 8547 // First, check for specific widening recipes that deal with calls, memory 8548 // operations, inductions and Phi nodes. 
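// Editor's note (illustrative sketch, not part of this pass): the replicate
// region built above guards each scalarized lane with its mask bit, so a lane
// whose bit is false never executes the (possibly trapping) instruction, and a
// phi in the continue block keeps the previous value for inactive lanes. A
// standalone per-lane model of a predicated division, with invented data:
//
//   #include <array>
//   #include <cassert>
//
//   int main() {
//     constexpr unsigned VF = 4;
//     std::array<int, VF> A = {8, 7, 9, 5};
//     std::array<int, VF> B = {2, 0, 3, 0};     // zero lanes must be skipped
//     std::array<bool, VF> Mask = {true, false, true, false}; // B[L] != 0
//     std::array<int, VF> R = {-1, -1, -1, -1}; // placeholder ("poison")
//
//     for (unsigned L = 0; L < VF; ++L)
//       if (Mask[L])          // pred.sdiv.if: only active lanes divide
//         R[L] = A[L] / B[L]; // pred.sdiv.continue: R[L] is kept otherwise
//
//     assert(R[0] == 4 && R[2] == 3 && R[1] == -1);
//     return 0;
//   }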
8549   if (auto *CI = dyn_cast<CallInst>(Instr))
8550     return tryToWidenCall(CI, Range, *Plan);
8551
8552   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8553     return tryToWidenMemory(Instr, Range, Plan);
8554
8555   VPRecipeBase *Recipe;
8556   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8557     if (Phi->getParent() != OrigLoop->getHeader())
8558       return tryToBlend(Phi, Plan);
8559     if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8560       return Recipe;
8561
8562     if (Legal->isReductionVariable(Phi)) {
8563       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8564       VPValue *StartV =
8565           Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8566       return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
8567     }
8568
8569     return new VPWidenPHIRecipe(Phi);
8570   }
8571
8572   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8573                                     cast<TruncInst>(Instr), Range, *Plan)))
8574     return Recipe;
8575
8576   if (!shouldWiden(Instr, Range))
8577     return nullptr;
8578
8579   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8580     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8581                                 OrigLoop);
8582
8583   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8584     bool InvariantCond =
8585         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8586     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8587                                    InvariantCond);
8588   }
8589
8590   return tryToWiden(Instr, *Plan);
8591 }
8592
8593 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8594                                                         ElementCount MaxVF) {
8595   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8596
8597   // Collect instructions from the original loop that will become trivially dead
8598   // in the vectorized loop. We don't need to vectorize these instructions. For
8599   // example, original induction update instructions can become dead because we
8600   // separately emit induction "steps" when generating code for the new loop.
8601   // Similarly, we create a new latch condition when setting up the structure
8602   // of the new loop, so the old one can become dead.
8603   SmallPtrSet<Instruction *, 4> DeadInstructions;
8604   collectTriviallyDeadInstructions(DeadInstructions);
8605
8606   // Add assume instructions we need to drop to DeadInstructions, to prevent
8607   // them from being added to the VPlan.
8608   // TODO: We only need to drop assumes in blocks that get flattened. If the
8609   // control flow is preserved, we should keep them.
8610   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8611   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8612
8613   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8614   // Dead instructions do not need sinking. Remove them from SinkAfter.
8615   for (Instruction *I : DeadInstructions)
8616     SinkAfter.erase(I);
8617
8618   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8619   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8620     VFRange SubRange = {VF, MaxVFPlusOne};
8621     VPlans.push_back(
8622         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8623     VF = SubRange.End;
8624   }
8625 }
8626
8627 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8628     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8629     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8630
8631   // Hold a mapping from predicated instructions to their recipes, in order to
8632   // fix their AlsoPack behavior if a user is determined to replicate and use a
8633   // scalar instead of vector value.
8634   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8635
8636   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8637
8638   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8639
8640   // ---------------------------------------------------------------------------
8641   // Pre-construction: record ingredients whose recipes we'll need to further
8642   // process after constructing the initial VPlan.
8643   // ---------------------------------------------------------------------------
8644
8645   // Mark instructions we'll need to sink later and their targets as
8646   // ingredients whose recipe we'll need to record.
8647   for (auto &Entry : SinkAfter) {
8648     RecipeBuilder.recordRecipeOf(Entry.first);
8649     RecipeBuilder.recordRecipeOf(Entry.second);
8650   }
8651   for (auto &Reduction : CM.getInLoopReductionChains()) {
8652     PHINode *Phi = Reduction.first;
8653     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8654     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8655
8656     RecipeBuilder.recordRecipeOf(Phi);
8657     for (auto &R : ReductionOperations) {
8658       RecipeBuilder.recordRecipeOf(R);
8659       // For min/max reductions, where we have a pair of icmp/select, we also
8660       // need to record the ICmp recipe, so it can be removed later.
8661       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8662         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8663     }
8664   }
8665
8666   // For each interleave group which is relevant for this (possibly trimmed)
8667   // Range, add it to the set of groups to be later applied to the VPlan and add
8668   // placeholders for its members' Recipes which we'll be replacing with a
8669   // single VPInterleaveRecipe.
8670   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8671     auto applyIG = [IG, this](ElementCount VF) -> bool {
8672       return (VF.isVector() && // Query is illegal for VF == 1
8673               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8674                   LoopVectorizationCostModel::CM_Interleave);
8675     };
8676     if (!getDecisionAndClampRange(applyIG, Range))
8677       continue;
8678     InterleaveGroups.insert(IG);
8679     for (unsigned i = 0; i < IG->getFactor(); i++)
8680       if (Instruction *Member = IG->getMember(i))
8681         RecipeBuilder.recordRecipeOf(Member);
8682   }
8683
8684   // ---------------------------------------------------------------------------
8685   // Build initial VPlan: Scan the body of the loop in a topological order to
8686   // visit each basic block after having visited its predecessor basic blocks.
8687   // ---------------------------------------------------------------------------
8688
8689   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8690   auto Plan = std::make_unique<VPlan>();
8691   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8692   Plan->setEntry(VPBB);
8693
8694   // Scan the body of the loop in a topological order to visit each basic block
8695   // after having visited its predecessor basic blocks.
8696   LoopBlocksDFS DFS(OrigLoop);
8697   DFS.perform(LI);
8698
8699   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8700     // Relevant instructions from basic block BB will be grouped into VPRecipe
8701     // ingredients and fill a new VPBasicBlock.
8702     unsigned VPBBsForBB = 0;
8703     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8704     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8705     VPBB = FirstVPBBForBB;
8706     Builder.setInsertPoint(VPBB);
8707
8708     // Introduce each ingredient into VPlan.
8709     // TODO: Model and preserve debug intrinsics in VPlan.
8710     for (Instruction &I : BB->instructionsWithoutDebug()) {
8711       Instruction *Instr = &I;
8712
8713       // First filter out irrelevant instructions, to ensure no recipes are
8714       // built for them.
8715       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8716         continue;
8717
8718       if (auto Recipe =
8719               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8720         for (auto *Def : Recipe->definedValues()) {
8721           auto *UV = Def->getUnderlyingValue();
8722           Plan->addVPValue(UV, Def);
8723         }
8724
8725         RecipeBuilder.setRecipe(Instr, Recipe);
8726         VPBB->appendRecipe(Recipe);
8727         continue;
8728       }
8729
8730       // Otherwise, if all widening options failed, Instruction is to be
8731       // replicated. This may create a successor for VPBB.
8732       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8733           Instr, Range, VPBB, PredInst2Recipe, Plan);
8734       if (NextVPBB != VPBB) {
8735         VPBB = NextVPBB;
8736         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8737                                     : "");
8738       }
8739     }
8740   }
8741
8742   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8743   // may also be empty, such as the last one (VPBB), reflecting original
8744   // basic-blocks with no recipes.
8745   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8746   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8747   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8748   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8749   delete PreEntry;
8750
8751   // ---------------------------------------------------------------------------
8752   // Transform initial VPlan: Apply previously taken decisions, in order, to
8753   // bring the VPlan to its final state.
8754   // ---------------------------------------------------------------------------
8755
8756   // Apply Sink-After legal constraints.
8757   for (auto &Entry : SinkAfter) {
8758     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8759     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8760     // If the target is in a replication region, make sure to move Sink to the
8761     // block after it, not into the replication region itself.
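// Editor's note (illustrative sketch, not part of this pass): the traversal
// above visits the loop body in reverse post-order so every block is handled
// after its predecessors, which is what lets edge and block masks find their
// inputs already created. A standalone RPO computation over a tiny acyclic
// CFG, with invented names:
//
//   #include <cassert>
//   #include <vector>
//
//   static void postOrder(unsigned N,
//                         const std::vector<std::vector<unsigned>> &Succ,
//                         std::vector<bool> &Seen, std::vector<unsigned> &PO) {
//     Seen[N] = true;
//     for (unsigned S : Succ[N])
//       if (!Seen[S])
//         postOrder(S, Succ, Seen, PO);
//     PO.push_back(N);
//   }
//
//   int main() {
//     // A diamond: 0 -> {1,2}, 1 -> {3}, 2 -> {3}, 3 -> {}.
//     std::vector<std::vector<unsigned>> Succ = {{1, 2}, {3}, {3}, {}};
//     std::vector<bool> Seen(4, false);
//     std::vector<unsigned> PO;
//     postOrder(0, Succ, Seen, PO);
//     std::vector<unsigned> RPO(PO.rbegin(), PO.rend());
//     assert(RPO.front() == 0 && RPO.back() == 3); // entry first, join last
//     return 0;
//   }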
8762 if (auto *Region = 8763 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8764 if (Region->isReplicator()) { 8765 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8766 VPBasicBlock *NextBlock = 8767 cast<VPBasicBlock>(Region->getSuccessors().front()); 8768 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8769 continue; 8770 } 8771 } 8772 Sink->moveAfter(Target); 8773 } 8774 8775 // Interleave memory: for each Interleave Group we marked earlier as relevant 8776 // for this VPlan, replace the Recipes widening its memory instructions with a 8777 // single VPInterleaveRecipe at its insertion point. 8778 for (auto IG : InterleaveGroups) { 8779 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8780 RecipeBuilder.getRecipe(IG->getInsertPos())); 8781 SmallVector<VPValue *, 4> StoredValues; 8782 for (unsigned i = 0; i < IG->getFactor(); ++i) 8783 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8784 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8785 8786 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8787 Recipe->getMask()); 8788 VPIG->insertBefore(Recipe); 8789 unsigned J = 0; 8790 for (unsigned i = 0; i < IG->getFactor(); ++i) 8791 if (Instruction *Member = IG->getMember(i)) { 8792 if (!Member->getType()->isVoidTy()) { 8793 VPValue *OriginalV = Plan->getVPValue(Member); 8794 Plan->removeVPValueFor(Member); 8795 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8796 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8797 J++; 8798 } 8799 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8800 } 8801 } 8802 8803 // Adjust the recipes for any inloop reductions. 8804 if (Range.Start.isVector()) 8805 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8806 8807 // Finally, if tail is folded by masking, introduce selects between the phi 8808 // and the live-out instruction of each reduction, at the end of the latch. 8809 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8810 Builder.setInsertPoint(VPBB); 8811 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8812 for (auto &Reduction : Legal->getReductionVars()) { 8813 if (CM.isInLoopReduction(Reduction.first)) 8814 continue; 8815 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8816 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8817 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8818 } 8819 } 8820 8821 std::string PlanName; 8822 raw_string_ostream RSO(PlanName); 8823 ElementCount VF = Range.Start; 8824 Plan->addVF(VF); 8825 RSO << "Initial VPlan for VF={" << VF; 8826 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8827 Plan->addVF(VF); 8828 RSO << "," << VF; 8829 } 8830 RSO << "},UF>=1"; 8831 RSO.flush(); 8832 Plan->setName(PlanName); 8833 8834 return Plan; 8835 } 8836 8837 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8838 // Outer loop handling: They may require CFG and instruction level 8839 // transformations before even evaluating whether vectorization is profitable. 8840 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8841 // the vectorization pipeline. 
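// Editor's note (illustrative sketch, not part of this pass): the single
// VPInterleaveRecipe introduced above stands for one wide access whose lanes
// are then distributed to (or, for stores, gathered from) the group members.
// A standalone model of de-interleaving a factor-2 load, e.g. the x/y fields
// of an array of pairs (all data invented):
//
//   #include <array>
//   #include <cassert>
//
//   int main() {
//     constexpr unsigned VF = 4, Factor = 2;
//     // One wide, consecutive load of VF * Factor elements...
//     std::array<int, VF * Factor> Wide = {0, 10, 1, 11, 2, 12, 3, 13};
//     // ...shuffled into one vector per interleave-group member.
//     std::array<int, VF> Member0, Member1;
//     for (unsigned L = 0; L < VF; ++L) {
//       Member0[L] = Wide[L * Factor + 0]; // strided accesses a[i].x
//       Member1[L] = Wide[L * Factor + 1]; // strided accesses a[i].y
//     }
//     assert(Member0[2] == 2 && Member1[2] == 12);
//     return 0;
//   }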
8842   assert(!OrigLoop->isInnermost());
8843   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8844
8845   // Create new empty VPlan
8846   auto Plan = std::make_unique<VPlan>();
8847
8848   // Build hierarchical CFG
8849   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8850   HCFGBuilder.buildHierarchicalCFG();
8851
8852   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8853        VF *= 2)
8854     Plan->addVF(VF);
8855
8856   if (EnableVPlanPredication) {
8857     VPlanPredicator VPP(*Plan);
8858     VPP.predicate();
8859
8860     // Avoid running transformation to recipes until masked code generation in
8861     // VPlan-native path is in place.
8862     return Plan;
8863   }
8864
8865   SmallPtrSet<Instruction *, 1> DeadInstructions;
8866   VPlanTransforms::VPInstructionsToVPRecipes(
8867       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8868   return Plan;
8869 }
8870
8871 // Adjust the recipes for any inloop reductions. The chain of instructions
8872 // leading from the loop exit instr to the phi needs to be converted to
8873 // reductions, with one operand being vector and the other being the scalar
8874 // reduction chain.
8875 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8876     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8877   for (auto &Reduction : CM.getInLoopReductionChains()) {
8878     PHINode *Phi = Reduction.first;
8879     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8880     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8881
8882     // ReductionOperations are ordered top-down from the phi's use to the
8883     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8884     // which of the two operands will remain scalar and which will be reduced.
8885     // For minmax the chain will be the select instructions.
8886     Instruction *Chain = Phi;
8887     for (Instruction *R : ReductionOperations) {
8888       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8889       RecurKind Kind = RdxDesc.getRecurrenceKind();
8890
8891       VPValue *ChainOp = Plan->getVPValue(Chain);
8892       unsigned FirstOpId;
8893       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8894         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8895                "Expected to replace a VPWidenSelectSC");
8896         FirstOpId = 1;
8897       } else {
8898         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8899                "Expected to replace a VPWidenSC");
8900         FirstOpId = 0;
8901       }
8902       unsigned VecOpId =
8903           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8904       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8905
8906       auto *CondOp = CM.foldTailByMasking()
8907                          ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8908 : nullptr; 8909 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8910 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8911 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8912 Plan->removeVPValueFor(R); 8913 Plan->addVPValue(R, RedRecipe); 8914 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8915 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8916 WidenRecipe->eraseFromParent(); 8917 8918 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8919 VPRecipeBase *CompareRecipe = 8920 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8921 assert(isa<VPWidenRecipe>(CompareRecipe) && 8922 "Expected to replace a VPWidenSC"); 8923 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8924 "Expected no remaining users"); 8925 CompareRecipe->eraseFromParent(); 8926 } 8927 Chain = R; 8928 } 8929 } 8930 } 8931 8932 Value* LoopVectorizationPlanner::VPCallbackILV:: 8933 getOrCreateVectorValues(Value *V, unsigned Part) { 8934 return ILV.getOrCreateVectorValue(V, Part); 8935 } 8936 8937 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8938 Value *V, const VPIteration &Instance) { 8939 return ILV.getOrCreateScalarValue(V, Instance); 8940 } 8941 8942 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8943 VPSlotTracker &SlotTracker) const { 8944 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8945 IG->getInsertPos()->printAsOperand(O, false); 8946 O << ", "; 8947 getAddr()->printAsOperand(O, SlotTracker); 8948 VPValue *Mask = getMask(); 8949 if (Mask) { 8950 O << ", "; 8951 Mask->printAsOperand(O, SlotTracker); 8952 } 8953 for (unsigned i = 0; i < IG->getFactor(); ++i) 8954 if (Instruction *I = IG->getMember(i)) 8955 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8956 } 8957 8958 void VPWidenCallRecipe::execute(VPTransformState &State) { 8959 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8960 *this, State); 8961 } 8962 8963 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8964 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8965 this, *this, InvariantCond, State); 8966 } 8967 8968 void VPWidenRecipe::execute(VPTransformState &State) { 8969 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8970 } 8971 8972 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8973 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8974 *this, State.UF, State.VF, IsPtrLoopInvariant, 8975 IsIndexLoopInvariant, State); 8976 } 8977 8978 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8979 assert(!State.Instance && "Int or FP induction being replicated."); 8980 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 8981 Trunc); 8982 } 8983 8984 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8985 Value *StartV = 8986 getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; 8987 State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); 8988 } 8989 8990 void VPBlendRecipe::execute(VPTransformState &State) { 8991 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8992 // We know that all PHIs in non-header blocks are converted into 8993 // selects, so we don't have to worry about the insertion order and we 8994 // can just use the builder. 8995 // At this point we generate the predication tree. 
There may be 8996 // duplications since this is a simple recursive scan, but future 8997 // optimizations will clean it up. 8998 8999 unsigned NumIncoming = getNumIncomingValues(); 9000 9001 // Generate a sequence of selects of the form: 9002 // SELECT(Mask3, In3, 9003 // SELECT(Mask2, In2, 9004 // SELECT(Mask1, In1, 9005 // In0))) 9006 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9007 // are essentially undef are taken from In0. 9008 InnerLoopVectorizer::VectorParts Entry(State.UF); 9009 for (unsigned In = 0; In < NumIncoming; ++In) { 9010 for (unsigned Part = 0; Part < State.UF; ++Part) { 9011 // We might have single edge PHIs (blocks) - use an identity 9012 // 'select' for the first PHI operand. 9013 Value *In0 = State.get(getIncomingValue(In), Part); 9014 if (In == 0) 9015 Entry[Part] = In0; // Initialize with the first incoming value. 9016 else { 9017 // Select between the current value and the previous incoming edge 9018 // based on the incoming mask. 9019 Value *Cond = State.get(getMask(In), Part); 9020 Entry[Part] = 9021 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9022 } 9023 } 9024 } 9025 for (unsigned Part = 0; Part < State.UF; ++Part) 9026 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 9027 } 9028 9029 void VPInterleaveRecipe::execute(VPTransformState &State) { 9030 assert(!State.Instance && "Interleave group being replicated."); 9031 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9032 getStoredValues(), getMask()); 9033 } 9034 9035 void VPReductionRecipe::execute(VPTransformState &State) { 9036 assert(!State.Instance && "Reduction being replicated."); 9037 for (unsigned Part = 0; Part < State.UF; ++Part) { 9038 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9039 Value *NewVecOp = State.get(getVecOp(), Part); 9040 if (VPValue *Cond = getCondOp()) { 9041 Value *NewCond = State.get(Cond, Part); 9042 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9043 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9044 Kind, VecTy->getElementType()); 9045 Constant *IdenVec = 9046 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9047 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9048 NewVecOp = Select; 9049 } 9050 Value *NewRed = 9051 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9052 Value *PrevInChain = State.get(getChainOp(), Part); 9053 Value *NextInChain; 9054 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9055 NextInChain = 9056 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9057 NewRed, PrevInChain); 9058 } else { 9059 NextInChain = State.Builder.CreateBinOp( 9060 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9061 PrevInChain); 9062 } 9063 State.set(this, getUnderlyingInstr(), NextInChain, Part); 9064 } 9065 } 9066 9067 void VPReplicateRecipe::execute(VPTransformState &State) { 9068 if (State.Instance) { // Generate a single instance. 9069 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9070 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 9071 *State.Instance, IsPredicated, State); 9072 // Insert scalar instance packing it into a vector. 9073 if (AlsoPack && State.VF.isVector()) { 9074 // If we're constructing lane 0, initialize to start from poison. 
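// Editor's note (illustrative sketch, not part of this pass): the reduction
// recipe executed above reduces the vector operand to a scalar every iteration
// and folds it into the scalar chain; under tail folding, inactive lanes are
// first replaced by the reduction's identity so they cannot change the result.
// A standalone model of a masked integer add reduction (all data invented):
//
//   #include <array>
//   #include <cassert>
//
//   int main() {
//     constexpr unsigned VF = 4;
//     std::array<int, VF> VecOp = {3, 5, 7, 11};
//     std::array<bool, VF> Cond = {true, true, true, false}; // block-in mask
//     const int Identity = 0;                                // for integer add
//
//     int Reduced = 0;
//     for (unsigned L = 0; L < VF; ++L)
//       Reduced += Cond[L] ? VecOp[L] : Identity; // select + target reduction
//
//     int Chain = 100;            // value carried by the reduction phi
//     int Next = Chain + Reduced; // CreateBinOp(Add, NewRed, PrevInChain)
//     assert(Next == 115);
//     return 0;
//   }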
9075     if (State.Instance->Lane == 0) {
9076       assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9077       Value *Poison = PoisonValue::get(
9078           VectorType::get(getUnderlyingValue()->getType(), State.VF));
9079       State.ValueMap.setVectorValue(getUnderlyingInstr(),
9080                                     State.Instance->Part, Poison);
9081     }
9082     State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
9083                                          *State.Instance);
9084   }
9085   return;
9086 }
9087
9088   // Generate scalar instances for all VF lanes of all UF parts, unless the
9089   // instruction is uniform, in which case generate only the first lane for each
9090   // of the UF parts.
9091   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9092   assert((!State.VF.isScalable() || IsUniform) &&
9093          "Can't scalarize a scalable vector");
9094   for (unsigned Part = 0; Part < State.UF; ++Part)
9095     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9096       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
9097                                       IsPredicated, State);
9098 }
9099
9100 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9101   assert(State.Instance && "Branch on Mask works only on single instance.");
9102
9103   unsigned Part = State.Instance->Part;
9104   unsigned Lane = State.Instance->Lane;
9105
9106   Value *ConditionBit = nullptr;
9107   VPValue *BlockInMask = getMask();
9108   if (BlockInMask) {
9109     ConditionBit = State.get(BlockInMask, Part);
9110     if (ConditionBit->getType()->isVectorTy())
9111       ConditionBit = State.Builder.CreateExtractElement(
9112           ConditionBit, State.Builder.getInt32(Lane));
9113   } else // Block in mask is all-one.
9114     ConditionBit = State.Builder.getTrue();
9115
9116   // Replace the temporary unreachable terminator with a new conditional branch,
9117   // whose two destinations will be set later when they are created.
9118   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9119   assert(isa<UnreachableInst>(CurrentTerminator) &&
9120          "Expected to replace unreachable terminator with conditional branch.");
9121   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9122   CondBr->setSuccessor(0, nullptr);
9123   ReplaceInstWithInst(CurrentTerminator, CondBr);
9124 }
9125
9126 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9127   assert(State.Instance && "Predicated instruction PHI works per instance.");
9128   Instruction *ScalarPredInst =
9129       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9130   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9131   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9132   assert(PredicatingBB && "Predicated block has no single predecessor.");
9133
9134   // By current pack/unpack logic we need to generate only a single phi node: if
9135   // a vector value for the predicated instruction exists at this point it means
9136   // the instruction has vector users only, and a phi for the vector value is
9137   // needed. In this case the recipe of the predicated instruction is marked to
9138   // also do that packing, thereby "hoisting" the insert-element sequence.
9139   // Otherwise, a phi node for the scalar value is needed.
9140 unsigned Part = State.Instance->Part; 9141 Instruction *PredInst = 9142 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 9143 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 9144 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 9145 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9146 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9147 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9148 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9149 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 9150 } else { 9151 Type *PredInstType = PredInst->getType(); 9152 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9153 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 9154 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9155 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 9156 } 9157 } 9158 9159 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9160 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9161 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9162 StoredValue ? nullptr : getVPValue(), 9163 getAddr(), StoredValue, getMask()); 9164 } 9165 9166 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9167 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9168 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9169 // for predication. 9170 static ScalarEpilogueLowering getScalarEpilogueLowering( 9171 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9172 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9173 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9174 LoopVectorizationLegality &LVL) { 9175 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9176 // don't look at hints or options, and don't request a scalar epilogue. 9177 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9178 // LoopAccessInfo (due to code dependency and not being able to reliably get 9179 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9180 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9181 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9182 // back to the old way and vectorize with versioning when forced. See D81345.) 
9183 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9184 PGSOQueryType::IRPass) && 9185 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9186 return CM_ScalarEpilogueNotAllowedOptSize; 9187 9188 // 2) If set, obey the directives 9189 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9190 switch (PreferPredicateOverEpilogue) { 9191 case PreferPredicateTy::ScalarEpilogue: 9192 return CM_ScalarEpilogueAllowed; 9193 case PreferPredicateTy::PredicateElseScalarEpilogue: 9194 return CM_ScalarEpilogueNotNeededUsePredicate; 9195 case PreferPredicateTy::PredicateOrDontVectorize: 9196 return CM_ScalarEpilogueNotAllowedUsePredicate; 9197 }; 9198 } 9199 9200 // 3) If set, obey the hints 9201 switch (Hints.getPredicate()) { 9202 case LoopVectorizeHints::FK_Enabled: 9203 return CM_ScalarEpilogueNotNeededUsePredicate; 9204 case LoopVectorizeHints::FK_Disabled: 9205 return CM_ScalarEpilogueAllowed; 9206 }; 9207 9208 // 4) if the TTI hook indicates this is profitable, request predication. 9209 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9210 LVL.getLAI())) 9211 return CM_ScalarEpilogueNotNeededUsePredicate; 9212 9213 return CM_ScalarEpilogueAllowed; 9214 } 9215 9216 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 9217 unsigned Part) { 9218 set(Def, V, Part); 9219 ILV->setVectorValue(IRDef, Part, V); 9220 } 9221 9222 // Process the loop in the VPlan-native vectorization path. This path builds 9223 // VPlan upfront in the vectorization pipeline, which allows to apply 9224 // VPlan-to-VPlan transformations from the very beginning without modifying the 9225 // input LLVM IR. 9226 static bool processLoopInVPlanNativePath( 9227 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9228 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9229 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9230 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9231 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9232 9233 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9234 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9235 return false; 9236 } 9237 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9238 Function *F = L->getHeader()->getParent(); 9239 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9240 9241 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9242 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9243 9244 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9245 &Hints, IAI); 9246 // Use the planner for outer loop vectorization. 9247 // TODO: CM is not used at this point inside the planner. Turn CM into an 9248 // optional argument if we don't need it in the future. 9249 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9250 9251 // Get user vectorization factor. 9252 ElementCount UserVF = Hints.getWidth(); 9253 9254 // Plan how to best vectorize, return the best VF and its cost. 9255 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9256 9257 // If we are stress testing VPlan builds, do not attempt to generate vector 9258 // code. Masked vector code generation support will follow soon. 9259 // Also, do not attempt to vectorize if no vector code will be produced. 
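// Editor's note (illustrative sketch, not part of this pass): the rules in
// getScalarEpilogueLowering() above apply strictly in order: size constraints
// first, then the command-line option, then the loop hint, then the TTI
// preference. A simplified standalone model (the real enum has more states;
// all names here are invented):
//
//   #include <cassert>
//
//   enum class Epilogue { Allowed, NotAllowedOptSize, UsePredicate };
//
//   static Epilogue pickEpilogue(bool OptForSize, bool HaveOption,
//                                bool OptionPrefersPredicate, bool HintOn,
//                                bool TTIPrefersPredicate) {
//     if (OptForSize)
//       return Epilogue::NotAllowedOptSize;                // 1) -Os / PGSO
//     if (HaveOption)
//       return OptionPrefersPredicate ? Epilogue::UsePredicate
//                                     : Epilogue::Allowed; // 2) option
//     if (HintOn)
//       return Epilogue::UsePredicate;                     // 3) loop hint
//     if (TTIPrefersPredicate)
//       return Epilogue::UsePredicate;                     // 4) TTI hook
//     return Epilogue::Allowed;
//   }
//
//   int main() {
//     // Size optimization wins even if everything else asks for predication.
//     assert(pickEpilogue(true, true, true, true, true) ==
//            Epilogue::NotAllowedOptSize);
//     // An explicit option overrides both the hint and the TTI preference.
//     assert(pickEpilogue(false, true, false, true, true) ==
//            Epilogue::Allowed);
//     return 0;
//   }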
9260 if (VPlanBuildStressTest || EnableVPlanPredication || 9261 VectorizationFactor::Disabled() == VF) 9262 return false; 9263 9264 LVP.setBestPlan(VF.Width, 1); 9265 9266 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9267 &CM, BFI, PSI); 9268 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9269 << L->getHeader()->getParent()->getName() << "\"\n"); 9270 LVP.executePlan(LB, DT); 9271 9272 // Mark the loop as already vectorized to avoid vectorizing again. 9273 Hints.setAlreadyVectorized(); 9274 9275 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9276 return true; 9277 } 9278 9279 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9280 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9281 !EnableLoopInterleaving), 9282 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9283 !EnableLoopVectorization) {} 9284 9285 bool LoopVectorizePass::processLoop(Loop *L) { 9286 assert((EnableVPlanNativePath || L->isInnermost()) && 9287 "VPlan-native path is not enabled. Only process inner loops."); 9288 9289 #ifndef NDEBUG 9290 const std::string DebugLocStr = getDebugLocString(L); 9291 #endif /* NDEBUG */ 9292 9293 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9294 << L->getHeader()->getParent()->getName() << "\" from " 9295 << DebugLocStr << "\n"); 9296 9297 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9298 9299 LLVM_DEBUG( 9300 dbgs() << "LV: Loop hints:" 9301 << " force=" 9302 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9303 ? "disabled" 9304 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9305 ? "enabled" 9306 : "?")) 9307 << " width=" << Hints.getWidth() 9308 << " unroll=" << Hints.getInterleave() << "\n"); 9309 9310 // Function containing loop 9311 Function *F = L->getHeader()->getParent(); 9312 9313 // Looking at the diagnostic output is the only way to determine if a loop 9314 // was vectorized (other than looking at the IR or machine code), so it 9315 // is important to generate an optimization remark for each loop. Most of 9316 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9317 // generated as OptimizationRemark and OptimizationRemarkMissed are 9318 // less verbose reporting vectorized loops and unvectorized loops that may 9319 // benefit from vectorization, respectively. 9320 9321 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9322 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9323 return false; 9324 } 9325 9326 PredicatedScalarEvolution PSE(*SE, *L); 9327 9328 // Check if it is legal to vectorize the loop. 9329 LoopVectorizationRequirements Requirements(*ORE); 9330 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9331 &Requirements, &Hints, DB, AC, BFI, PSI); 9332 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9333 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9334 Hints.emitRemarkWithHints(); 9335 return false; 9336 } 9337 9338 // Check the function attributes and profiles to find out if this function 9339 // should be optimized for size. 9340 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9341 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9342 9343 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9344 // here. They may require CFG and instruction level transformations before 9345 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9346 // the incoming IR, we need to build VPlan upfront in the vectorization 9347 // pipeline. 9348 if (!L->isInnermost()) 9349 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9350 ORE, BFI, PSI, Hints); 9351 9352 assert(L->isInnermost() && "Inner loop expected."); 9353 9354 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9355 // count by optimizing for size, to minimize overheads. 9356 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9357 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9358 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9359 << "This loop is worth vectorizing only if no scalar " 9360 << "iteration overheads are incurred."); 9361 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9362 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9363 else { 9364 LLVM_DEBUG(dbgs() << "\n"); 9365 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9366 } 9367 } 9368 9369 // Check the function attributes to see if implicit floats are allowed. 9370 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9371 // an integer loop and the vector instructions selected are purely integer 9372 // vector instructions? 9373 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9374 reportVectorizationFailure( 9375 "Can't vectorize when the NoImplicitFloat attribute is used", 9376 "loop not vectorized due to NoImplicitFloat attribute", 9377 "NoImplicitFloat", ORE, L); 9378 Hints.emitRemarkWithHints(); 9379 return false; 9380 } 9381 9382 // Check if the target supports potentially unsafe FP vectorization. 9383 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9384 // for the target we're vectorizing for, to make sure none of the 9385 // additional fp-math flags can help. 9386 if (Hints.isPotentiallyUnsafe() && 9387 TTI->isFPVectorizationPotentiallyUnsafe()) { 9388 reportVectorizationFailure( 9389 "Potentially unsafe FP op prevents vectorization", 9390 "loop not vectorized due to unsafe FP support.", 9391 "UnsafeFP", ORE, L); 9392 Hints.emitRemarkWithHints(); 9393 return false; 9394 } 9395 9396 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9397 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9398 9399 // If an override option has been passed in for interleaved accesses, use it. 9400 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9401 UseInterleaved = EnableInterleavedMemAccesses; 9402 9403 // Analyze interleaved memory accesses. 9404 if (UseInterleaved) { 9405 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9406 } 9407 9408 // Use the cost model. 9409 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9410 F, &Hints, IAI); 9411 CM.collectValuesToIgnore(); 9412 9413 // Use the planner for vectorization. 9414 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9415 9416 // Get user vectorization factor and interleave count. 9417 ElementCount UserVF = Hints.getWidth(); 9418 unsigned UserIC = Hints.getInterleave(); 9419 9420 // Plan how to best vectorize, return the best VF and its cost. 9421 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9422 9423 VectorizationFactor VF = VectorizationFactor::Disabled(); 9424 unsigned IC = 1; 9425 9426 if (MaybeVF) { 9427 VF = *MaybeVF; 9428 // Select the interleave count. 
9429     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9430   }
9431
9432   // Identify the diagnostic messages that should be produced.
9433   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9434   bool VectorizeLoop = true, InterleaveLoop = true;
9435   if (Requirements.doesNotMeet(F, L, Hints)) {
9436     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9437                          "requirements.\n");
9438     Hints.emitRemarkWithHints();
9439     return false;
9440   }
9441
9442   if (VF.Width.isScalar()) {
9443     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9444     VecDiagMsg = std::make_pair(
9445         "VectorizationNotBeneficial",
9446         "the cost-model indicates that vectorization is not beneficial");
9447     VectorizeLoop = false;
9448   }
9449
9450   if (!MaybeVF && UserIC > 1) {
9451     // Tell the user interleaving was avoided up-front, despite being explicitly
9452     // requested.
9453     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9454                          "interleaving should be avoided up front\n");
9455     IntDiagMsg = std::make_pair(
9456         "InterleavingAvoided",
9457         "Ignoring UserIC, because interleaving was avoided up front");
9458     InterleaveLoop = false;
9459   } else if (IC == 1 && UserIC <= 1) {
9460     // Tell the user interleaving is not beneficial.
9461     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9462     IntDiagMsg = std::make_pair(
9463         "InterleavingNotBeneficial",
9464         "the cost-model indicates that interleaving is not beneficial");
9465     InterleaveLoop = false;
9466     if (UserIC == 1) {
9467       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9468       IntDiagMsg.second +=
9469           " and is explicitly disabled or interleave count is set to 1";
9470     }
9471   } else if (IC > 1 && UserIC == 1) {
9472     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9473     LLVM_DEBUG(
9474         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9475     IntDiagMsg = std::make_pair(
9476         "InterleavingBeneficialButDisabled",
9477         "the cost-model indicates that interleaving is beneficial "
9478         "but is explicitly disabled or interleave count is set to 1");
9479     InterleaveLoop = false;
9480   }
9481
9482   // Override IC if user provided an interleave count.
9483   IC = UserIC > 0 ? UserIC : IC;
9484
9485   // Emit diagnostic messages, if any.
9486   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9487   if (!VectorizeLoop && !InterleaveLoop) {
9488     // Do not vectorize or interleave the loop.
9489 ORE->emit([&]() { 9490 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9491 L->getStartLoc(), L->getHeader()) 9492 << VecDiagMsg.second; 9493 }); 9494 ORE->emit([&]() { 9495 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9496 L->getStartLoc(), L->getHeader()) 9497 << IntDiagMsg.second; 9498 }); 9499 return false; 9500 } else if (!VectorizeLoop && InterleaveLoop) { 9501 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9502 ORE->emit([&]() { 9503 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9504 L->getStartLoc(), L->getHeader()) 9505 << VecDiagMsg.second; 9506 }); 9507 } else if (VectorizeLoop && !InterleaveLoop) { 9508 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9509 << ") in " << DebugLocStr << '\n'); 9510 ORE->emit([&]() { 9511 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9512 L->getStartLoc(), L->getHeader()) 9513 << IntDiagMsg.second; 9514 }); 9515 } else if (VectorizeLoop && InterleaveLoop) { 9516 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9517 << ") in " << DebugLocStr << '\n'); 9518 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9519 } 9520 9521 LVP.setBestPlan(VF.Width, IC); 9522 9523 using namespace ore; 9524 bool DisableRuntimeUnroll = false; 9525 MDNode *OrigLoopID = L->getLoopID(); 9526 9527 if (!VectorizeLoop) { 9528 assert(IC > 1 && "interleave count should not be 1 or 0"); 9529 // If we decided that it is not legal to vectorize the loop, then 9530 // interleave it. 9531 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9532 BFI, PSI); 9533 LVP.executePlan(Unroller, DT); 9534 9535 ORE->emit([&]() { 9536 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9537 L->getHeader()) 9538 << "interleaved loop (interleaved count: " 9539 << NV("InterleaveCount", IC) << ")"; 9540 }); 9541 } else { 9542 // If we decided that it is *legal* to vectorize the loop, then do it. 9543 9544 // Consider vectorizing the epilogue too if it's profitable. 9545 VectorizationFactor EpilogueVF = 9546 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9547 if (EpilogueVF.Width.isVector()) { 9548 9549 // The first pass vectorizes the main loop and creates a scalar epilogue 9550 // to be vectorized by executing the plan (potentially with a different 9551 // factor) again shortly afterwards. 9552 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9553 EpilogueVF.Width.getKnownMinValue(), 1); 9554 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9555 &LVL, &CM, BFI, PSI); 9556 9557 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9558 LVP.executePlan(MainILV, DT); 9559 ++LoopsVectorized; 9560 9561 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9562 formLCSSARecursively(*L, *DT, LI, SE); 9563 9564 // Second pass vectorizes the epilogue and adjusts the control flow 9565 // edges from the first pass. 
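// Editor's note (illustrative sketch, not part of this pass): with epilogue
// vectorization the iteration space is consumed in up to three steps: the main
// vector loop at the wide VF*UF, the epilogue vector loop at a narrower VF,
// and a final scalar remainder. A standalone model of that split, with
// made-up numbers:
//
//   #include <cassert>
//
//   int main() {
//     unsigned TC = 1003;                   // original trip count
//     unsigned MainVFxUF = 16, EpiVFxUF = 4;
//
//     unsigned MainIters = (TC / MainVFxUF) * MainVFxUF;     // 992
//     unsigned Remaining = TC - MainIters;                   // 11
//     unsigned EpiIters = (Remaining / EpiVFxUF) * EpiVFxUF; // 8
//     unsigned ScalarIters = Remaining - EpiIters;           // 3
//
//     assert(MainIters + EpiIters + ScalarIters == TC);
//     // The min.epilog.iters.check emitted earlier compares Remaining against
//     // EpiVFxUF and bypasses the epilogue vector loop when too few iterations
//     // are left; here it does run.
//     assert(Remaining >= EpiVFxUF);
//     return 0;
//   }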
9566 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9567 EPI.MainLoopVF = EPI.EpilogueVF; 9568 EPI.MainLoopUF = EPI.EpilogueUF; 9569 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9570 ORE, EPI, &LVL, &CM, BFI, PSI); 9571 LVP.executePlan(EpilogILV, DT); 9572 ++LoopsEpilogueVectorized; 9573 9574 if (!MainILV.areSafetyChecksAdded()) 9575 DisableRuntimeUnroll = true; 9576 } else { 9577 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9578 &LVL, &CM, BFI, PSI); 9579 LVP.executePlan(LB, DT); 9580 ++LoopsVectorized; 9581 9582 // Add metadata to disable runtime unrolling a scalar loop when there are 9583 // no runtime checks about strides and memory. A scalar loop that is 9584 // rarely used is not worth unrolling. 9585 if (!LB.areSafetyChecksAdded()) 9586 DisableRuntimeUnroll = true; 9587 } 9588 9589 // Report the vectorization decision. 9590 ORE->emit([&]() { 9591 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9592 L->getHeader()) 9593 << "vectorized loop (vectorization width: " 9594 << NV("VectorizationFactor", VF.Width) 9595 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9596 }); 9597 } 9598 9599 Optional<MDNode *> RemainderLoopID = 9600 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9601 LLVMLoopVectorizeFollowupEpilogue}); 9602 if (RemainderLoopID.hasValue()) { 9603 L->setLoopID(RemainderLoopID.getValue()); 9604 } else { 9605 if (DisableRuntimeUnroll) 9606 AddRuntimeUnrollDisableMetaData(L); 9607 9608 // Mark the loop as already vectorized to avoid vectorizing again. 9609 Hints.setAlreadyVectorized(); 9610 } 9611 9612 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9613 return true; 9614 } 9615 9616 LoopVectorizeResult LoopVectorizePass::runImpl( 9617 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9618 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9619 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9620 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9621 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9622 SE = &SE_; 9623 LI = &LI_; 9624 TTI = &TTI_; 9625 DT = &DT_; 9626 BFI = &BFI_; 9627 TLI = TLI_; 9628 AA = &AA_; 9629 AC = &AC_; 9630 GetLAA = &GetLAA_; 9631 DB = &DB_; 9632 ORE = &ORE_; 9633 PSI = PSI_; 9634 9635 // Don't attempt if 9636 // 1. the target claims to have no vector registers, and 9637 // 2. interleaving won't help ILP. 9638 // 9639 // The second condition is necessary because, even if the target has no 9640 // vector registers, loop vectorization may still enable scalar 9641 // interleaving. 9642 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9643 TTI->getMaxInterleaveFactor(1) < 2) 9644 return LoopVectorizeResult(false, false); 9645 9646 bool Changed = false, CFGChanged = false; 9647 9648 // The vectorizer requires loops to be in simplified form. 9649 // Since simplification may add new inner loops, it has to run before the 9650 // legality and profitability checks. This means running the loop vectorizer 9651 // will simplify all loops, regardless of whether anything end up being 9652 // vectorized. 9653 for (auto &L : *LI) 9654 Changed |= CFGChanged |= 9655 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9656 9657 // Build up a worklist of inner-loops to vectorize. This is necessary as 9658 // the act of vectorizing or partially unrolling a loop creates new loops 9659 // and can invalidate iterators across the loops. 
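// Editor's note (illustrative sketch, not part of this pass): the loops are
// snapshotted into a worklist first because the transform itself creates new
// loops, which would invalidate iteration over LoopInfo. The same pattern in
// miniature over a plain container:
//
//   #include <cassert>
//   #include <vector>
//
//   int main() {
//     std::vector<int> Loops = {1, 2, 3};
//
//     // Snapshot first...
//     std::vector<int> Worklist(Loops.rbegin(), Loops.rend());
//
//     // ...then mutate the original container freely while processing.
//     while (!Worklist.empty()) {
//       int L = Worklist.back();
//       Worklist.pop_back();
//       Loops.push_back(L * 10); // "new loops" appear; iterators into Loops
//                                // taken before this point would now dangle.
//     }
//     assert(Loops.size() == 6);
//     return 0;
//   }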
9660 SmallVector<Loop *, 8> Worklist; 9661 9662 for (Loop *L : *LI) 9663 collectSupportedLoops(*L, LI, ORE, Worklist); 9664 9665 LoopsAnalyzed += Worklist.size(); 9666 9667 // Now walk the identified inner loops. 9668 while (!Worklist.empty()) { 9669 Loop *L = Worklist.pop_back_val(); 9670 9671 // For the inner loops we actually process, form LCSSA to simplify the 9672 // transform. 9673 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9674 9675 Changed |= CFGChanged |= processLoop(L); 9676 } 9677 9678 // Process each loop nest in the function. 9679 return LoopVectorizeResult(Changed, CFGChanged); 9680 } 9681 9682 PreservedAnalyses LoopVectorizePass::run(Function &F, 9683 FunctionAnalysisManager &AM) { 9684 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9685 auto &LI = AM.getResult<LoopAnalysis>(F); 9686 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9687 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9688 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9689 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9690 auto &AA = AM.getResult<AAManager>(F); 9691 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9692 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9693 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9694 MemorySSA *MSSA = EnableMSSALoopDependency 9695 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9696 : nullptr; 9697 9698 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9699 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9700 [&](Loop &L) -> const LoopAccessInfo & { 9701 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9702 TLI, TTI, nullptr, MSSA}; 9703 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9704 }; 9705 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9706 ProfileSummaryInfo *PSI = 9707 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9708 LoopVectorizeResult Result = 9709 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9710 if (!Result.MadeAnyChange) 9711 return PreservedAnalyses::all(); 9712 PreservedAnalyses PA; 9713 9714 // We currently do not preserve loopinfo/dominator analyses with outer loop 9715 // vectorization. Until this is addressed, mark these analyses as preserved 9716 // only for non-VPlan-native path. 9717 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9718 if (!EnableVPlanNativePath) { 9719 PA.preserve<LoopAnalysis>(); 9720 PA.preserve<DominatorTreeAnalysis>(); 9721 } 9722 PA.preserve<BasicAA>(); 9723 PA.preserve<GlobalsAA>(); 9724 if (!Result.MadeCFGChange) 9725 PA.preserveSet<CFGAnalyses>(); 9726 return PA; 9727 } 9728