LoopVectorize.cpp - OpenGrok cross reference for /llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines Matching defs:VF
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185              "1 is specified, forces the given VF for all applicable epilogue "
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
463 /// block to a specified vectorization factor (VF).
489         AC(AC), ORE(ORE), VF(VecWidth),
624   ElementCount VF;
655   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
885 /// Return a value for Step multiplied by VF.
886 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
889   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
892 /// Return the runtime value for VF.
893 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
894   return B.CreateElementCount(Ty, VF);
926                                 VectorizationFactor VF, unsigned IC) {
935            << ore::NV("VectorizationFactor", VF.Width)
1003   /// \return true if the UserVF is a feasible VF to be chosen.
1017   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1018   /// are the selected vectorization factor and the cost of the selected VF.
1019   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1028   void setCostBasedWideningDecision(ElementCount VF);
1032   /// This function analyzes all calls in the function at the supplied VF,
1035   void setVectorizedCallDecision(ElementCount VF);
1079   /// vectorization factor \p VF.
1080   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1081     assert(VF.isVector() &&
1082            "Profitable to scalarize relevant only for VF > 1.");
1087     auto Scalars = InstsToScalarize.find(VF);
1089            "VF not yet analyzed for scalarization profitability");
1094   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1104     if (VF.isScalar())
1107     auto UniformsPerVF = Uniforms.find(VF);
1109            "VF not yet analyzed for uniformity");
1114   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1118     if (VF.isScalar())
1121     auto ScalarsPerVF = Scalars.find(VF);
1123            "Scalar values are not calculated for VF");
1128   /// for vectorization factor \p VF.
1129   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1130     return VF.isVector() && MinBWs.contains(I) &&
1131            !isProfitableToScalarize(I, VF) &&
1132            !isScalarAfterVectorization(I, VF);
1148   /// instruction \p I and vector width \p VF.
1149   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1151     assert(VF.isVector() && "Expected VF >=2");
1152     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1156   /// interleaving group \p Grp and vector width \p VF.
1158                            ElementCount VF, InstWidening W,
1160     assert(VF.isVector() && "Expected VF >=2");
1174           WideningDecisions[std::make_pair(I, VF)] =
1177           WideningDecisions[std::make_pair(I, VF)] =
1184   /// width \p VF. Return CM_Unknown if this instruction did not pass
1186   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1187     assert(VF.isVector() && "Expected VF to be a vector VF");
1192     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1200   /// width \p VF.
1201   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1202     assert(VF.isVector() && "Expected VF >=2");
1203     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1217   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1221     assert(!VF.isScalar() && "Expected vector VF");
1222     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1227                                                ElementCount VF) const {
1228     assert(!VF.isScalar() && "Expected vector VF");
1229     return CallWideningDecisions.at(std::make_pair(CI, VF));
1235   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1242     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1243     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1260   void collectInstsToScalarize(ElementCount VF);
1262   /// Collect Uniform and Scalar values for the given \p VF.
1266   /// at that VF -- scalarize, call a known vector routine, or call a
1268   void collectUniformsAndScalars(ElementCount VF) {
1270     if (VF.isScalar() || Uniforms.contains(VF))
1272     setCostBasedWideningDecision(VF);
1273     collectLoopUniforms(VF);
1274     setVectorizedCallDecision(VF);
1275     collectLoopScalars(VF);
1294   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1301     if (VF.isVector())
1302       Ty = VectorType::get(Ty, VF);
1308   /// variables found for the given VF.
1309   bool canVectorizeReductions(ElementCount VF) const {
1312       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1335   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1336   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1349                            ElementCount VF) const;
1353   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1358   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1400     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1401       return requiresScalarEpilogue(VF.isVector());
1511     // second-to-last iteration might not be VF*UF.
1520   /// with factor VF.  Return the cost of the instruction, including
1522   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1525   /// VF. Return the cost of the instruction, including scalarization overhead
1527   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1541   InstructionCost expectedCost(ElementCount VF);
1547   /// \p VF is the vectorization factor chosen for the original loop.
1548   /// \p Multiplier is an aditional scaling factor applied to VF before
1550   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1555   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1560                                                          ElementCount VF,
1580   /// registers and the loop trip-count, but limited to a maximum safe VF.
1592   /// \return the maximum legal scalable VF, based on the safe max number
1597   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1600   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1603   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1606   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1610   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1616   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1621                                            ElementCount VF) const;
1625   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1645   /// iterations when the trip count is unknown or doesn't divide by the VF,
1668   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1672   /// The data is collected per VF.
1676   /// The data is collected per VF.
1698                                           ElementCount VF);
1706   /// scalarized instruction will be represented by VF scalar values in the
1709   void collectLoopUniforms(ElementCount VF);
1716   /// VF values in the vectorized loop, each corresponding to an iteration of
1718   void collectLoopScalars(ElementCount VF);
1734   bool needsExtract(Value *V, ElementCount VF) const {
1736     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1738         getWideningDecision(I, VF) == CM_Scalarize)
1747     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1752                                                    ElementCount VF) const {
1754         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1797   /// Values to ignore in the cost model when VF > 1.
1863               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1900             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1902                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2316 /// For the given VF and UF and maximum trip count computed for the loop, return
2322     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2324   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2330   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2333     uint64_t MaxVF = VF.getKnownMinValue();
2334     if (VF.isScalable()) {
2427   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2435   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2438     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2439            "VF*UF must be a power of 2 when folding tail by masking");
2457   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2489   // Generate code to check if the loop's trip count is less than VF * UF, or
2494   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2501     // Create step with max(MinProTripCount, UF * VF).
2502     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2503       return createStepForVF(Builder, CountTy, VF, UF);
2507     if (!VF.isScalable())
2510         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2522       // TODO: Ensure step is at most the trip count when determining max VF and
2531   } else if (VF.isScalable() &&
2532              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2543     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2642           Cost->requiresScalarEpilogue(VF.isVector())) &&
2842                                               ElementCount VF) const {
2843   // We only need to calculate a cost if the VF is scalar; for actual vectors
2844   // we should already have a pre-calculated cost at each VF.
2845   if (!VF.isScalar())
2846     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2850     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2862     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2868 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2869   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2871   return VectorType::get(Elt, VF);
2876                                                    ElementCount VF) const {
2879   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2889                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2944                                VF.getKnownMinValue() * UF);
3042 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3043   // We should not collect Scalars more than once per VF. Right now, this
3045   // this check. Collecting Scalars for VF=1 does not make any sense.
3046   assert(VF.isVector() && !Scalars.contains(VF) &&
3047          "This function should not be visited twice for the same VF");
3052   if (VF.isScalable()) {
3053     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3070     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3120   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3144   auto ForcedScalar = ForcedScalars.find(VF);
3228   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3232     Instruction *I, ElementCount VF) const {
3242     if (VF.isScalar())
3244     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3251     if (VF.isVector())
3252       VTy = VectorType::get(Ty, VF);
3266     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3325                                                     ElementCount VF) const {
3334   if (!VF.isScalable()) {
3344     ScalarizationCost += VF.getKnownMinValue() *
3348     ScalarizationCost += VF.getKnownMinValue() *
3353     ScalarizationCost += getScalarizationOverhead(I, VF);
3362   auto *VecTy = toVectorTy(I->getType(), VF);
3368                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3388     Instruction *I, ElementCount VF) const {
3390   assert(getWideningDecision(I, VF) == CM_Unknown &&
3406   if (VF.isScalable() && InterleaveFactor != 2)
3462     Instruction *I, ElementCount VF) {
3475   if (isScalarWithPredication(I, VF))
3487 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3488   // We should not collect Uniforms more than once per VF. Right now,
3490   // already does this check. Collecting Uniforms for VF=1 does not make any
3493   assert(VF.isVector() && !Uniforms.contains(VF) &&
3494          "This function should not be visited twice for the same VF");
3497   // analyze again.  Uniforms.count(VF) will return 1.
3498   Uniforms[VF].clear();
3517   // where only a single instance out of VF should be formed.
3548   auto PrevVF = VF.divideCoefficientBy(2);
3553     // (smaller VF), it cannot be uniform for the larger VF.
3559     if (!Legal->isUniformMemOp(*I, VF))
3569   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3570     InstWidening WideningDecision = getWideningDecision(I, VF);
3589            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3717   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3851   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3853   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3862       // If `VF=vscale x N` is safe, then so is `VF=N`
3873     // is better to ignore the hint and let the compiler choose a suitable VF.
3875       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3876                         << " is unsafe, clamping to max safe VF="
3891       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3904       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3933       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4040   // Avoid tail folding if the trip count is known to be a multiple of any VF
4074       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4091       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4173   // max VF that results in a dead vector loop.
4180     // point in choosing VF greater than TC (as done in the loop below). Select
4182     // scalable, we only fall back on a fixed VF when the TC is less than or
4213     // For each VF calculate its register usage.
4216     // Select the largest VF which doesn't require more registers than existing
4267                                       ElementCount VF) {
4268   unsigned EstimatedVF = VF.getKnownMinValue();
4269   if (VF.isScalable())
4272   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4309   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4315     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4316     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4321       return VectorCost * divideCeil(MaxTripCount, VF);
4322     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4341     for (ElementCount VF : Plan->vectorFactors()) {
4344       precomputeCosts(*Plan, VF, CostCtx);
4348           if (!R.cost(VF, CostCtx).isValid())
4349             InvalidCosts.emplace_back(&R, VF);
4366   // Sort the list, first on recipe(number) then on VF.
4376   // For a list of ordered recipe-VF pairs:
4413     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4418       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4449 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4451   assert(VF.isVector() && "Checking a scalar VF?");
4505       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4506         Type *VectorTy = toVectorTy(ScalarTy, VF);
4510         if (VF.isScalable()) {
4516           return NumLegalParts <= VF.getKnownMinValue();
4519         return NumLegalParts < VF.getKnownMinValue();
4554          "Expected Scalar VF to be a candidate");
4564     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4570     for (ElementCount VF : P->vectorFactors()) {
4571       // The cost for scalar VF=1 is already calculated, so ignore it.
4572       if (VF.isScalar())
4575       InstructionCost C = CM.expectedCost(VF);
4576       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4579       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4581       if (VF.isScalable())
4587       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4590             << "LV: Not considering vector loop of width " << VF
4617     ElementCount VF) const {
4650     const ElementCount VF, const unsigned IC) const {
4662   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4669   unsigned Multiplier = VF.isFixed() ? IC : 1;
4673   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4724   // vectorizing the epilogue loop with VF=4.
4737     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4738     // vectors) or > the VF of the main loop (fixed vectors).
4780     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4861 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4900   // If we did not calculate the cost for VF (because the user selected the VF)
4901   // then we calculate the cost of VF here.
4903     LoopCost = expectedCost(VF);
4904     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4911   RegisterUsage R = calculateRegisterUsage({VF})[0];
4937     if (VF.isScalar()) {
4961   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4964   if (VF.isScalar()) {
4972   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4978         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4981     // 1) the aggressive IC is capped by the trip count divided by VF
4982     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5007     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5012     // IC to be capped by the trip count divided by VF * 2, such that the vector
5035   if (VF.isVector() && HasReductions) {
5045       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5049       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5055                     << "LV: VF is " << VF << '\n');
5074     // and compares when VF=1 since it may just create more overhead than it's
5122     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5227   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5229         (VF.isScalable() &&
5232     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5253     // For each VF find the maximum usage of registers.
5273           // Skip ignored values for VF > 1.
5318       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5320           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5321       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5325       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5351                                                            ElementCount VF) {
5367 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5370   // have occurred if we have a user-selected VF and are now computing the
5372   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5375   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5376   // not profitable to scalarize any instructions, the presence of VF in the
5378   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5380   PredicatedBBsAfterVectorization[VF].clear();
5389       if (isScalarWithPredication(&I, VF)) {
5394         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5396         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5397             !useEmulatedMaskMemRefHack(&I, VF) &&
5398             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5404             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5406             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5407             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5411         PredicatedBBsAfterVectorization[VF].insert(BB);
5414             PredicatedBBsAfterVectorization[VF].insert(Pred);
5421     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5422   assert(!isUniformAfterVectorization(PredInst, VF) &&
5442         isScalarAfterVectorization(I, VF))
5447     if (isScalarWithPredication(I, VF))
5455     // marked uniform after vectorization, rather than VF identical values.
5462         if (isUniformAfterVectorization(J, VF))
5482     InstructionCost VectorCost = getInstructionCost(I, VF);
5489         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5493     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5495           cast<VectorType>(toVectorTy(I->getType(), VF)),
5496           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5499           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5512         else if (needsExtract(J, VF)) {
5514               cast<VectorType>(toVectorTy(J->getType(), VF)),
5515               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5532 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5535   // If the vector loop gets executed exactly once with the given VF, ignore the
5540   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5552           (VF.isVector() && VecValuesToIgnore.count(&I)))
5555       InstructionCost C = getInstructionCost(&I, VF);
5562       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5563                         << VF << " For instruction: " << I << '\n');
5573     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5614                                                         ElementCount VF) {
5615   assert(VF.isVector() &&
5617   if (VF.isScalable())
5625   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5635       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5640   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5646   Cost += getScalarizationOverhead(I, VF);
5656         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5658         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5662     if (useEmulatedMaskMemRefHack(I, VF))
5673                                                     ElementCount VF) {
5675   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5702                                                 ElementCount VF) {
5703   assert(Legal->isUniformMemOp(*I, VF));
5706   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5725                                        CostKind, VF.getKnownMinValue() - 1));
5730                                                  ElementCount VF) {
5732   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5744                                                    ElementCount VF) {
5750   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5754   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5784                                                     ElementCount VF,
5788   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5966                                                      ElementCount VF) {
5969   if (VF.isScalar()) {
5979   return getWideningCost(I, VF);
5984                                                      ElementCount VF) const {
5988   if (VF.isScalable())
5991   if (VF.isScalar())
5995   Type *RetTy = toVectorTy(I->getType(), VF);
5999         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6018   for (auto *V : filterExtractingOperands(Ops, VF))
6019     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6021                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6024 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6025   if (VF.isScalar())
6039       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6042       if (Legal->isUniformMemOp(I, VF)) {
6044           if (!VF.isScalable())
6067           isLegalGatherOrScatter(&I, VF) ?
6068           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6075             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6078         // Choose better solution for the current VF,  Note that Invalid
6082           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6084           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6089       if (memoryInstructionCanBeWidened(&I, VF)) {
6090         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6097         setWideningDecision(&I, VF, Decision, Cost);
6109         if (getWideningDecision(&I, VF) != CM_Unknown)
6113         if (interleavedAccessCanBeWidened(&I, VF))
6114           InterleaveCost = getInterleaveGroupCost(&I, VF);
6118           isLegalGatherOrScatter(&I, VF)
6119               ? getGatherScatterCost(&I, VF) * NumAccesses
6123           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6125       // Choose better solution for the current VF,
6144         setWideningDecision(Group, VF, Decision, Cost);
6146         setWideningDecision(&I, VF, Decision, Cost);
6165           getWideningDecision(&I, VF) != CM_GatherScatter)
6187       InstWidening Decision = getWideningDecision(I, VF);
6191             I, VF, CM_Scalarize,
6192             (VF.getKnownMinValue() *
6199                 Member, VF, CM_Scalarize,
6200                 (VF.getKnownMinValue() *
6207       ForcedScalars[VF].insert(I);
6211 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6212   assert(!VF.isScalar() &&
6213          "Trying to set a vectorization decision for a scalar VF");
6215   auto ForcedScalar = ForcedScalars.find(VF);
6235       // there, execute VF scalar calls, and then gather the result into the
6242       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6244       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6248       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6250                             isUniformAfterVectorization(CI, VF))) {
6251         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6259       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6261         Tys.push_back(toVectorTy(ScalarTy, VF));
6266         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6267           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6278       // Search through any available variants for one we can use at this VF.
6280         // Must match requested VF.
6281         if (Info.Shape.VF != VF)
6351                             VF),
6362         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6377       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6398                                                ElementCount VF) {
6401   if (isUniformAfterVectorization(I, VF))
6402     VF = ElementCount::getFixed(1);
6404   if (VF.isVector() && isProfitableToScalarize(I, VF))
6405     return InstsToScalarize[VF][I];
6408   auto ForcedScalar = ForcedScalars.find(VF);
6409   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6413              VF.getKnownMinValue();
6417   if (canTruncateToMinimalBitwidth(I, VF))
6422                                                 ElementCount VF) -> bool {
6423     if (VF.isScalar())
6426     auto Scalarized = InstsToScalarize.find(VF);
6428            "VF not yet analyzed for scalarization profitability");
6438   if (isScalarAfterVectorization(I, VF)) {
6441     // because the VF is either 1, or any instructions that need scalarizing
6443     // it means we don't have to multiply the instruction cost by VF.
6448            HasSingleCopyAfterVectorization(I, VF));
6451     VectorTy = toVectorTy(RetTy, VF);
6453   if (VF.isVector() && VectorTy->isVectorTy() &&
6466     // In cases of scalarized and predicated instructions, there will be VF
6474     if (VF.isVector() && BI->isConditional() &&
6475         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6476          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6482       if (VF.isScalable())
6486           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6489               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6491           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6494     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6505     if (VF.isScalar())
6511                toVectorTy(Switch->getCondition()->getType(), VF),
6512                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6519     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6523       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6525       SmallVector<int> Mask(VF.getKnownMinValue());
6526       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6529                                 VF.getKnownMinValue() - 1);
6535     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6557                  Instruction::Select, toVectorTy(ResultTy, VF),
6558                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6564     if (VF.isVector() && foldTailWithEVL() &&
6567           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6568           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6578     if (VF.isVector() && isPredicatedInst(I)) {
6579       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6588     if (Info && VF.isVector()) {
6599       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6601       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6633     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6685       CondTy = VectorType::get(CondTy, VF);
6698     if (canTruncateToMinimalBitwidth(I, VF)) {
6701       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6708     VectorTy = toVectorTy(ValTy, VF);
6716     ElementCount Width = VF;
6721       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6727     return getMemoryInstructionCost(I, VF);
6749       if (VF.isScalar() || !TheLoop->contains(I))
6752       switch (getWideningDecision(I, VF)) {
6791     if (isOptimizableIVTruncate(I, VF)) {
6798     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6803     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6807         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6809     if (canTruncateToMinimalBitwidth(I, VF)) {
6821     return getVectorCallCost(cast<CallInst>(I), VF);
6827     if (VF.isScalable())
7031 // This function will select a scalable VF if the target supports scalable
7033 // TODO: we could return a pair of values that specify the max VF and
7034 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7035 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7055   ElementCount VF = UserVF;
7064       VF = determineVPlanVF(TTI, CM);
7065       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7067       // Make sure we have a VF > 1 for stress testing.
7068       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7070                           << "overriding computed VF.\n");
7071         VF = ElementCount::getFixed(4);
7075       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7086     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7087            "VF needs to be a power of two");
7089                       << "VF " << VF << " to build VPlans.\n");
7090     buildVPlans(VF, VF);
7096     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7136           "UserVF ignored because it may be larger than the maximal safe VF",
7140              "VF needs to be a power of two");
7145         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7157   for (auto VF = ElementCount::getFixed(1);
7158        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7159     VFCandidates.push_back(VF);
7160   for (auto VF = ElementCount::getScalable(1);
7161        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7162     VFCandidates.push_back(VF);
7165   for (const auto &VF : VFCandidates) {
7166     // Collect Uniform and Scalar instructions after vectorization with VF.
7167     CM.collectUniformsAndScalars(VF);
7171     if (VF.isVector())
7172       CM.collectInstsToScalarize(VF);
7182                                              ElementCount VF) const {
7185   return CM.getInstructionCost(UI, VF);
7195 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7225       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7230     // If the vector loop gets executed exactly once with the given VF, ignore
7236     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7241       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7243       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7245         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7275     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7277       dbgs() << "Cost of " << CondICost << " for VF " << VF
7336           CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7343       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7355     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7360     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7367   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7368     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7371     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7373       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7378   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7379     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7383       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7393                                                ElementCount VF) const {
7396   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7399   Cost += Plan.cost(VF, CostCtx);
7401   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7402   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7470   // If there is a single VPlan with a single VF, return it directly.
7475   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7487          "More than a single plan/VF w/o any plan having scalar VF");
7498     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7504     for (ElementCount VF : P->vectorFactors()) {
7505       if (VF.isScalar())
7507       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7510             << "LV: Not considering vector loop of width " << VF
7515       InstructionCost Cost = cost(*P, VF);
7516       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7537   // different VF to be picked by the VPlan-based cost model.
7551   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7654          "Trying to execute plan with unsupported VF");
7796       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7849            << "Main Loop VF:" << EPI.MainLoopVF
7851            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7867   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7875   // Generate code to check if the loop's trip count is less than VF * UF of the
7878                                                     : VF.isVector())
8014   // Generate code to check if the loop's trip count is less than VF * UF of the
8029     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8059            << "Epilogue Loop VF:" << EPI.EpilogueVF
8263   auto WillWiden = [&](ElementCount VF) -> bool {
8265         CM.getWideningDecision(I, VF);
8270     if (CM.isScalarAfterVectorization(I, VF) ||
8271         CM.isProfitableToScalarize(I, VF))
8363             [&](ElementCount VF) {
8364               return CM.isScalarAfterVectorization(Phi, VF);
8383     return [=](ElementCount VF) -> bool {
8384       return CM.isOptimizableIVTruncate(K, VF);
8430       [this, CI](ElementCount VF) {
8431         return CM.isScalarWithPredication(CI, VF);
8450                 [&](ElementCount VF) -> bool {
8451                   return CM.getCallWideningDecision(CI, VF).Kind ==
8464       [&](ElementCount VF) -> bool {
8465         // The following case may be scalarized depending on the VF.
8469         // If we've found a variant at a previous VF, then stop looking. A
8474         // once we have an appropriate variant it's only valid for that VF.
8475         // This will force a different vplan to be generated for each VF that
8480             CM.getCallWideningDecision(CI, VF);
8497       //      vector variant at this VF requires a mask, so we synthesize an
8521   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8522     return CM.isScalarAfterVectorization(I, VF) ||
8523            CM.isProfitableToScalarize(I, VF) ||
8524            CM.isScalarWithPredication(I, VF);
8634       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8794           [&](ElementCount VF) {
8797                 VF, OpAExtend, OpBExtend,
8856   // All widen recipes below deal only with VF > 1.
8858           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8916   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8917     VFRange SubRange = {VF, MaxVFTimes2};
8933     VF = SubRange.End;
8951   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9239           [this](ElementCount VF) {
9240             return !CM.requiresScalarEpilogue(VF.isVector());
9252   for (ElementCount VF : Range)
9253     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9277     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9278       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9279                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9284       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9441   for (ElementCount VF : Range)
9442     Plan->addVF(VF);
9524   for (ElementCount VF : Range)
9525     Plan->addVF(VF);
9866     assert((State.VF.isScalar() || !isUniform()) &&
9868     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9871     if (State.VF.isVector() && shouldPack()) {
9874         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9876             VectorType::get(UI->getType(), State.VF));
9894     auto Lane = VPLane::getLastLaneForVF(State.VF);
9899   // Generate scalar instances for all VF lanes.
9900   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9901   const unsigned EndLane = State.VF.getKnownMinValue();
9991   // Plan how to best vectorize, return the best VF and its cost.
9992   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9997   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10000   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10007     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10008                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10011     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10014   reportVectorization(ORE, L, VF, 1);
10068                                        VectorizationFactor &VF, Loop *L,
10078   if (VF.Width.isScalar()) {
10088   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10089   uint64_t ScalarC = *VF.ScalarCost.getValue();
10102   //    RtC + VecC * (TC / VF) + EpiC
10107   //  * VF is the vectorization factor
10113   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10116   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10121   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10123   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10135   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10136   // epilogue is allowed, choose the next closest multiple of VF. This should
10141   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10145              << VF.MinProfitableTripCount << "\n");
10151                                 VF.MinProfitableTripCount)) {
10153                            "trip count < minimum profitable VF ("
10154                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10534   VectorizationFactor VF = LVP.computeBestVF();
10544   if (LVP.hasPlanWithVF(VF.Width)) {
10546     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10551     if (VF.Width.isVector() || SelectedIC > 1)
10552       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10558         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10575   if (VF.Width.isScalar()) {
10583   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10617   // histogram intrinsics, which are only used for recipes with VF > 1.
10656     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10664     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10677       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10682       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10693       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10696           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10705         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10731         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10732                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10734         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10744       reportVectorization(ORE, L, VF, IC);