Lines Matching defs:VF

183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185 "1 is specified, forces the given VF for all applicable epilogue "
189 "epilogue-vectorization-minimum-VF", cl::Hidden,
463 /// block to a specified vectorization factor (VF).
489 AC(AC), ORE(ORE), VF(VecWidth),
624 ElementCount VF;
655 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
885 /// Return a value for Step multiplied by VF.
886 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
889 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
892 /// Return the runtime value for VF.
893 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
894 return B.CreateElementCount(Ty, VF);
926 VectorizationFactor VF, unsigned IC) {
935 << ore::NV("VectorizationFactor", VF.Width)
1003 /// \return true if the UserVF is a feasible VF to be chosen.
1017 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1018 /// are the selected vectorization factor and the cost of the selected VF.
1019 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1028 void setCostBasedWideningDecision(ElementCount VF);
1032 /// This function analyzes all calls in the function at the supplied VF,
1035 void setVectorizedCallDecision(ElementCount VF);
1079 /// vectorization factor \p VF.
1080 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1081 assert(VF.isVector() &&
1082 "Profitable to scalarize relevant only for VF > 1.");
1087 auto Scalars = InstsToScalarize.find(VF);
1089 "VF not yet analyzed for scalarization profitability");
1094 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1104 if (VF.isScalar())
1107 auto UniformsPerVF = Uniforms.find(VF);
1109 "VF not yet analyzed for uniformity");
1114 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1118 if (VF.isScalar())
1121 auto ScalarsPerVF = Scalars.find(VF);
1123 "Scalar values are not calculated for VF");
1128 /// for vectorization factor \p VF.
1129 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1130 return VF.isVector() && MinBWs.contains(I) &&
1131 !isProfitableToScalarize(I, VF) &&
1132 !isScalarAfterVectorization(I, VF);
1148 /// instruction \p I and vector width \p VF.
1149 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1151 assert(VF.isVector() && "Expected VF >=2");
1152 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1156 /// interleaving group \p Grp and vector width \p VF.
1158 ElementCount VF, InstWidening W,
1160 assert(VF.isVector() && "Expected VF >=2");
1174 WideningDecisions[std::make_pair(I, VF)] =
1177 WideningDecisions[std::make_pair(I, VF)] =
1184 /// width \p VF. Return CM_Unknown if this instruction did not pass
1186 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1187 assert(VF.isVector() && "Expected VF to be a vector VF");
1192 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1200 /// width \p VF.
1201 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1202 assert(VF.isVector() && "Expected VF >=2");
1203 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1217 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1221 assert(!VF.isScalar() && "Expected vector VF");
1222 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1227 ElementCount VF) const {
1228 assert(!VF.isScalar() && "Expected vector VF");
1229 return CallWideningDecisions.at(std::make_pair(CI, VF));
1235 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1242 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1243 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1260 void collectInstsToScalarize(ElementCount VF);
1262 /// Collect Uniform and Scalar values for the given \p VF.
1266 /// at that VF -- scalarize, call a known vector routine, or call a
1268 void collectUniformsAndScalars(ElementCount VF) {
1270 if (VF.isScalar() || Uniforms.contains(VF))
1272 setCostBasedWideningDecision(VF);
1273 collectLoopUniforms(VF);
1274 setVectorizedCallDecision(VF);
1275 collectLoopScalars(VF);
1294 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1301 if (VF.isVector())
1302 Ty = VectorType::get(Ty, VF);
1308 /// variables found for the given VF.
1309 bool canVectorizeReductions(ElementCount VF) const {
1312 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1335 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1336 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1349 ElementCount VF) const;
1353 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1358 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1400 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1401 return requiresScalarEpilogue(VF.isVector());
1511 // second-to-last iteration might not be VF*UF.
1520 /// with factor VF. Return the cost of the instruction, including
1522 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1525 /// VF. Return the cost of the instruction, including scalarization overhead
1527 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1541 InstructionCost expectedCost(ElementCount VF);
1547 /// \p VF is the vectorization factor chosen for the original loop.
1548 /// \p Multiplier is an aditional scaling factor applied to VF before
1550 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1555 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1560 ElementCount VF,
1580 /// registers and the loop trip-count, but limited to a maximum safe VF.
1592 /// \return the maximum legal scalable VF, based on the safe max number
1597 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1600 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1603 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1606 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1610 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1616 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1621 ElementCount VF) const;
1625 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1645 /// iterations when the trip count is unknown or doesn't divide by the VF,
1668 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1672 /// The data is collected per VF.
1676 /// The data is collected per VF.
1698 ElementCount VF);
1706 /// scalarized instruction will be represented by VF scalar values in the
1709 void collectLoopUniforms(ElementCount VF);
1716 /// VF values in the vectorized loop, each corresponding to an iteration of
1718 void collectLoopScalars(ElementCount VF);
1734 bool needsExtract(Value *V, ElementCount VF) const {
1736 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1738 getWideningDecision(I, VF) == CM_Scalarize)
1747 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1752 ElementCount VF) const {
1754 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1797 /// Values to ignore in the cost model when VF > 1.
1863 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1900 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1902 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2316 /// For the given VF and UF and maximum trip count computed for the loop, return
2322 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2324 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2330 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2333 uint64_t MaxVF = VF.getKnownMinValue();
2334 if (VF.isScalable()) {
2427 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2435 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2438 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2439 "VF*UF must be a power of 2 when folding tail by masking");
2457 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2489 // Generate code to check if the loop's trip count is less than VF * UF, or
2494 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2501 // Create step with max(MinProTripCount, UF * VF).
2502 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2503 return createStepForVF(Builder, CountTy, VF, UF);
2507 if (!VF.isScalable())
2510 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2522 // TODO: Ensure step is at most the trip count when determining max VF and
2531 } else if (VF.isScalable() &&
2532 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2543 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2642 Cost->requiresScalarEpilogue(VF.isVector())) &&
2842 ElementCount VF) const {
2843 // We only need to calculate a cost if the VF is scalar; for actual vectors
2844 // we should already have a pre-calculated cost at each VF.
2845 if (!VF.isScalar())
2846 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2850 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2862 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2868 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2869 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2871 return VectorType::get(Elt, VF);
2876 ElementCount VF) const {
2879 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2889 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2944 VF.getKnownMinValue() * UF);
3042 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3043 // We should not collect Scalars more than once per VF. Right now, this
3045 // this check. Collecting Scalars for VF=1 does not make any sense.
3046 assert(VF.isVector() && !Scalars.contains(VF) &&
3047 "This function should not be visited twice for the same VF");
3052 if (VF.isScalable()) {
3053 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3070 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3120 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3144 auto ForcedScalar = ForcedScalars.find(VF);
3228 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3232 Instruction *I, ElementCount VF) const {
3242 if (VF.isScalar())
3244 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3251 if (VF.isVector())
3252 VTy = VectorType::get(Ty, VF);
3266 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3325 ElementCount VF) const {
3334 if (!VF.isScalable()) {
3344 ScalarizationCost += VF.getKnownMinValue() *
3348 ScalarizationCost += VF.getKnownMinValue() *
3353 ScalarizationCost += getScalarizationOverhead(I, VF);
3362 auto *VecTy = toVectorTy(I->getType(), VF);
3368 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3388 Instruction *I, ElementCount VF) const {
3390 assert(getWideningDecision(I, VF) == CM_Unknown &&
3406 if (VF.isScalable() && InterleaveFactor != 2)
3462 Instruction *I, ElementCount VF) {
3475 if (isScalarWithPredication(I, VF))
3487 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3488 // We should not collect Uniforms more than once per VF. Right now,
3490 // already does this check. Collecting Uniforms for VF=1 does not make any
3493 assert(VF.isVector() && !Uniforms.contains(VF) &&
3494 "This function should not be visited twice for the same VF");
3497 // analyze again. Uniforms.count(VF) will return 1.
3498 Uniforms[VF].clear();
3517 // where only a single instance out of VF should be formed.
3548 auto PrevVF = VF.divideCoefficientBy(2);
3553 // (smaller VF), it cannot be uniform for the larger VF.
3559 if (!Legal->isUniformMemOp(*I, VF))
3569 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3570 InstWidening WideningDecision = getWideningDecision(I, VF);
3589 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3717 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3851 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3853 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3862 // If `VF=vscale x N` is safe, then so is `VF=N`
3873 // is better to ignore the hint and let the compiler choose a suitable VF.
3875 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3876 << " is unsafe, clamping to max safe VF="
3891 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3904 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3933 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4040 // Avoid tail folding if the trip count is known to be a multiple of any VF
4074 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4091 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4173 // max VF that results in a dead vector loop.
4180 // point in choosing VF greater than TC (as done in the loop below). Select
4182 // scalable, we only fall back on a fixed VF when the TC is less than or
4213 // For each VF calculate its register usage.
4216 // Select the largest VF which doesn't require more registers than existing
4267 ElementCount VF) {
4268 unsigned EstimatedVF = VF.getKnownMinValue();
4269 if (VF.isScalable())
4272 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4309 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4315 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4316 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4321 return VectorCost * divideCeil(MaxTripCount, VF);
4322 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4341 for (ElementCount VF : Plan->vectorFactors()) {
4344 precomputeCosts(*Plan, VF, CostCtx);
4348 if (!R.cost(VF, CostCtx).isValid())
4349 InvalidCosts.emplace_back(&R, VF);
4366 // Sort the list, first on recipe(number) then on VF.
4376 // For a list of ordered recipe-VF pairs:
4413 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4418 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4449 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4451 assert(VF.isVector() && "Checking a scalar VF?");
4505 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4506 Type *VectorTy = toVectorTy(ScalarTy, VF);
4510 if (VF.isScalable()) {
4516 return NumLegalParts <= VF.getKnownMinValue();
4519 return NumLegalParts < VF.getKnownMinValue();
4554 "Expected Scalar VF to be a candidate");
4564 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4570 for (ElementCount VF : P->vectorFactors()) {
4571 // The cost for scalar VF=1 is already calculated, so ignore it.
4572 if (VF.isScalar())
4575 InstructionCost C = CM.expectedCost(VF);
4576 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4579 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4581 if (VF.isScalable())
4587 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4590 << "LV: Not considering vector loop of width " << VF
4617 ElementCount VF) const {
4650 const ElementCount VF, const unsigned IC) const {
4662 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4669 unsigned Multiplier = VF.isFixed() ? IC : 1;
4673 return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4724 // vectorizing the epilogue loop with VF=4.
4737 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4738 // vectors) or > the VF of the main loop (fixed vectors).
4780 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4861 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4900 // If we did not calculate the cost for VF (because the user selected the VF)
4901 // then we calculate the cost of VF here.
4903 LoopCost = expectedCost(VF);
4904 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4911 RegisterUsage R = calculateRegisterUsage({VF})[0];
4937 if (VF.isScalar()) {
4961 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4964 if (VF.isScalar()) {
4972 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4978 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4981 // 1) the aggressive IC is capped by the trip count divided by VF
4982 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5007 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5012 // IC to be capped by the trip count divided by VF * 2, such that the vector
5035 if (VF.isVector() && HasReductions) {
5045 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5049 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5055 << "LV: VF is " << VF << '\n');
5074 // and compares when VF=1 since it may just create more overhead than it's
5122 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5227 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5229 (VF.isScalable() &&
5232 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5253 // For each VF find the maximum usage of registers.
5273 // Skip ignored values for VF > 1.
5318 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5320 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5321 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5325 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5351 ElementCount VF) {
5367 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5370 // have occurred if we have a user-selected VF and are now computing the
5372 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5375 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5376 // not profitable to scalarize any instructions, the presence of VF in the
5378 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5380 PredicatedBBsAfterVectorization[VF].clear();
5389 if (isScalarWithPredication(&I, VF)) {
5394 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5396 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5397 !useEmulatedMaskMemRefHack(&I, VF) &&
5398 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5404 if (!CI || !CallWideningDecisions.contains({CI, VF}))
5406 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5407 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5411 PredicatedBBsAfterVectorization[VF].insert(BB);
5414 PredicatedBBsAfterVectorization[VF].insert(Pred);
5421 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5422 assert(!isUniformAfterVectorization(PredInst, VF) &&
5442 isScalarAfterVectorization(I, VF))
5447 if (isScalarWithPredication(I, VF))
5455 // marked uniform after vectorization, rather than VF identical values.
5462 if (isUniformAfterVectorization(J, VF))
5482 InstructionCost VectorCost = getInstructionCost(I, VF);
5489 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5493 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5495 cast<VectorType>(toVectorTy(I->getType(), VF)),
5496 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5499 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5512 else if (needsExtract(J, VF)) {
5514 cast<VectorType>(toVectorTy(J->getType(), VF)),
5515 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5532 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5535 // If the vector loop gets executed exactly once with the given VF, ignore the
5540 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5552 (VF.isVector() && VecValuesToIgnore.count(&I)))
5555 InstructionCost C = getInstructionCost(&I, VF);
5562 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5563 << VF << " For instruction: " << I << '\n');
5573 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5614 ElementCount VF) {
5615 assert(VF.isVector() &&
5617 if (VF.isScalable())
5625 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5635 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5640 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5646 Cost += getScalarizationOverhead(I, VF);
5656 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5658 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5662 if (useEmulatedMaskMemRefHack(I, VF))
5673 ElementCount VF) {
5675 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5702 ElementCount VF) {
5703 assert(Legal->isUniformMemOp(*I, VF));
5706 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5725 CostKind, VF.getKnownMinValue() - 1));
5730 ElementCount VF) {
5732 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5744 ElementCount VF) {
5750 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5754 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5784 ElementCount VF,
5788 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5966 ElementCount VF) {
5969 if (VF.isScalar()) {
5979 return getWideningCost(I, VF);
5984 ElementCount VF) const {
5988 if (VF.isScalable())
5991 if (VF.isScalar())
5995 Type *RetTy = toVectorTy(I->getType(), VF);
5999 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6018 for (auto *V : filterExtractingOperands(Ops, VF))
6019 Tys.push_back(maybeVectorizeType(V->getType(), VF));
6021 filterExtractingOperands(Ops, VF), Tys, CostKind);
6024 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6025 if (VF.isScalar())
6039 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6042 if (Legal->isUniformMemOp(I, VF)) {
6044 if (!VF.isScalable())
6067 isLegalGatherOrScatter(&I, VF) ?
6068 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6075 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6078 // Choose better solution for the current VF, Note that Invalid
6082 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6084 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6089 if (memoryInstructionCanBeWidened(&I, VF)) {
6090 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6097 setWideningDecision(&I, VF, Decision, Cost);
6109 if (getWideningDecision(&I, VF) != CM_Unknown)
6113 if (interleavedAccessCanBeWidened(&I, VF))
6114 InterleaveCost = getInterleaveGroupCost(&I, VF);
6118 isLegalGatherOrScatter(&I, VF)
6119 ? getGatherScatterCost(&I, VF) * NumAccesses
6123 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6125 // Choose better solution for the current VF,
6144 setWideningDecision(Group, VF, Decision, Cost);
6146 setWideningDecision(&I, VF, Decision, Cost);
6165 getWideningDecision(&I, VF) != CM_GatherScatter)
6187 InstWidening Decision = getWideningDecision(I, VF);
6191 I, VF, CM_Scalarize,
6192 (VF.getKnownMinValue() *
6199 Member, VF, CM_Scalarize,
6200 (VF.getKnownMinValue() *
6207 ForcedScalars[VF].insert(I);
6211 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6212 assert(!VF.isScalar() &&
6213 "Trying to set a vectorization decision for a scalar VF");
6215 auto ForcedScalar = ForcedScalars.find(VF);
6235 // there, execute VF scalar calls, and then gather the result into the
6242 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6244 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6248 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6250 isUniformAfterVectorization(CI, VF))) {
6251 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6259 Type *RetTy = toVectorTy(ScalarRetTy, VF);
6261 Tys.push_back(toVectorTy(ScalarTy, VF));
6266 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6267 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6278 // Search through any available variants for one we can use at this VF.
6280 // Must match requested VF.
6281 if (Info.Shape.VF != VF)
6351 VF),
6362 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6377 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6398 ElementCount VF) {
6401 if (isUniformAfterVectorization(I, VF))
6402 VF = ElementCount::getFixed(1);
6404 if (VF.isVector() && isProfitableToScalarize(I, VF))
6405 return InstsToScalarize[VF][I];
6408 auto ForcedScalar = ForcedScalars.find(VF);
6409 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6413 VF.getKnownMinValue();
6417 if (canTruncateToMinimalBitwidth(I, VF))
6422 ElementCount VF) -> bool {
6423 if (VF.isScalar())
6426 auto Scalarized = InstsToScalarize.find(VF);
6428 "VF not yet analyzed for scalarization profitability");
6438 if (isScalarAfterVectorization(I, VF)) {
6441 // because the VF is either 1, or any instructions that need scalarizing
6443 // it means we don't have to multiply the instruction cost by VF.
6448 HasSingleCopyAfterVectorization(I, VF));
6451 VectorTy = toVectorTy(RetTy, VF);
6453 if (VF.isVector() && VectorTy->isVectorTy() &&
6466 // In cases of scalarized and predicated instructions, there will be VF
6474 if (VF.isVector() && BI->isConditional() &&
6475 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6476 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6482 if (VF.isScalable())
6486 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6489 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6491 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6494 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6505 if (VF.isScalar())
6511 toVectorTy(Switch->getCondition()->getType(), VF),
6512 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6519 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6523 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6525 SmallVector<int> Mask(VF.getKnownMinValue());
6526 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6529 VF.getKnownMinValue() - 1);
6535 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6557 Instruction::Select, toVectorTy(ResultTy, VF),
6558 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6564 if (VF.isVector() && foldTailWithEVL() &&
6567 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6568 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6578 if (VF.isVector() && isPredicatedInst(I)) {
6579 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6588 if (Info && VF.isVector()) {
6599 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6601 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6633 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6685 CondTy = VectorType::get(CondTy, VF);
6698 if (canTruncateToMinimalBitwidth(I, VF)) {
6701 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6708 VectorTy = toVectorTy(ValTy, VF);
6716 ElementCount Width = VF;
6721 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6727 return getMemoryInstructionCost(I, VF);
6749 if (VF.isScalar() || !TheLoop->contains(I))
6752 switch (getWideningDecision(I, VF)) {
6791 if (isOptimizableIVTruncate(I, VF)) {
6798 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6803 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6807 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6809 if (canTruncateToMinimalBitwidth(I, VF)) {
6821 return getVectorCallCost(cast<CallInst>(I), VF);
6827 if (VF.isScalable())
7031 // This function will select a scalable VF if the target supports scalable
7033 // TODO: we could return a pair of values that specify the max VF and
7034 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7035 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7055 ElementCount VF = UserVF;
7064 VF = determineVPlanVF(TTI, CM);
7065 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7067 // Make sure we have a VF > 1 for stress testing.
7068 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7070 << "overriding computed VF.\n");
7071 VF = ElementCount::getFixed(4);
7075 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7086 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7087 "VF needs to be a power of two");
7089 << "VF " << VF << " to build VPlans.\n");
7090 buildVPlans(VF, VF);
7096 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7136 "UserVF ignored because it may be larger than the maximal safe VF",
7140 "VF needs to be a power of two");
7145 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7157 for (auto VF = ElementCount::getFixed(1);
7158 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7159 VFCandidates.push_back(VF);
7160 for (auto VF = ElementCount::getScalable(1);
7161 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7162 VFCandidates.push_back(VF);
7165 for (const auto &VF : VFCandidates) {
7166 // Collect Uniform and Scalar instructions after vectorization with VF.
7167 CM.collectUniformsAndScalars(VF);
7171 if (VF.isVector())
7172 CM.collectInstsToScalarize(VF);
7182 ElementCount VF) const {
7185 return CM.getInstructionCost(UI, VF);
7195 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7225 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7230 // If the vector loop gets executed exactly once with the given VF, ignore
7236 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7241 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7243 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7245 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7275 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7277 dbgs() << "Cost of " << CondICost << " for VF " << VF
7336 CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7343 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7355 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7360 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7367 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7368 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7371 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7373 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7378 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7379 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7383 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7393 ElementCount VF) const {
7396 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7399 Cost += Plan.cost(VF, CostCtx);
7401 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7402 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7470 // If there is a single VPlan with a single VF, return it directly.
7475 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7487 "More than a single plan/VF w/o any plan having scalar VF");
7498 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7504 for (ElementCount VF : P->vectorFactors()) {
7505 if (VF.isScalar())
7507 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7510 << "LV: Not considering vector loop of width " << VF
7515 InstructionCost Cost = cost(*P, VF);
7516 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7537 // different VF to be picked by the VPlan-based cost model.
7551 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7654 "Trying to execute plan with unsupported VF");
7796 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7849 << "Main Loop VF:" << EPI.MainLoopVF
7851 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7867 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7875 // Generate code to check if the loop's trip count is less than VF * UF of the
7878 : VF.isVector())
8014 // Generate code to check if the loop's trip count is less than VF * UF of the
8029 unsigned MainLoopStep = UF * VF.getKnownMinValue();
8059 << "Epilogue Loop VF:" << EPI.EpilogueVF
8263 auto WillWiden = [&](ElementCount VF) -> bool {
8265 CM.getWideningDecision(I, VF);
8270 if (CM.isScalarAfterVectorization(I, VF) ||
8271 CM.isProfitableToScalarize(I, VF))
8363 [&](ElementCount VF) {
8364 return CM.isScalarAfterVectorization(Phi, VF);
8383 return [=](ElementCount VF) -> bool {
8384 return CM.isOptimizableIVTruncate(K, VF);
8430 [this, CI](ElementCount VF) {
8431 return CM.isScalarWithPredication(CI, VF);
8450 [&](ElementCount VF) -> bool {
8451 return CM.getCallWideningDecision(CI, VF).Kind ==
8464 [&](ElementCount VF) -> bool {
8465 // The following case may be scalarized depending on the VF.
8469 // If we've found a variant at a previous VF, then stop looking. A
8474 // once we have an appropriate variant it's only valid for that VF.
8475 // This will force a different vplan to be generated for each VF that
8480 CM.getCallWideningDecision(CI, VF);
8497 // vector variant at this VF requires a mask, so we synthesize an
8521 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8522 return CM.isScalarAfterVectorization(I, VF) ||
8523 CM.isProfitableToScalarize(I, VF) ||
8524 CM.isScalarWithPredication(I, VF);
8634 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8794 [&](ElementCount VF) {
8797 VF, OpAExtend, OpBExtend,
8856 // All widen recipes below deal only with VF > 1.
8858 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8916 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8917 VFRange SubRange = {VF, MaxVFTimes2};
8933 VF = SubRange.End;
8951 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9239 [this](ElementCount VF) {
9240 return !CM.requiresScalarEpilogue(VF.isVector());
9252 for (ElementCount VF : Range)
9253 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9277 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9278 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9279 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9284 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9441 for (ElementCount VF : Range)
9442 Plan->addVF(VF);
9524 for (ElementCount VF : Range)
9525 Plan->addVF(VF);
9866 assert((State.VF.isScalar() || !isUniform()) &&
9868 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9871 if (State.VF.isVector() && shouldPack()) {
9874 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9876 VectorType::get(UI->getType(), State.VF));
9894 auto Lane = VPLane::getLastLaneForVF(State.VF);
9899 // Generate scalar instances for all VF lanes.
9900 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9901 const unsigned EndLane = State.VF.getKnownMinValue();
9991 // Plan how to best vectorize, return the best VF and its cost.
9992 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9997 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10000 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10007 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10008 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10011 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10014 reportVectorization(ORE, L, VF, 1);
10068 VectorizationFactor &VF, Loop *L,
10078 if (VF.Width.isScalar()) {
10088 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10089 uint64_t ScalarC = *VF.ScalarCost.getValue();
10102 // RtC + VecC * (TC / VF) + EpiC
10107 // * VF is the vectorization factor
10113 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10116 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10121 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10123 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10135 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10136 // epilogue is allowed, choose the next closest multiple of VF. This should
10141 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10145 << VF.MinProfitableTripCount << "\n");
10151 VF.MinProfitableTripCount)) {
10153 "trip count < minimum profitable VF ("
10154 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10534 VectorizationFactor VF = LVP.computeBestVF();
10544 if (LVP.hasPlanWithVF(VF.Width)) {
10546 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10551 if (VF.Width.isVector() || SelectedIC > 1)
10552 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10558 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10575 if (VF.Width.isScalar()) {
10583 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10617 // histogram intrinsics, which are only used for recipes with VF > 1.
10656 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10664 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10677 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10682 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10693 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10696 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10705 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10731 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10732 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10734 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10744 reportVectorization(ORE, L, VF, IC);