Lines Matching +full:test +full:- +full:part1

1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
33 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
36 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
39 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
42 static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
58 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
61 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
66 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
71 // These bitfields will only ever be set to something non-zero in operator=,
72 // when setting the -sve-tail-folding option. This option should always be of
84 // explicitly set the -sve-tail-folding option.
116 << "' to -sve-tail-folding=; the option should be of the form\n"
119 report_fatal_error("Unrecognised tail-folding option");
125 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 StringRef(Val).split(TailFoldTypes, '+', -1, false);
179 "sve-tail-folding",
181 "Control the use of vectorisation using tail-folding for SVE where the"
184 "tail-folding"
185 "\ndefault (Initial) Uses the default tail-folding settings for "
188 "tail-folding"
189 "\nsimple (Initial) Use tail-folding for simple loops (not "
191 "\nreductions Use tail-folding for loops containing reductions"
193 "\nrecurrences Use tail-folding for loops containing fixed order "
196 "\nreverse Use tail-folding for loops requiring reversed "
202 // code-generator is changed to use SVE instead of NEON for all fixed-width
205 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
207 // Experimental option that will only be fully functional when the cost-model
208 // and code-generator have been changed to avoid using scalable vector
211 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
215 return F && StringSwitch<bool>(F->getName())
230 // intrinsics could result in non-streaming ops (e.g. calls to
263 const TargetMachine &TM = getTLI()->getTargetMachine();
266 TM.getSubtargetImpl(*Caller)->getFeatureBits();
268 TM.getSubtargetImpl(*Callee)->getFeatureBits();
270 // Inline a callee if its target-features are a subset of the caller's
271 // target-features.
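A hedged sketch of the subset test described above, using std::bitset in place of LLVM's FeatureBitset (the 64-bit width and the function name are my assumptions): the callee can be inlined only when every target feature it requires is also enabled for the caller.

    #include <bitset>

    // Callee features must be a subset of caller features.
    bool calleeFeaturesAreSubset(const std::bitset<64> &CallerBits,
                                 const std::bitset<64> &CalleeBits) {
      return (CallerBits & CalleeBits) == CalleeBits;
    }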
282 // pointers to fixed-length vector types larger than 128 bits like
283 // <8 x float> (and pointers to aggregate types which have such fixed-length
286 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
287 // types can be safely treated as 128-bit NEON types and they cannot be
289 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
292 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
306 // call from F -> G (the call here is Call)
309 // a streaming-mode change is required (thus promoting the need to inline the
313 // call from F -> G (the call here is not Call)
315 // call from G -> H (the call here is Call)
318 // streaming-mode change, and the call to G from F would also require a
319 // streaming-mode change, then there is benefit to do the streaming-mode
337 ST->isNeonAvailable());
340 /// Calculate the cost of materializing a 64-bit value. This helper
360 assert(Ty->isIntegerTy());
362 unsigned BitSize = Ty->getPrimitiveSizeInBits();
366 // Sign-extend all constants to a multiple of 64-bit.
371 // Split the constant into 64-bit chunks and calculate the cost for each
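A minimal standalone sketch (not the LLVM implementation) of the chunking arithmetic these comments describe: the bit width is rounded up to a multiple of 64 and each 64-bit chunk is costed on its own. numChunks is my helper name; the real per-chunk cost comes from the target and is not modelled here.

    #include <cstdint>

    constexpr uint64_t numChunks(uint64_t BitSize) {
      return (BitSize + 63) / 64; // round up to whole 64-bit chunks
    }

    static_assert(numChunks(64) == 1, "an i64 constant is a single chunk");
    static_assert(numChunks(96) == 2, "a 96-bit constant splits into two chunks");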
387 assert(Ty->isIntegerTy());
389 unsigned BitSize = Ty->getPrimitiveSizeInBits();
455 assert(Ty->isIntegerTy());
457 unsigned BitSize = Ty->getPrimitiveSizeInBits();
522 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
523 (EltTy->getScalarSizeInBits() != 32 &&
524 EltTy->getScalarSizeInBits() != 64))
529 // FIXME: We should be able to generate histcnt for fixed-length vectors
532 if ((VTy->getElementCount().getKnownMinValue() != 2 &&
533 VTy->getElementCount().getKnownMinValue() != 4) ||
534 VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
535 !VTy->isScalableTy())
544 // The code-generator is currently not able to handle scalable vectors
546 // it. This change will be removed when code-generation for these types is
550 if (VTy->getElementCount() == ElementCount::getScalable(1))
555 if (!ST->hasSVE2())
585 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
604 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
612 // (LT.first - 1) vector adds.
614 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
617 Cost += AddCost * (LT.first - 1);
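A small worked sketch of the formula implemented above, with placeholder costs: a vector that legalises into NumParts pieces needs NumParts - 1 vector adds to combine the pieces before the final single-vector reduction. NumParts stands in for LT.first; AddCost and ReduceCost are placeholders.

    constexpr unsigned splitReductionCost(unsigned NumParts, unsigned AddCost,
                                          unsigned ReduceCost) {
      return ReduceCost + AddCost * (NumParts - 1);
    }

    static_assert(splitReductionCost(4, 1, 2) == 5, "4 parts -> 3 adds + reduce");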
624 // is 0, then this should be a no-op or simple operation; return a
630 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
633 LLVMContext &C = RetTy->getContext();
634 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
636 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
637 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
644 getTLI()->getTypeConversion(C, SubVecVT);
646 getTLI()->getTypeConversion(C, VecVT);
650 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
673 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
674 TLI->getValueType(DL, RetTy, true) == MVT::i16)
675 return LegalisationCost.first * Entry->Cost + 1;
677 return LegalisationCost.first * Entry->Cost;
682 if (!ST->hasNEON()) {
683 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
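For illustration only (not the backend's exact expansion), a NEON-free population count in the usual bit-manipulation style lands in the region of a dozen ALU instructions, which is what the cost above reflects.

    #include <cstdint>

    unsigned popcount64(uint64_t X) {
      X = X - ((X >> 1) & 0x5555555555555555ULL);                    // pairs
      X = (X & 0x3333333333333333ULL) + ((X >> 2) & 0x3333333333333333ULL); // nibbles
      X = (X + (X >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                    // bytes
      return unsigned((X * 0x0101010101010101ULL) >> 56);            // sum bytes
    }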
703 RetTy->getScalarSizeInBits()
706 return LT.first * Entry->Cost + ExtraCost;
742 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
746 return Entry->Cost;
755 EVT MTy = TLI->getValueType(DL, RetTy);
757 // output are the same, or we are using cvt f64->i32 or f32->i64.
766 if (ST->hasFullFP16() &&
775 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
778 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
815 return LegalisationCost.first * Entry->Cost;
819 if (!RetTy->isIntegerTy())
824 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
825 RetTy->getScalarSizeInBits() < 64) ||
826 (RetTy->getScalarSizeInBits() % 64 != 0);
828 if (RetTy->getScalarSizeInBits() == 32 ||
829 RetTy->getScalarSizeInBits() == 64)
841 EVT RetVT = getTLI()->getValueType(DL, RetTy);
842 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
843 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
844 !getTLI()->isTypeLegal(RetVT)) {
847 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
850 // be extremely high for fixed-width vectors.
854 return RetTy->getNumElements() * 2;
876 if (!PN->hasOneUse())
879 for (Value *IncValPhi : PN->incoming_values()) {
882 Reinterpret->getIntrinsicID() !=
884 RequiredType != Reinterpret->getArgOperand(0)->getType())
890 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
893 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
894 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
895 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
918 auto IntrinsicID = BinOp->getIntrinsicID();
932 auto BinOpPred = BinOp->getOperand(0);
933 auto BinOpOp1 = BinOp->getOperand(1);
934 auto BinOpOp2 = BinOp->getOperand(2);
938 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
941 auto PredOp = PredIntr->getOperand(0);
942 auto PredOpTy = cast<VectorType>(PredOp->getType());
971 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
984 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
985 if (CursorVTy->getElementCount().getKnownMinValue() <
986 IVTy->getElementCount().getKnownMinValue())
990 if (Cursor->getType() == IVTy)
996 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
998 IntrinsicCursor->getIntrinsicID() ==
1003 Cursor = IntrinsicCursor->getOperand(0);
1022 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1023 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1047 if (RetTy->isStructTy()) {
1049 auto VecT = StructT->getElementType(0);
1051 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1052 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1056 } else if (RetTy->isFPOrFPVectorTy())
1085 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1089 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1097 Insert->insertBefore(&II);
1098 Insert->takeName(&II);
1107 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1109 Splat->takeName(&II);
1119 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1123 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1130 if (!SplatValue || !SplatValue->isZero())
1136 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1140 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1143 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1144 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1149 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1152 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1155 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1159 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1161 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1164 unsigned NumElts = VecTy->getNumElements();
1167 // Expand intrinsic operands to a 16-bit byte level predicate
1169 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1172 if (!Arg->isZero())
1179 PFalse->takeName(&II);
1189 unsigned PredSize = Mask & -Mask;
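The expression above isolates the lowest set bit of Mask; a quick standalone check with example values of my own:

    static_assert((0b0110u & (0u - 0b0110u)) == 0b0010u,
                  "Mask & -Mask keeps only the lowest set bit");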
1208 ConvertFromSVBool->takeName(&II);
1219 // lastX(splat(X)) --> X
1224 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1229 auto OpC = OldBinOp->getOpcode();
1231 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1233 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1235 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1241 if (IsAfter && C && C->isNullValue()) {
1245 Extract->insertBefore(&II);
1246 Extract->takeName(&II);
1254 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1258 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1265 unsigned Idx = MinNumElts - 1;
1274 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1275 if (Idx >= PgVTy->getMinNumElements())
1281 Extract->insertBefore(&II);
1282 Extract->takeName(&II);
1289 // integer variant across a variety of micro-architectures. Replace scalar
1291 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1292 // depending on the micro-architecture, but has been observed as generally
1293 // being faster, particularly when the CLAST[AB] op is a loop-carried
1300 if (!Ty->isIntegerTy())
1304 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1320 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1323 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1339 RDFFR->takeName(&II);
1345 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1350 VScale->takeName(&II);
1373 Type *Tys[] = {PgVal->getType()};
1377 PTest->takeName(&II);
1388 Intrinsic::ID OpIID = Op->getIntrinsicID();
1390 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1392 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1393 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1394 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1398 PTest->takeName(&II);
1402 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1403 // Later optimizations may rewrite sequence to use the flag-setting variant
1418 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1419 Type *Tys[] = {Pg->getType()};
1422 PTest->takeName(&II);
1448 if (!Mul->hasOneUse())
1452 if (II.getType()->isFPOrFPVectorTy()) {
1456 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1486 Load->copyMetadata(II);
1491 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1493 MaskedLoad->copyMetadata(II);
1505 Store->copyMetadata(II);
1510 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1511 MaskedStore->copyMetadata(II);
1547 // Canonicalise operations that take an all active predicate (e.g. sve.add ->
1569 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1709 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1712 auto *SplatValue = IntrI->getOperand(2);
1718 OpMultiplicand->takeName(&II);
1723 auto *DupPg = DupInst->getOperand(1);
1727 OpMultiplicand->takeName(&II);
1742 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1743 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1746 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1748 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1749 NewVal->takeName(&II);
1765 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1772 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1774 VectorSplat->takeName(&II);
1785 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1786 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1793 auto *TyA = cast<ScalableVectorType>(A->getType());
1794 if (TyA == B->getType() &&
1799 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1800 ConcatVec->takeName(&II);
1810 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1811 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1842 BasePtr->getPointerAlignment(II.getDataLayout());
1845 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1850 MaskedLoad->takeName(&II);
1863 Type *Ty = Val->getType();
1872 BasePtr->getPointerAlignment(II.getDataLayout());
1874 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1898 APInt Divisor = SplatConstantInt->getValue();
1912 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1954 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1959 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1961 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1962 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1963 CurrentInsertElt = InsertElt->getOperand(0);
1972 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1986 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1987 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1988 IIScalableTy->getMinNumElements() /
2036 // * The ABS merge value is an undef or non-negative
2043 // Only valid when the shift amount is non-negative, otherwise the rounding
2308 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2318 if (ST->useSVEForFixedLengthVectors() &&
2319 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2321 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2322 else if (ST->isNeonAvailable())
2327 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2342 return VectorType::get(ArgTy->getScalarType(),
2343 cast<VectorType>(DstTy)->getElementCount());
2351 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2371 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2381 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2388 if (Args[0]->getType()->getScalarSizeInBits() -
2390 DstTy->getScalarSizeInBits() / 2)
2393 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2394 DstTy->getScalarSizeInBits() / 2));
2406 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2414 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2430 // %x = add ((zext i8 -> i16), 1)
2431 // %y = (zext i8 -> i16)
2432 // trunc i16 (lshr (add %x, %y), 1) -> i8
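A scalar sketch of the pattern shown above (my own illustration, not LLVM code): widen, add one, add the other operand, halve, narrow, i.e. an unsigned rounding halving add.

    #include <cstdint>

    uint8_t rhadd_u8(uint8_t A, uint8_t B) {
      uint16_t X = uint16_t(A) + 1; // %x = add ((zext i8 -> i16), 1)
      uint16_t Y = uint16_t(B);     // %y = (zext i8 -> i16)
      return uint8_t((X + Y) >> 1); // trunc i16 (lshr (add %x, %y), 1) -> i8
    }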
2437 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2438 (Src->isScalableTy() && !ST->hasSVE2()))
2441 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2447 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2448 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2451 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2452 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2455 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2456 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2457 Src->getScalarSizeInBits() !=
2458 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2470 Ex1->getOpcode() == Ex2->getOpcode())
2481 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2485 if (I && I->hasOneUser()) {
2486 auto *SingleUser = cast<Instruction>(*I->user_begin());
2487 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2488 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2492 if (SingleUser->getOpcode() == Instruction::Add) {
2493 if (I == SingleUser->getOperand(1) ||
2494 (isa<CastInst>(SingleUser->getOperand(1)) &&
2495 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2507 // TODO: Allow non-throughput costs that aren't binary.
2508 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2514 EVT SrcTy = TLI->getValueType(DL, Src);
2515 EVT DstTy = TLI->getValueType(DL, Dst);
2684 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2690 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2698 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2774 // Add cost for extending to illegal (too wide) scalable vectors.
2798 ST->useSVEForFixedLengthVectors(WiderTy)) {
2800 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2806 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2807 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2814 return AdjustCost(Entry->Cost);
2841 if (ST->hasFullFP16())
2844 return AdjustCost(Entry->Cost);
2848 ST->isSVEorStreamingSVEAvailable() &&
2849 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2851 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2858 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2859 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2863 return Part1 + Part2;
2870 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
2888 auto *Src = VecTy->getElementType();
2890 // Sign- and zero-extends are for integer types only.
2901 auto DstVT = TLI->getValueType(DL, Dst);
2902 auto SrcVT = TLI->getValueType(DL, Src);
2907 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2921 // For sign-extends, we only need a smov, which performs the extension
2926 // For zero-extends, the extend is performed automatically by a umov unless
2952 assert(Val->isVectorTy() && "This must be a vector type");
2954 if (Index != -1U) {
2962 // The type may be split. For fixed-width vectors we can normalize the
2970 // - For a physical (HasRealUse==true) insert-element or extract-element
2971 // instruction that extracts integers, an explicit FPR -> GPR move is
2972 // needed. So it has non-zero cost.
2973 // - For the rest of cases (virtual instruction or element type is float),
2975 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2978 // This is recognising a LD1 single-element structure to one lane of one
2982 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2983 return ST->getVectorInsertExtractBaseCost() + 1;
2987 if (Val->getScalarSizeInBits() == 1)
2988 return ST->getVectorInsertExtractBaseCost() + 1;
2991 // If the extract-element and insert-element instructions could be
2992 // simplified away (e.g., could be combined into users by looking at use-def
2994 // compile-time considerations.
2998 return ST->getVectorInsertExtractBaseCost();
3022 if (Ty->getElementType()->isFloatingPointTy())
3026 ST->getVectorInsertExtractBaseCost();
3035 // The code-generator is currently not able to handle scalable vectors
3037 // it. This change will be removed when code-generation for these types is
3040 if (VTy->getElementCount() == ElementCount::getScalable(1))
3050 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3058 // On AArch64, scalar signed divisions by power-of-two constants are
3077 auto VT = TLI->getValueType(DL, Ty);
3078 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3094 if (Ty->isVectorTy()) {
3095 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3099 ->getPrimitiveSizeInBits()
3101 EVT VT = TLI->getValueType(DL, Ty);
3112 return Entry->Cost;
3114 // For 8/16-bit elements, the cost is higher because the type
3130 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3131 return (4 + DivCost) * VTy->getNumElements();
3151 if (LT.second == MVT::v2i64 && ST->hasSVE())
3159 // - four 2-cost i64 extracts,
3160 // - two 2-cost i64 inserts, and
3161 // - two 1-cost muls.
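Summing the per-operation costs in the list above gives the scalarised cost of one legal v2i64 multiply (a worked check, not new information):

    static_assert(4 * 2 + 2 * 2 + 2 * 1 == 14,
                  "four extracts + two inserts + two scalar muls");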
3185 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3186 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3188 if (!Ty->getScalarType()->isFP128Ty())
3195 if (!Ty->getScalarType()->isFP128Ty())
3203 if (!Ty->isVectorTy())
3213 // Address computations in vectorized code with non-consecutive addresses will
3216 // extra micro-ops can significantly decrease throughput.
3220 if (Ty->isVectorTy() && SE &&
3239 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3248 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3267 (ST->hasFullFP16() &&
3287 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3288 EVT SelValTy = TLI->getValueType(DL, ValTy);
3293 return Entry->Cost;
3300 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3307 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3309 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3310 match(I->getOperand(1), m_Zero()) &&
3311 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3322 if (ST->requiresStrictAlign()) {
3328 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3332 // they could be used with no holds barred (-O3).
3339 return ST->hasSVE();
3355 if (VT->getElementType()->isIntegerTy(1))
3358 // The code-generator is currently not able to handle scalable vectors
3360 // it. This change will be removed when code-generation for these types is
3362 if (VT->getElementCount() == ElementCount::getScalable(1))
3385 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3386 VT->getElementType()->isIntegerTy(1))
3389 // The code-generator is currently not able to handle scalable vectors
3391 // it. This change will be removed when code-generation for these types is
3393 if (VT->getElementCount() == ElementCount::getScalable(1))
3398 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3402 // point we may want a per-CPU overhead.
3408 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3417 EVT VT = TLI->getValueType(DL, Ty, true);
3427 // The code-generator is currently not able to handle scalable vectors
3429 // it. This change will be removed when code-generation for these types is
3433 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3434 (VTy->getElementType()->isIntegerTy(1) &&
3435 !VTy->getElementCount().isKnownMultipleOf(
3446 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3449 // unaligned 128-bit stores because the negative impact that has shown in
3459 if (Ty->isPtrOrPtrVectorTy())
3464 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3469 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3482 // Check non-power-of-2 loads/stores for legal vector element types with
3483 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3484 // operations on smaller power-of-2 ops, including ld1/st1.
3485 LLVMContext &C = Ty->getContext();
3500 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
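A standalone sketch (not the LLVM code path) of breaking a non-power-of-2 element count into descending power-of-2 pieces, mirroring the "CurrNumElements - PrevPow2" step above; e.g. 7 elements become 4 + 2 + 1. The function name is mine.

    #include <vector>

    std::vector<unsigned> splitIntoPow2Pieces(unsigned NumElements) {
      std::vector<unsigned> Pieces;
      while (NumElements != 0) {
        unsigned Pow2 = 1;
        while (Pow2 * 2 <= NumElements)
          Pow2 *= 2;               // largest power of two that still fits
        Pieces.push_back(Pow2);    // 7 -> {4, 2, 1}
        NumElements -= Pow2;
      }
      return Pieces;
    }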
3515 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3520 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3523 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3524 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3526 VectorType::get(VecVTy->getElementType(),
3527 VecVTy->getElementCount().divideCoefficientBy(Factor));
3534 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3535 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3548 if (!I->isVectorTy())
3550 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3559 return ST->getMaxInterleaveFactor();
3573 // e.g. not counting loads in each side of an if-then-else diamond.
3574 for (const auto BB : L->blocks()) {
3580 Value *PtrValue = LMemI->getPointerOperand();
3581 if (L->isLoopInvariant(PtrValue))
3586 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3603 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3609 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3625 if (L->getLoopDepth() > 1)
3628 // Disable partial & runtime unrolling on -Os.
3631 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3638 for (auto *BB : L->getBlocks()) {
3641 if (I.getType()->isVectorTy())
3654 // Enable runtime unrolling for in-order models
3658 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3659 !ST->getSchedModel().isOutOfOrder()) {
3677 switch (Inst->getIntrinsicID()) {
3687 unsigned NumElts = Inst->arg_size() - 1;
3688 if (ST->getNumElements() != NumElts)
3691 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3697 Value *L = Inst->getArgOperand(i);
3705 if (Inst->getType() == ExpectedType)
3713 switch (Inst->getIntrinsicID()) {
3721 Info.PtrVal = Inst->getArgOperand(0);
3728 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3732 switch (Inst->getIntrinsicID()) {
3763 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3774 if (GEPInst->getNumOperands() > 2) {
3789 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3817 // The code-generator is currently not able to handle scalable vectors
3819 // it. This change will be removed when code-generation for these types is
3822 if (VTy->getElementCount() == ElementCount::getScalable(1))
3827 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3832 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3834 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3845 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3847 LegalizationCost *= LT.first - 1;
3850 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3869 // The code-generator is currently not able to handle scalable vectors
3871 // it. This change will be removed when code-generation for these types is
3874 if (VTy->getElementCount() == ElementCount::getScalable(1))
3883 return BaseCost + FixedVTy->getNumElements();
3891 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3892 Cost *= getMaxNumElements(VTy->getElementCount());
3901 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3909 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3910 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3911 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3946 return (LT.first - 1) + Entry->Cost;
3955 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3956 isPowerOf2_32(ValVTy->getNumElements())) {
3959 // Type needs to be split, so there is an extra cost of LT.first - 1
3961 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3964 ExtraCost *= LT.first - 1;
3967 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3992 // The code-generator is currently not able to handle scalable vectors
3994 // it. This change will be removed when code-generation for these types is
3996 if (Tp->getElementCount() == ElementCount::getScalable(1))
4000 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4003 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4005 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4027 LegalizationCost += Entry->Cost;
4040 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4044 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4053 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4056 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4058 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4060 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4065 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4067 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4072 // sub-vectors to ensure the result has at most 2 inputs.
4105 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4114 Cost += LTNumElts - 1;
4128 // In terms of code-size, the shuffle vector is free when a load + dup get
4137 isLegalBroadcastLoad(Tp->getElementType(),
4144 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4145 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4165 // Check for non-zero lane splats
4290 return LT.first * Entry->Cost;
4319 for (BasicBlock *BB : TheLoop->blocks()) {
4337 if (!ST->hasSVE())
4340 // We don't currently support vectorisation with interleaving for SVE - with
4341 // such loops we're better off not using tail-folding. This gives us a chance
4342 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4343 if (TFI->IAI->hasGroups())
4347 if (TFI->LVL->getReductionVars().size())
4349 if (TFI->LVL->getFixedOrderRecurrences().size())
4355 if (containsDecreasingPointers(TFI->LVL->getLoop(),
4356 TFI->LVL->getPredicatedScalarEvolution()))
4361 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
4365 // Don't tail-fold for tight loops where we would be better off interleaving
4368 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4369 NumInsns += BB->sizeWithoutDebug();
4382 // -------------------------------------------
4384 // -------------------------------------------
4393 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4397 return -1;
4403 // break point in the code - the end of a block with an unconditional
4405 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4406 isa<BranchInst>(I->getNextNode()) &&
4407 cast<BranchInst>(I->getNextNode())->isUnconditional())