Lines Matching +full:test +full:- +full:part1

1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
33 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
36 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
39 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
42 static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
58 static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
61 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
66 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
71 // These bitfields will only ever be set to something non-zero in operator=,
72 // when setting the -sve-tail-folding option. This option should always be of
84 // explicitly set the -sve-tail-folding option.
116 << "' to -sve-tail-folding=; the option should be of the form\n"
119 report_fatal_error("Unrecognised tail-folding option");
125 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 StringRef(Val).split(TailFoldTypes, '+', -1, false);
179 "sve-tail-folding",
181 "Control the use of vectorisation using tail-folding for SVE where the"
184 "tail-folding"
185 "\ndefault (Initial) Uses the default tail-folding settings for "
188 "tail-folding"
189 "\nsimple (Initial) Use tail-folding for simple loops (not "
191 "\nreductions Use tail-folding for loops containing reductions"
193 "\nrecurrences Use tail-folding for loops containing fixed order "
196 "\nreverse Use tail-folding for loops requiring reversed "
202 // code-generator is changed to use SVE instead of NEON for all fixed-width
205 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
207 // Experimental option that will only be fully functional when the cost-model
208 // and code-generator have been changed to avoid using scalable vector
211 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
215 return F && StringSwitch<bool>(F->getName())
230 // intrinsics could result in non-streaming ops (e.g. calls to
263 const TargetMachine &TM = getTLI()->getTargetMachine();
266 TM.getSubtargetImpl(*Caller)->getFeatureBits();
268 TM.getSubtargetImpl(*Callee)->getFeatureBits();
270 // Inline a callee if its target-features are a subset of the caller's
271 // target-features.
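A hedged sketch of the subset test described above, using std::bitset in place of LLVM's FeatureBitset (the 64-bit width and the function name are my assumptions): the callee can be inlined only when every target feature it requires is also enabled for the caller.

    #include <bitset>

    // Callee features must be a subset of caller features.
    bool calleeFeaturesAreSubset(const std::bitset<64> &CallerBits,
                                 const std::bitset<64> &CalleeBits) {
      return (CallerBits & CalleeBits) == CalleeBits;
    }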
282 // pointers to fixed-length vector types larger than 128 bits like
283 // <8 x float> (and pointers to aggregate types which have such fixed-length
286 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
287 // types can be safely treated as 128-bit NEON types and they cannot be
289 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
292 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
306 // call from F -> G (the call here is Call)
309 // a streaming-mode change is required (thus promoting the need to inline the
313 // call from F -> G (the call here is not Call)
315 // call from G -> H (the call here is Call)
318 // streaming-mode change, and the call to G from F would also require a
319 // streaming-mode change, then there is benefit to do the streaming-mode
337 ST->isNeonAvailable());
340 /// Calculate the cost of materializing a 64-bit value. This helper
360 assert(Ty->isIntegerTy());
362 unsigned BitSize = Ty->getPrimitiveSizeInBits();
366 // Sign-extend all constants to a multiple of 64-bit.
371 // Split the constant into 64-bit chunks and calculate the cost for each
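A minimal standalone sketch (not the LLVM implementation) of the chunking arithmetic these comments describe: the bit width is rounded up to a multiple of 64 and each 64-bit chunk is costed on its own. numChunks is my helper name; the real per-chunk cost comes from the target and is not modelled here.

    #include <cstdint>

    constexpr uint64_t numChunks(uint64_t BitSize) {
      return (BitSize + 63) / 64; // round up to whole 64-bit chunks
    }

    static_assert(numChunks(64) == 1, "an i64 constant is a single chunk");
    static_assert(numChunks(96) == 2, "a 96-bit constant splits into two chunks");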
387 assert(Ty->isIntegerTy());
389 unsigned BitSize = Ty->getPrimitiveSizeInBits();
455 assert(Ty->isIntegerTy());
457 unsigned BitSize = Ty->getPrimitiveSizeInBits();
522 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
523 (EltTy->getScalarSizeInBits() != 32 &&
524 EltTy->getScalarSizeInBits() != 64))
529 // FIXME: We should be able to generate histcnt for fixed-length vectors
532 if ((VTy->getElementCount().getKnownMinValue() != 2 &&
533 VTy->getElementCount().getKnownMinValue() != 4) ||
534 VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
535 !VTy->isScalableTy())
544 // The code-generator is currently not able to handle scalable vectors
546 // it. This change will be removed when code-generation for these types is
550 if (VTy->getElementCount() == ElementCount::getScalable(1))
555 if (!ST->hasSVE2())
585 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
604 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
612 // (LT.first - 1) vector adds.
614 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
617 Cost += AddCost * (LT.first - 1);
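A small worked sketch of the formula implemented above, with placeholder costs: a vector that legalises into NumParts pieces needs NumParts - 1 vector adds to combine the pieces before the final single-vector reduction. NumParts stands in for LT.first; AddCost and ReduceCost are placeholders.

    constexpr unsigned splitReductionCost(unsigned NumParts, unsigned AddCost,
                                          unsigned ReduceCost) {
      return ReduceCost + AddCost * (NumParts - 1);
    }

    static_assert(splitReductionCost(4, 1, 2) == 5, "4 parts -> 3 adds + reduce");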
624 // is 0, then this should be a no-op or simple operation; return a
630 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
633 LLVMContext &C = RetTy->getContext();
634 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
636 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
637 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
644 getTLI()->getTypeConversion(C, SubVecVT);
646 getTLI()->getTypeConversion(C, VecVT);
650 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
673 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
674 TLI->getValueType(DL, RetTy, true) == MVT::i16)
675 return LegalisationCost.first * Entry->Cost + 1;
677 return LegalisationCost.first * Entry->Cost;
682 if (!ST->hasNEON()) {
683 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
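For illustration only (not the backend's exact expansion), a NEON-free population count in the usual bit-manipulation style lands in the region of a dozen ALU instructions, which is what the cost above reflects.

    #include <cstdint>

    unsigned popcount64(uint64_t X) {
      X = X - ((X >> 1) & 0x5555555555555555ULL);                    // pairs
      X = (X & 0x3333333333333333ULL) + ((X >> 2) & 0x3333333333333333ULL); // nibbles
      X = (X + (X >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                    // bytes
      return unsigned((X * 0x0101010101010101ULL) >> 56);            // sum bytes
    }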
703 RetTy->getScalarSizeInBits()
706 return LT.first * Entry->Cost + ExtraCost;
742 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
746 return Entry->Cost;
755 EVT MTy = TLI->getValueType(DL, RetTy);
757 // output are the same, or we are using cvt f64->i32 or f32->i64.
766 if (ST->hasFullFP16() &&
775 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
778 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
815 return LegalisationCost.first * Entry->Cost;
819 if (!RetTy->isIntegerTy())
824 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
825 RetTy->getScalarSizeInBits() < 64) ||
826 (RetTy->getScalarSizeInBits() % 64 != 0);
828 if (RetTy->getScalarSizeInBits() == 32 ||
829 RetTy->getScalarSizeInBits() == 64)
841 EVT RetVT = getTLI()->getValueType(DL, RetTy);
842 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
843 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
844 !getTLI()->isTypeLegal(RetVT)) {
847 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
850 // be extremely high for fixed-width vectors.
854 return RetTy->getNumElements() * 2;
876 if (!PN->hasOneUse())
879 for (Value *IncValPhi : PN->incoming_values()) {
882 Reinterpret->getIntrinsicID() !=
884 RequiredType != Reinterpret->getArgOperand(0)->getType())
890 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
893 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
894 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
895 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
918 auto IntrinsicID = BinOp->getIntrinsicID();
932 auto BinOpPred = BinOp->getOperand(0);
933 auto BinOpOp1 = BinOp->getOperand(1);
934 auto BinOpOp2 = BinOp->getOperand(2);
938 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
941 auto PredOp = PredIntr->getOperand(0);
942 auto PredOpTy = cast<VectorType>(PredOp->getType());
971 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
984 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
985 if (CursorVTy->getElementCount().getKnownMinValue() <
986 IVTy->getElementCount().getKnownMinValue())
990 if (Cursor->getType() == IVTy)
996 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
998 IntrinsicCursor->getIntrinsicID() ==
1003 Cursor = IntrinsicCursor->getOperand(0);
1022 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1023 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1047 if (RetTy->isStructTy()) {
1049 auto VecT = StructT->getElementType(0);
1051 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1052 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1056 } else if (RetTy->isFPOrFPVectorTy())
1085 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1089 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1097 Insert->insertBefore(&II);
1098 Insert->takeName(&II);
1107 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1109 Splat->takeName(&II);
1119 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1123 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1130 if (!SplatValue || !SplatValue->isZero())
1136 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1140 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1143 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1144 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1149 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1152 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1155 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1159 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1161 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1164 unsigned NumElts = VecTy->getNumElements();
1167 // Expand intrinsic operands to a 16-bit byte level predicate
1169 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1172 if (!Arg->isZero())
1179 PFalse->takeName(&II);
1189 unsigned PredSize = Mask & -Mask;
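The expression above isolates the lowest set bit of Mask; a quick standalone check with example values of my own:

    static_assert((0b0110u & (0u - 0b0110u)) == 0b0010u,
                  "Mask & -Mask keeps only the lowest set bit");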
1208 ConvertFromSVBool->takeName(&II);
1219 // lastX(splat(X)) --> X
1224 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1229 auto OpC = OldBinOp->getOpcode();
1231 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1233 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1235 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1241 if (IsAfter && C && C->isNullValue()) {
1245 Extract->insertBefore(&II);
1246 Extract->takeName(&II);
1254 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1258 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1265 unsigned Idx = MinNumElts - 1;
1274 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1275 if (Idx >= PgVTy->getMinNumElements())
1281 Extract->insertBefore(&II);
1282 Extract->takeName(&II);
1289 // integer variant across a variety of micro-architectures. Replace scalar
1291 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1292 // depending on the micro-architecture, but has been observed as generally
1293 // being faster, particularly when the CLAST[AB] op is a loop-carried
1300 if (!Ty->isIntegerTy())
1304 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1320 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1323 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1339 RDFFR->takeName(&II);
1345 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1350 VScale->takeName(&II);
1373 Type *Tys[] = {PgVal->getType()};
1377 PTest->takeName(&II);
1388 Intrinsic::ID OpIID = Op->getIntrinsicID();
1390 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1392 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1393 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1394 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1398 PTest->takeName(&II);
1402 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1403 // Later optimizations may rewrite sequence to use the flag-setting variant
1418 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1419 Type *Tys[] = {Pg->getType()};
1422 PTest->takeName(&II);
1448 if (!Mul->hasOneUse())
1452 if (II.getType()->isFPOrFPVectorTy()) {
1456 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1486 Load->copyMetadata(II);
1491 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1493 MaskedLoad->copyMetadata(II);
1505 Store->copyMetadata(II);
1510 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1511 MaskedStore->copyMetadata(II);
1547 // Canonicalise operations that take an all active predicate (e.g. sve.add ->
1569 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1709 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1712 auto *SplatValue = IntrI->getOperand(2);
1718 OpMultiplicand->takeName(&II);
1723 auto *DupPg = DupInst->getOperand(1);
1727 OpMultiplicand->takeName(&II);
1742 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1743 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1746 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1748 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1749 NewVal->takeName(&II);
1765 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1772 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1774 VectorSplat->takeName(&II);
1785 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1786 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1793 auto *TyA = cast<ScalableVectorType>(A->getType());
1794 if (TyA == B->getType() &&
1799 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1800 ConcatVec->takeName(&II);
1810 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1811 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1842 BasePtr->getPointerAlignment(II.getDataLayout());
1845 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1850 MaskedLoad->takeName(&II);
1863 Type *Ty = Val->getType();
1872 BasePtr->getPointerAlignment(II.getDataLayout());
1874 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1898 APInt Divisor = SplatConstantInt->getValue();
1912 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1954 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1959 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1961 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1962 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1963 CurrentInsertElt = InsertElt->getOperand(0);
1972 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1986 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1987 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1988 IIScalableTy->getMinNumElements() /
2036 // * The ABS merge value is an undef or non-negative
2043 // Only valid when the shift amount is non-negative, otherwise the rounding
2308 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2318 if (ST->useSVEForFixedLengthVectors() &&
2319 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2321 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2322 else if (ST->isNeonAvailable())
2327 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2342 return VectorType::get(ArgTy->getScalarType(),
2343 cast<VectorType>(DstTy)->getElementCount());
2351 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2371 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2381 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2388 if (Args[0]->getType()->getScalarSizeInBits() -
2390 DstTy->getScalarSizeInBits() / 2)
2393 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2394 DstTy->getScalarSizeInBits() / 2));
2406 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2414 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2430 // %x = add ((zext i8 -> i16), 1)
2431 // %y = (zext i8 -> i16)
2432 // trunc i16 (lshr (add %x, %y), 1) -> i8
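A scalar sketch of the pattern shown above (my own illustration, not LLVM code): widen, add one, add the other operand, halve, narrow, i.e. an unsigned rounding halving add.

    #include <cstdint>

    uint8_t rhadd_u8(uint8_t A, uint8_t B) {
      uint16_t X = uint16_t(A) + 1; // %x = add ((zext i8 -> i16), 1)
      uint16_t Y = uint16_t(B);     // %y = (zext i8 -> i16)
      return uint8_t((X + Y) >> 1); // trunc i16 (lshr (add %x, %y), 1) -> i8
    }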
2437 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2438 (Src->isScalableTy() && !ST->hasSVE2()))
2441 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2447 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2448 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2451 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2452 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2455 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2456 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2457 Src->getScalarSizeInBits() !=
2458 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2470 Ex1->getOpcode() == Ex2->getOpcode())
2481 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2485 if (I && I->hasOneUser()) {
2486 auto *SingleUser = cast<Instruction>(*I->user_begin());
2487 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2488 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2492 if (SingleUser->getOpcode() == Instruction::Add) {
2493 if (I == SingleUser->getOperand(1) ||
2494 (isa<CastInst>(SingleUser->getOperand(1)) &&
2495 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2507 // TODO: Allow non-throughput costs that aren't binary.
2508 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2514 EVT SrcTy = TLI->getValueType(DL, Src);
2515 EVT DstTy = TLI->getValueType(DL, Dst);
2684 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2690 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2698 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2774 // Add cost for extending to illegal (too wide) scalable vectors.
2798 ST->useSVEForFixedLengthVectors(WiderTy)) {
2800 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2806 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2807 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2814 return AdjustCost(Entry->Cost);
2841 if (ST->hasFullFP16())
2844 return AdjustCost(Entry->Cost);
2848 ST->isSVEorStreamingSVEAvailable() &&
2849 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2851 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2858 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2859 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2863 return Part1 + Part2;
2870 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
2888 auto *Src = VecTy->getElementType();
2890 // Sign- and zero-extends are for integer types only.
2901 auto DstVT = TLI->getValueType(DL, Dst);
2902 auto SrcVT = TLI->getValueType(DL, Src);
2907 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2921 // For sign-extends, we only need a smov, which performs the extension
2926 // For zero-extends, the extend is performed automatically by a umov unless
2952 assert(Val->isVectorTy() && "This must be a vector type");
2954 if (Index != -1U) {
2962 // The type may be split. For fixed-width vectors we can normalize the
2970 // - For a physical (HasRealUse==true) insert-element or extract-element
2971 // instruction that extracts integers, an explicit FPR -> GPR move is
2972 // needed. So it has non-zero cost.
2973 // - For the rest of cases (virtual instruction or element type is float),
2975 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2978 // This is recognising a LD1 single-element structure to one lane of one
2982 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2983 return ST->getVectorInsertExtractBaseCost() + 1;
2987 if (Val->getScalarSizeInBits() == 1)
2988 return ST->getVectorInsertExtractBaseCost() + 1;
2991 // If the extract-element and insert-element instructions could be
2992 // simplified away (e.g., could be combined into users by looking at use-def
2994 // compile-time considerations.
2998 return ST->getVectorInsertExtractBaseCost();
3022 if (Ty->getElementType()->isFloatingPointTy())
3026 ST->getVectorInsertExtractBaseCost();
3035 // The code-generator is currently not able to handle scalable vectors
3037 // it. This change will be removed when code-generation for these types is
3040 if (VTy->getElementCount() == ElementCount::getScalable(1))
3050 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3058 // On AArch64, scalar signed divisions by power-of-two constants are
3077 auto VT = TLI->getValueType(DL, Ty);
3078 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3094 if (Ty->isVectorTy()) {
3095 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3099 ->getPrimitiveSizeInBits()
3101 EVT VT = TLI->getValueType(DL, Ty);
3112 return Entry->Cost;
3114 // For 8/16-bit elements, the cost is higher because the type
3130 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3131 return (4 + DivCost) * VTy->getNumElements();
3151 if (LT.second == MVT::v2i64 && ST->hasSVE())
3159 // - four 2-cost i64 extracts,
3160 // - two 2-cost i64 inserts, and
3161 // - two 1-cost muls.
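Summing the per-operation costs in the list above gives the scalarised cost of one legal v2i64 multiply (a worked check, not new information):

    static_assert(4 * 2 + 2 * 2 + 2 * 1 == 14,
                  "four extracts + two inserts + two scalar muls");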
3185 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3186 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3188 if (!Ty->getScalarType()->isFP128Ty())
3195 if (!Ty->getScalarType()->isFP128Ty())
3203 if (!Ty->isVectorTy())
3213 // Address computations in vectorized code with non-consecutive addresses will
3216 // extra micro-ops can significantly decrease throughput.
3220 if (Ty->isVectorTy() && SE &&
3239 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3248 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3267 (ST->hasFullFP16() &&
3287 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3288 EVT SelValTy = TLI->getValueType(DL, ValTy);
3293 return Entry->Cost;
3300 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3307 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3309 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3310 match(I->getOperand(1), m_Zero()) &&
3311 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3322 if (ST->requiresStrictAlign()) {
3328 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3332 // they could be used with no holds barred (-O3).
3339 return ST->hasSVE();
3355 if (VT->getElementType()->isIntegerTy(1))
3358 // The code-generator is currently not able to handle scalable vectors
3360 // it. This change will be removed when code-generation for these types is
3362 if (VT->getElementCount() == ElementCount::getScalable(1))
3385 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3386 VT->getElementType()->isIntegerTy(1))
3389 // The code-generator is currently not able to handle scalable vectors
3391 // it. This change will be removed when code-generation for these types is
3393 if (VT->getElementCount() == ElementCount::getScalable(1))
3398 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3402 // point we may want a per-CPU overhead.
3408 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3417 EVT VT = TLI->getValueType(DL, Ty, true);
3427 // The code-generator is currently not able to handle scalable vectors
3429 // it. This change will be removed when code-generation for these types is
3433 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3434 (VTy->getElementType()->isIntegerTy(1) &&
3435 !VTy->getElementCount().isKnownMultipleOf(
3446 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3449 // unaligned 128-bit stores because the negative impact that has shown in
3459 if (Ty->isPtrOrPtrVectorTy())
3464 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3469 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3482 // Check non-power-of-2 loads/stores for legal vector element types with
3483 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3484 // operations on smaller power-of-2 ops, including ld1/st1.
3485 LLVMContext &C = Ty->getContext();
3500 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
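A standalone sketch (not the LLVM code path) of breaking a non-power-of-2 element count into descending power-of-2 pieces, mirroring the "CurrNumElements - PrevPow2" step above; e.g. 7 elements become 4 + 2 + 1. The function name is mine.

    #include <vector>

    std::vector<unsigned> splitIntoPow2Pieces(unsigned NumElements) {
      std::vector<unsigned> Pieces;
      while (NumElements != 0) {
        unsigned Pow2 = 1;
        while (Pow2 * 2 <= NumElements)
          Pow2 *= 2;               // largest power of two that still fits
        Pieces.push_back(Pow2);    // 7 -> {4, 2, 1}
        NumElements -= Pow2;
      }
      return Pieces;
    }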
3515 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3520 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3523 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3524 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3526 VectorType::get(VecVTy->getElementType(),
3527 VecVTy->getElementCount().divideCoefficientBy(Factor));
3534 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3535 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3548 if (!I->isVectorTy())
3550 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3559 return ST->getMaxInterleaveFactor();
3573 // e.g. not counting loads in each side of an if-then-else diamond.
3574 for (const auto BB : L->blocks()) {
3580 Value *PtrValue = LMemI->getPointerOperand();
3581 if (L->isLoopInvariant(PtrValue))
3586 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3603 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3609 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3625 if (L->getLoopDepth() > 1)
3628 // Disable partial & runtime unrolling on -Os.
3631 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3638 for (auto *BB : L->getBlocks()) {
3641 if (I.getType()->isVectorTy())
3654 // Enable runtime unrolling for in-order models
3658 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3659 !ST->getSchedModel().isOutOfOrder()) {
3677 switch (Inst->getIntrinsicID()) {
3687 unsigned NumElts = Inst->arg_size() - 1;
3688 if (ST->getNumElements() != NumElts)
3691 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3697 Value *L = Inst->getArgOperand(i);
3705 if (Inst->getType() == ExpectedType)
3713 switch (Inst->getIntrinsicID()) {
3721 Info.PtrVal = Inst->getArgOperand(0);
3728 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3732 switch (Inst->getIntrinsicID()) {
3763 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3774 if (GEPInst->getNumOperands() > 2) {
3789 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3817 // The code-generator is currently not able to handle scalable vectors
3819 // it. This change will be removed when code-generation for these types is
3822 if (VTy->getElementCount() == ElementCount::getScalable(1))
3827 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3832 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3834 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3845 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3847 LegalizationCost *= LT.first - 1;
3850 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3869 // The code-generator is currently not able to handle scalable vectors
3871 // it. This change will be removed when code-generation for these types is
3874 if (VTy->getElementCount() == ElementCount::getScalable(1))
3883 return BaseCost + FixedVTy->getNumElements();
3891 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3892 Cost *= getMaxNumElements(VTy->getElementCount());
3901 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3909 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3910 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3911 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3946 return (LT.first - 1) + Entry->Cost;
3955 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3956 isPowerOf2_32(ValVTy->getNumElements())) {
3959 // Type needs to be split, so there is an extra cost of LT.first - 1
3961 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3964 ExtraCost *= LT.first - 1;
3967 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3992 // The code-generator is currently not able to handle scalable vectors
3994 // it. This change will be removed when code-generation for these types is
3996 if (Tp->getElementCount() == ElementCount::getScalable(1))
4000 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4003 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4005 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4027 LegalizationCost += Entry->Cost;
4040 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4044 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4053 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4056 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4058 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4060 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4065 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4067 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4072 // sub-vectors to ensure the result has at most 2 inputs.
4105 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4114 Cost += LTNumElts - 1;
4128 // In terms of code-size, the shuffle vector is free when a load + dup get
4137 isLegalBroadcastLoad(Tp->getElementType(),
4144 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4145 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4165 // Check for non-zero lane splats
4290 return LT.first * Entry->Cost;
4319 for (BasicBlock *BB : TheLoop->blocks()) {
4337 if (!ST->hasSVE())
4340 // We don't currently support vectorisation with interleaving for SVE - with
4341 // such loops we're better off not using tail-folding. This gives us a chance
4342 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4343 if (TFI->IAI->hasGroups())
4347 if (TFI->LVL->getReductionVars().size())
4349 if (TFI->LVL->getFixedOrderRecurrences().size())
4355 if (containsDecreasingPointers(TFI->LVL->getLoop(),
4356 TFI->LVL->getPredicatedScalarEvolution()))
4361 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
4365 // Don't tail-fold for tight loops where we would be better off interleaving
4368 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4369 NumInsns += BB->sizeWithoutDebug();
4382 // -------------------------------------------
4384 // -------------------------------------------
4393 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4397 return -1;
4403 // break point in the code - the end of a block with an unconditional
4405 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4406 isa<BranchInst>(I->getNextNode()) &&
4407 cast<BranchInst>(I->getNextNode())->isUnconditional())