//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }
  /// Estimate the cost of a broadcast as an extract and a sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zeroth element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                        CostKind, 0, nullptr, nullptr);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }
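
  // Illustrative sketch (hypothetical values, not part of this file): for a
  // <4 x i32> broadcast, the estimate above reduces to one extract of lane 0
  // plus four inserts, where Ctx names some LLVMContext:
  //
  //   InstructionCost C = getBroadcastShuffleOverhead(
  //       FixedVectorType::get(Type::getInt32Ty(Ctx), 4),
  //       TTI::TCK_RecipThroughput);
  //   // C == cost(extractelement) + 4 * cost(insertelement)
  //
  // The actual values depend entirely on the target's getVectorInstrCost.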

  /// Estimate the cost of a shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
                                            TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting each element from its
    // source argument plus the cost of inserting it into the result vector.

    // e.g. a <4 x float> shuffle with mask <0,5,2,7> extracts index 0 of the
    // first vector, index 1 of the second vector, index 2 of the first
    // vector and index 3 of the second vector, and inserts them at indices
    // <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate the cost of a subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
                                              TTI::TargetCostKind CostKind,
                                              int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting the
    // elements from the source vector plus the cost of inserting them into
    // the result vector.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                      CostKind, i + Index, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate the cost of a subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
                                             TTI::TargetCostKind CostKind,
                                             int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting the
    // elements from the subvector plus the cost of inserting them into the
    // result vector.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
                                      i + Index, nullptr, nullptr);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
      case TTI::MIM_Unindexed:
        return ISD::UNINDEXED;
      case TTI::MIM_PreInc:
        return ISD::PRE_INC;
      case TTI::MIM_PreDec:
        return ISD::PRE_DEC;
      case TTI::MIM_PostInc:
        return ISD::POST_INC;
      case TTI::MIM_PostDec:
        return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind,
                                              unsigned AddressSpace = 0) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    unsigned VF = VT->getNumElements();

    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter ? getScalarizationOverhead(
                              FixedVectorType::get(
                                  PointerType::get(VT->getContext(), 0), VF),
                              /*Insert=*/false, /*Extract=*/true, CostKind)
                        : 0;

    // The cost of the scalar loads/stores.
    InstructionCost MemoryOpCost =
        VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
                                      AddressSpace, CostKind);

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // and the branches and PHIs needed to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          getScalarizationOverhead(
              FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
              /*Insert=*/false, /*Extract=*/true, CostKind) +
          VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
                thisT()->getCFInstrCost(Instruction::PHI, CostKind));
    }

    return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
  }
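
  // Illustrative decomposition (hypothetical target, not part of this file):
  // for a variable-mask gather of <4 x i32>, the estimate above is the sum of
  //   AddrExtractCost:  extracting 4 pointers from the address vector,
  //   MemoryOpCost:     4 scalar i32 loads,
  //   PackingCost:      inserting 4 scalar results into the result vector,
  //   ConditionalCost:  extracting 4 i1 mask bits + 4 * (branch + PHI).
  // Every term is ultimately priced by the target's own cost hooks.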

  /// Checks if the provided mask \p Mask is a splat mask, i.e. it contains
  /// only -1s and a single non -1 index value, and that index value occurs at
  /// least twice. So, mask <0, -1,-1, -1> is not considered splat (it is just
  /// identity), same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1>
  /// is a splat with \p Index=2.
  static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
    // Check that the broadcast index occurs at least twice.
    bool IsCompared = false;
    if (int SplatIdx = PoisonMaskElem;
        all_of(enumerate(Mask), [&](const auto &P) {
          if (P.value() == PoisonMaskElem)
            return P.index() != Mask.size() - 1 || IsCompared;
          if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
            return false;
          if (SplatIdx == PoisonMaskElem) {
            SplatIdx = P.value();
            return P.index() != Mask.size() - 1;
          }
          IsCompared = true;
          return SplatIdx == P.value();
        })) {
      Index = SplatIdx;
      return true;
    }
    return false;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const {
    const TargetMachine &TM = getTLI()->getTargetMachine();

    const FeatureBitset &CallerBits =
        TM.getSubtargetImpl(*Caller)->getFeatureBits();
    const FeatureBitset &CalleeBits =
        TM.getSubtargetImpl(*Callee)->getFeatureBits();

    // Inline a callee if its target-features are a subset of the caller's
    // target-features.
    return (CallerBits & CalleeBits) == CalleeBits;
  }

  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  bool isSingleThreaded() const {
    return getTLI()->getTargetMachine().Options.ThreadModel ==
           ThreadModel::Single;
  }

  std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalAddScalableImmediate(int64_t Imm) {
    return getTLI()->isLegalAddScalableImmediate(Imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
                             Instruction *I = nullptr,
                             int64_t ScalableOffset = 0) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = ScalableOffset;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
    return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
  }

  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
      EVT LegalizedVT =
          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
    };
    while (VF > 2 && IsSupportedByTarget(VF))
      VF /= 2;
    return VF;
  }
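
  // Illustrative trace (hypothetical target, not part of this file): with
  // VF = 8 and i32 elements, the loop above first asks whether a <4 x i32>
  // store is legal or custom (IsSupportedByTarget(8) queries VF / 2). If it
  // is, VF becomes 4 and a <2 x i32> store is queried; if that is also
  // supported, VF becomes 2 and the loop stops, returning 2 as the minimum
  // store VF.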

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldDropLSRSolutionIfLessProfitable() const {
    return TargetTransformInfoImplBase::shouldDropLSRSolutionIfLessProfitable();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset.getFixed();
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    AM.ScalableOffset = BaseOffset.getScalable();
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. It was initially
    /// intended for estimating the cost of a switch in the inline cost
    /// heuristic, but it is a generic cost model usable in other places
    /// (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if neither a jump table nor a bit test is possible.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test.
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table.
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }
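
  // Worked example (values are illustrative, not part of this file): a switch
  // with N = 16 cases spanning values 10..25 gives Range = 25 - 10 + 1 = 16.
  // If the bit-test check above did not already fire and the target deems 16
  // entries over a range of 16 dense enough, JumpTableSize becomes 16 and a
  // single cluster is returned; otherwise all 16 cases count as clusters.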

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    const Triple &TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  bool preferToKeepConstantsAttached(const Instruction &Inst,
                                     const Function &Fn) const {
    switch (Inst.getOpcode()) {
    default:
      break;
    case Instruction::SDiv:
    case Instruction::SRem:
    case Instruction::UDiv:
    case Instruction::URem: {
      if (!isa<ConstantInt>(Inst.getOperand(1)))
        return false;
      EVT VT = getTLI()->getValueType(DL, Inst.getType());
      return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
    }
    }

    return false;
  }

  unsigned getInliningThresholdMultiplier() const { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
    return 0;
  }

  int getInlinerVectorBonusPercent() const { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

637     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
638     // and loop buffer which can benefit from partial unrolling.
639     // The relevant requirements are:
640     //  - The loop must have fewer than 16 branches
641     //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          if (ORE) {
            ORE->emit([&]() {
              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
                                        L->getHeader())
                     << "advising against unrolling the loop because it "
                        "contains a "
                     << ore::NV("Call", &I);
            });
          }
          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set the number of instructions optimized away when the "back edge"
    // becomes a "fall through" to the default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  unsigned getEpilogueVectorizationMinVF() {
    return BaseT::getEpilogueVectorizationMinVF();
  }

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    return BaseT::preferPredicateOverEpilogue(TFI);
  }

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
    return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  virtual std::optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return std::optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual std::optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    std::optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
    return getST()->shouldPrefetchAddressSpace(AS);
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
  std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract,
                                           TTI::TargetCostKind CostKind,
                                           ArrayRef<Value *> VL = {}) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           (VL.empty() || VL.size() == Ty->getNumElements()) &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert) {
        Value *InsertedVal = VL.empty() ? nullptr : VL[i];
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
                                            CostKind, i, nullptr, InsertedVal);
      }
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, i, nullptr, nullptr);
    }

    return Cost;
  }
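
  // Illustrative sketch (hypothetical values, not part of this file): for a
  // <4 x i32> with DemandedElts = 0b0101, Insert = true and Extract = false,
  // only lanes 0 and 2 are demanded, so the estimate is the sum of two
  // insertelement costs (Ctx names some LLVMContext):
  //
  //   APInt Demanded(4, 0b0101);
  //   InstructionCost C = getScalarizationOverhead(
  //       FixedVectorType::get(Type::getInt32Ty(Ctx), 4), Demanded,
  //       /*Insert=*/true, /*Extract=*/false, TTI::TCK_RecipThroughput);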

  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const {
    return false;
  }

  bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                          unsigned ScalarOpdIdx) const {
    return false;
  }

  bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
                                              int OpdIdx) const {
    return OpdIdx == -1;
  }

  bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID,
                                                        int RetIdx) const {
    return RetIdx == 0;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract,
                                           TTI::TargetCostKind CostKind) {
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                             CostKind);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  InstructionCost
  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                   ArrayRef<Type *> Tys,
                                   TTI::TargetCostKind CostKind) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys,
                                           TTI::TargetCostKind CostKind) {
    InstructionCost Cost = getScalarizationOverhead(
        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
                                       /*Extract*/ true, CostKind);

    return Cost;
  }

  /// Estimate the cost of type-legalization and the legalized type.
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;
    // We keep legalizing the type until we find a legal kind. We assume that
    // the only operation that costs anything is the split. After splitting
    // we need to handle two types.
    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Ensure we return a sensible simple VT here, since many callers of
        // this function require it.
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());

      if (LK.first == TargetLoweringBase::TypeSplitVector ||
          LK.first == TargetLoweringBase::TypeExpandInteger)
        Cost *= 2;

      // Do not loop with f128 type.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Keep legalizing the type.
      MTy = LK.second;
    }
  }
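
  // Illustrative trace (assuming a target whose widest legal vector type is
  // v4i32, which this file does not guarantee): legalizing <16 x i32> splits
  // v16i32 -> v8i32 -> v4i32, doubling Cost at each split, so the result is
  // {Cost = 4, MVT = v4i32}. A type that is already legal returns {1, VT}.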

  unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations,
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
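
  // Illustrative sketch (hypothetical target, not part of this file): an fdiv
  // on <4 x float> whose legalized type has no FDIV support falls through to
  // the scalarization path above, yielding
  //   4 * cost(scalar fdiv) + scalarization overhead (4 result inserts, plus
  //   operand extracts when Args are provided).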

  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask,
                                              VectorType *Ty, int &Index,
                                              VectorType *&SubTy) const {
    if (Mask.empty())
      return Kind;
    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc: {
      if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
        return TTI::SK_Broadcast;
      if (isSplatMask(Mask, NumSrcElts, Index))
        return TTI::SK_Broadcast;
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
          (Index + Mask.size()) <= (size_t)NumSrcElts) {
        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
        return TTI::SK_ExtractSubvector;
      }
      break;
    }
    case TTI::SK_PermuteTwoSrc: {
      int NumSubElts;
      if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                                 Mask, NumSrcElts, NumSubElts, Index)) {
        if (Index + NumSubElts > NumSrcElts)
          return Kind;
        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
        return TTI::SK_InsertSubvector;
      }
      if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
        return TTI::SK_Transpose;
      if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
        return TTI::SK_Splice;
      break;
    }
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }
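
  // Worked examples (masks are illustrative, not part of this file): on a
  // 4-element single-source shuffle, mask <3,2,1,0> is refined to SK_Reverse,
  // <0,0,0,0> to SK_Broadcast, and <2,3> to SK_ExtractSubvector with
  // Index = 2. The refined kind lets getShuffleCost below pick a cheaper,
  // more specific cost model than a generic permute.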

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = {},
                                 const Instruction *CxtI = nullptr) {
    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
    case TTI::SK_Broadcast:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::BitCast:
      // Bitcasts between types that are legalized to the same type are free;
      // assume int-to/from-ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target and the result type is legal.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1, and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
                                      CostKind) +
             Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
                                                /*Extract*/ true, CostKind)
                     : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
                                                /*Extract*/ false, CostKind)
                     : 0);
    }

    llvm_unreachable("Unhandled cast");
  }
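
  // Illustrative sketch of the split path (hypothetical target, not part of
  // this file): sign-extending <8 x i16> to <8 x i32> where only the v8i32
  // result must be split costs
  //   getVectorSplitCost() + 2 * getCastInstrCost(SExt, <4 x i32>, <4 x i16>)
  // whereas if both sides split, the split itself is counted as free.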

  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy, unsigned Index) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       CostKind, Index, nullptr, nullptr) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None, CostKind);
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) {
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }

  InstructionCost getCmpSelInstrCost(
      unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       Op1Info, Op2Info, I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

1320     // Otherwise, assume that the cast is scalarized.
1321     // TODO: If one of the types gets legalized by splitting, handle this
1322     // similarly to what getCastInstrCost() does.
1323     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1324       if (isa<ScalableVectorType>(ValTy))
1325         return InstructionCost::getInvalid();
1326 
1327       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1328       if (CondTy)
1329         CondTy = CondTy->getScalarType();
1330       InstructionCost Cost =
1331           thisT()->getCmpSelInstrCost(Opcode, ValVTy->getScalarType(), CondTy,
1332                                       VecPred, CostKind, Op1Info, Op2Info, I);
1333 
1334       // Return the cost of multiple scalar invocations plus the cost of
1335       // inserting and extracting the values.
1336       return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1337                                       /*Extract*/ false, CostKind) +
1338              Num * Cost;
1339     }
1340 
1341     // Unknown scalar opcode.
1342     return 1;
1343   }
1344 
1345   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1346                                      TTI::TargetCostKind CostKind,
1347                                      unsigned Index, Value *Op0, Value *Op1) {
1348     return getRegUsageForType(Val->getScalarType());
1349   }
1350 
1351   /// \param ScalarUserAndIdx encodes the information about extracts from a
1352   /// vector, with 'Scalar' being the value being extracted, 'User' being the
1353   /// user of the extract (nullptr if the user is not known before
1354   /// vectorization), and 'Idx' being the extract lane.
1355   InstructionCost getVectorInstrCost(
1356       unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1357       Value *Scalar,
1358       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
1359     return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
1360                                        nullptr);
1361   }
1362 
1363   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1364                                      TTI::TargetCostKind CostKind,
1365                                      unsigned Index) {
1366     Value *Op0 = nullptr;
1367     Value *Op1 = nullptr;
1368     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1369       Op0 = IE->getOperand(0);
1370       Op1 = IE->getOperand(1);
1371     }
1372     return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1373                                        Op1);
1374   }
1375 
1376   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1377                                             int VF,
1378                                             const APInt &DemandedDstElts,
1379                                             TTI::TargetCostKind CostKind) {
1380     assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1381            "Unexpected size of DemandedDstElts.");
1382 
1383     InstructionCost Cost;
1384 
1385     auto *SrcVT = FixedVectorType::get(EltTy, VF);
1386     auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1387 
1388     // The Mask shuffling cost is the cost of extracting all the elements of
1389     // the Mask and inserting each of them Factor times into the wide vector:
1390     //
1391     // E.g. an interleaved group with factor 3:
1392     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1393     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1394     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1395     // The cost is estimated as extracting all mask elements from the <8xi1>
1396     // mask vector and inserting them factor times into the <24xi1> shuffled
1397     // mask vector.
1398     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1399     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1400                                               /*Insert*/ false,
1401                                               /*Extract*/ true, CostKind);
1402     Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1403                                               /*Insert*/ true,
1404                                               /*Extract*/ false, CostKind);
1405 
1406     return Cost;
1407   }
1408 
1409   InstructionCost
1410   getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1411                   unsigned AddressSpace, TTI::TargetCostKind CostKind,
1412                   TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
1413                   const Instruction *I = nullptr) {
1414     assert(!Src->isVoidTy() && "Invalid type");
1415     // Assume types, such as structs, are expensive.
1416     if (getTLI()->getValueType(DL, Src, /*AllowUnknown=*/true) == MVT::Other)
1417       return 4;
1418     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1419 
1420     // Assume that all loads of legal types cost 1.
1421     InstructionCost Cost = LT.first;
1422     if (CostKind != TTI::TCK_RecipThroughput)
1423       return Cost;
1424 
1425     const DataLayout &DL = this->getDataLayout();
1426     if (Src->isVectorTy() &&
1427         // In practice it's not currently possible to have a change in lane
1428         // length for extending loads or truncating stores, so both types
1429         // should have the same scalable property.
1430         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1431                             LT.second.getSizeInBits())) {
1432       // This is a vector load that legalizes to a larger type than the vector
1433       // itself. Unless the corresponding extending load or truncating store
1434       // is legal, this will scalarize.
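           // E.g. a load of <4 x i8> whose legal type is v4i32 needs a legal
           // extending load from v4i8; otherwise we also pay to build the
           // loaded vector from scalars (or to decompose it for a store).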
1435       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1436       EVT MemVT = getTLI()->getValueType(DL, Src);
1437       if (Opcode == Instruction::Store)
1438         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1439       else
1440         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1441 
1442       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1443         // This is a vector load/store for some illegal type that is scalarized.
1444         // We must account for the cost of building or decomposing the vector.
1445         Cost += getScalarizationOverhead(
1446             cast<VectorType>(Src), Opcode != Instruction::Store,
1447             Opcode == Instruction::Store, CostKind);
1448       }
1449     }
1450 
1451     return Cost;
1452   }
1453 
1454   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1455                                         Align Alignment, unsigned AddressSpace,
1456                                         TTI::TargetCostKind CostKind) {
1457     // TODO: Pass on AddressSpace when we have test coverage.
1458     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1459                                        CostKind);
1460   }
1461 
1462   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1463                                          const Value *Ptr, bool VariableMask,
1464                                          Align Alignment,
1465                                          TTI::TargetCostKind CostKind,
1466                                          const Instruction *I = nullptr) {
1467     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1468                                        true, CostKind);
1469   }
1470 
1471   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
1472                                          const Value *Ptr, bool VariableMask,
1473                                          Align Alignment,
1474                                          TTI::TargetCostKind CostKind,
1475                                          const Instruction *I) {
1476     // For a target without strided memory operations (or for an illegal
1477     // operation type on one which does), assume we lower to a gather/scatter
1478     // operation, which may in turn be scalarized.
1479     return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1480                                            Alignment, CostKind, I);
1481   }
1482 
1483   InstructionCost getInterleavedMemoryOpCost(
1484       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1485       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1486       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1487 
1488     // We cannot scalarize scalable vectors, so return Invalid.
1489     if (isa<ScalableVectorType>(VecTy))
1490       return InstructionCost::getInvalid();
1491 
1492     auto *VT = cast<FixedVectorType>(VecTy);
1493 
1494     unsigned NumElts = VT->getNumElements();
1495     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1496 
1497     unsigned NumSubElts = NumElts / Factor;
1498     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1499 
1500     // First, the cost of the load/store operation.
1501     InstructionCost Cost;
1502     if (UseMaskForCond || UseMaskForGaps)
1503       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1504                                             AddressSpace, CostKind);
1505     else
1506       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1507                                       CostKind);
1508 
1509     // Legalize the vector type, and get the legalized and unlegalized type
1510     // sizes.
1511     MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1512     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1513     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1514 
1515     // Scale the cost of the memory operation by the fraction of legalized
1516     // instructions that will actually be used. We shouldn't account for the
1517     // cost of dead instructions since they will be removed.
1518     //
1519     // E.g., An interleaved load of factor 8:
1520     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1521     //       %v0 = shufflevector %vec, undef, <0, 8>
1522     //
1523     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1524     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1525     // type). The other loads are unused.
1526     //
1527     // TODO: Note that legalization can turn masked loads/stores into unmasked
1528     // (legalized) loads/stores. This can be reflected in the cost.
1529     if (Cost.isValid() && VecTySize > VecTyLTSize) {
1530       // The number of loads of a legal type it will take to represent a load
1531       // of the unlegalized vector type.
1532       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1533 
1534       // The number of elements of the unlegalized type that correspond to a
1535       // single legal instruction.
1536       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1537 
1538       // Determine which legal instructions will be used.
1539       BitVector UsedInsts(NumLegalInsts, false);
1540       for (unsigned Index : Indices)
1541         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1542           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1543 
1544       // Scale the cost of the load by the fraction of legal instructions that
1545       // will be used.
1546       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1547     }
1548 
1549     // Then add the cost of the interleave operation.
1550     assert(Indices.size() <= Factor &&
1551            "Interleaved memory op has too many members");
1552 
1553     const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1554     const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1555 
1556     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1557     for (unsigned Index : Indices) {
1558       assert(Index < Factor && "Invalid index for interleaved memory op");
1559       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1560         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1561     }
1562 
1563     if (Opcode == Instruction::Load) {
1564       // The interleave cost is similar to extracting the sub vectors'
1565       // elements from the wide vector and inserting them into the sub vectors.
1566       //
1567       // E.g. An interleaved load of factor 2 (with one member of index 0):
1568       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1569       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1570       // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1571       // <8 x i32> vector and inserting them into a <4 x i32> vector.
1572       InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1573           SubVT, DemandedAllSubElts,
1574           /*Insert*/ true, /*Extract*/ false, CostKind);
1575       Cost += Indices.size() * InsSubCost;
1576       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1577                                                 /*Insert*/ false,
1578                                                 /*Extract*/ true, CostKind);
1579     } else {
1580       // The interleave cost is the cost of extracting elements from the sub
1581       // vectors and inserting them into the wide vector.
1582       //
1583       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1584       // (using VF=4):
1585       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1586       //    %gaps.mask = <true, true, false, true, true, false,
1587       //                  true, true, false, true, true, false>
1588       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1589       //                           i32 Align, <12 x i1> %gaps.mask
1590       // The cost is estimated as extracting all elements (of actual members,
1591       // excluding gaps) from both <4 x i32> vectors and inserting them into
1592       // the <12 x i32> vector.
1593       InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1594           SubVT, DemandedAllSubElts,
1595           /*Insert*/ false, /*Extract*/ true, CostKind);
1596       Cost += ExtSubCost * Indices.size();
1597       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1598                                                 /*Insert*/ true,
1599                                                 /*Extract*/ false, CostKind);
1600     }
1601 
1602     if (!UseMaskForCond)
1603       return Cost;
1604 
1605     Type *I8Type = Type::getInt8Ty(VT->getContext());
1606 
1607     Cost += thisT()->getReplicationShuffleCost(
1608         I8Type, Factor, NumSubElts,
1609         UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1610         CostKind);
1611 
1612     // The Gaps mask is invariant and created outside the loop; therefore,
1613     // the cost of creating it is not accounted for here. However, if we have
1614     // both a MaskForGaps and some other mask that guards the execution of the
1615     // memory access, we need to account for the cost of And-ing the two masks
1616     // inside the loop.
1617     if (UseMaskForGaps) {
1618       auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1619       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1620                                               CostKind);
1621     }
1622 
1623     return Cost;
1624   }
1625 
1626   /// Get intrinsic cost based on arguments.
1627   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1628                                         TTI::TargetCostKind CostKind) {
1629     // Check for generically free intrinsics.
1630     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1631       return 0;
1632 
1633     // Assume that target intrinsics are cheap.
1634     Intrinsic::ID IID = ICA.getID();
1635     if (Intrinsic::isTargetIntrinsic(IID))
1636       return TargetTransformInfo::TCC_Basic;
1637 
1638     // VP Intrinsics should have the same cost as their non-vp counterpart.
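         // E.g. llvm.vp.add on <4 x i32> is costed like a plain <4 x i32> add.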
1639     // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1640     // counterpart when the vector length argument is smaller than the maximum
1641     // vector length.
1642     // TODO: Support other kinds of VPIntrinsics
1643     if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1644       std::optional<unsigned> FOp =
1645           VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1646       if (FOp) {
1647         if (ICA.getID() == Intrinsic::vp_load) {
1648           Align Alignment;
1649           if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1650             Alignment = VPI->getPointerAlignment().valueOrOne();
1651           unsigned AS = 0;
1652           if (ICA.getArgTypes().size() > 1)
1653             if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
1654               AS = PtrTy->getAddressSpace();
1655           return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1656                                           AS, CostKind);
1657         }
1658         if (ICA.getID() == Intrinsic::vp_store) {
1659           Align Alignment;
1660           if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1661             Alignment = VPI->getPointerAlignment().valueOrOne();
1662           unsigned AS = 0;
1663           if (ICA.getArgTypes().size() >= 2)
1664             if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
1665               AS = PtrTy->getAddressSpace();
1666           return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
1667                                           AS, CostKind);
1668         }
1669         if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
1670           return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1671                                                  CostKind);
1672         }
1673         if (VPCastIntrinsic::isVPCast(ICA.getID())) {
1674           return thisT()->getCastInstrCost(
1675               *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1676               TTI::CastContextHint::None, CostKind);
1677         }
1678         if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
1679           // We can only handle vp_cmp intrinsics with underlying instructions.
1680           if (ICA.getInst()) {
1681             assert(FOp);
1682             auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
1683             return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
1684                                                ICA.getReturnType(),
1685                                                UI->getPredicate(), CostKind);
1686           }
1687         }
1688       }
1689 
1690       std::optional<Intrinsic::ID> FID =
1691           VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1692       if (FID) {
1693         // The non-vp version will have the same arg types, except for the
1694         // mask and vector length.
1695         assert(ICA.getArgTypes().size() >= 2 &&
1696                "Expected VPIntrinsic to have Mask and Vector Length args and "
1697                "types");
1698         ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
1699 
1700         // VPReduction intrinsics have a start value argument that their
1701         // non-vp counterparts do not have, except for the fadd and fmul
1702         // non-vp counterparts.
1703         if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
1704             *FID != Intrinsic::vector_reduce_fadd &&
1705             *FID != Intrinsic::vector_reduce_fmul)
1706           NewTys = NewTys.drop_front();
1707 
1708         IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1709                                        ICA.getFlags());
1710         return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1711       }
1712     }
1713 
1714     if (ICA.isTypeBasedOnly())
1715       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1716 
1717     Type *RetTy = ICA.getReturnType();
1718 
1719     ElementCount RetVF =
1720         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1721                              : ElementCount::getFixed(1));
1722     const IntrinsicInst *I = ICA.getInst();
1723     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1724     FastMathFlags FMF = ICA.getFlags();
1725     switch (IID) {
1726     default:
1727       break;
1728 
1729     case Intrinsic::powi:
1730       if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1731         bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1732         if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1733                                                ShouldOptForSize)) {
1734           // The cost is modeled on the expansion performed by ExpandPowI in
1735           // SelectionDAGBuilder.
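               // E.g. for powi(x, 11): 11 = 0b1011, so ActiveBits = 4 and
               // PopCount = 3, i.e. (4 + 3 - 2) = 5 fmuls: 3 squarings plus
               // 2 multiplies folding the set bits into the result.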
1736           APInt Exponent = RHSC->getValue().abs();
1737           unsigned ActiveBits = Exponent.getActiveBits();
1738           unsigned PopCount = Exponent.popcount();
1739           InstructionCost Cost = (ActiveBits + PopCount - 2) *
1740                                  thisT()->getArithmeticInstrCost(
1741                                      Instruction::FMul, RetTy, CostKind);
1742           if (RHSC->isNegative())
1743             Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1744                                                     CostKind);
1745           return Cost;
1746         }
1747       }
1748       break;
1749     case Intrinsic::cttz:
1750       // FIXME: If necessary, this should go in target-specific overrides.
1751       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1752         return TargetTransformInfo::TCC_Basic;
1753       break;
1754 
1755     case Intrinsic::ctlz:
1756       // FIXME: If necessary, this should go in target-specific overrides.
1757       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1758         return TargetTransformInfo::TCC_Basic;
1759       break;
1760 
1761     case Intrinsic::memcpy:
1762       return thisT()->getMemcpyCost(ICA.getInst());
1763 
1764     case Intrinsic::masked_scatter: {
1765       const Value *Mask = Args[3];
1766       bool VarMask = !isa<Constant>(Mask);
1767       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1768       return thisT()->getGatherScatterOpCost(Instruction::Store,
1769                                              ICA.getArgTypes()[0], Args[1],
1770                                              VarMask, Alignment, CostKind, I);
1771     }
1772     case Intrinsic::masked_gather: {
1773       const Value *Mask = Args[2];
1774       bool VarMask = !isa<Constant>(Mask);
1775       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1776       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1777                                              VarMask, Alignment, CostKind, I);
1778     }
1779     case Intrinsic::experimental_vp_strided_store: {
1780       const Value *Data = Args[0];
1781       const Value *Ptr = Args[1];
1782       const Value *Mask = Args[3];
1783       const Value *EVL = Args[4];
1784       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1785       Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1786       Align Alignment =
1787           I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1788       return thisT()->getStridedMemoryOpCost(Instruction::Store,
1789                                              Data->getType(), Ptr, VarMask,
1790                                              Alignment, CostKind, I);
1791     }
1792     case Intrinsic::experimental_vp_strided_load: {
1793       const Value *Ptr = Args[0];
1794       const Value *Mask = Args[2];
1795       const Value *EVL = Args[3];
1796       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1797       Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1798       Align Alignment =
1799           I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1800       return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1801                                              VarMask, Alignment, CostKind, I);
1802     }
1803     case Intrinsic::stepvector: {
1804       if (isa<ScalableVectorType>(RetTy))
1805         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1806       // The cost of materialising a constant integer vector.
1807       return TargetTransformInfo::TCC_Basic;
1808     }
1809     case Intrinsic::vector_extract: {
1810       // FIXME: Handle case where a scalable vector is extracted from a scalable
1811       // vector
1812       if (isa<ScalableVectorType>(RetTy))
1813         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1814       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1815       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1816                                      cast<VectorType>(Args[0]->getType()), {},
1817                                      CostKind, Index, cast<VectorType>(RetTy));
1818     }
1819     case Intrinsic::vector_insert: {
1820       // FIXME: Handle case where a scalable vector is inserted into a scalable
1821       // vector
1822       if (isa<ScalableVectorType>(Args[1]->getType()))
1823         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1824       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1825       return thisT()->getShuffleCost(
1826           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
1827           CostKind, Index, cast<VectorType>(Args[1]->getType()));
1828     }
1829     case Intrinsic::vector_reverse: {
1830       return thisT()->getShuffleCost(TTI::SK_Reverse,
1831                                      cast<VectorType>(Args[0]->getType()), {},
1832                                      CostKind, 0, cast<VectorType>(RetTy));
1833     }
1834     case Intrinsic::vector_splice: {
1835       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1836       return thisT()->getShuffleCost(TTI::SK_Splice,
1837                                      cast<VectorType>(Args[0]->getType()), {},
1838                                      CostKind, Index, cast<VectorType>(RetTy));
1839     }
1840     case Intrinsic::vector_reduce_add:
1841     case Intrinsic::vector_reduce_mul:
1842     case Intrinsic::vector_reduce_and:
1843     case Intrinsic::vector_reduce_or:
1844     case Intrinsic::vector_reduce_xor:
1845     case Intrinsic::vector_reduce_smax:
1846     case Intrinsic::vector_reduce_smin:
1847     case Intrinsic::vector_reduce_fmax:
1848     case Intrinsic::vector_reduce_fmin:
1849     case Intrinsic::vector_reduce_fmaximum:
1850     case Intrinsic::vector_reduce_fminimum:
1851     case Intrinsic::vector_reduce_umax:
1852     case Intrinsic::vector_reduce_umin: {
1853       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1854       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1855     }
1856     case Intrinsic::vector_reduce_fadd:
1857     case Intrinsic::vector_reduce_fmul: {
1858       IntrinsicCostAttributes Attrs(
1859           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1860       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1861     }
1862     case Intrinsic::fshl:
1863     case Intrinsic::fshr: {
1864       const Value *X = Args[0];
1865       const Value *Y = Args[1];
1866       const Value *Z = Args[2];
1867       const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1868       const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1869       const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1870       const TTI::OperandValueInfo OpInfoBW =
1871         {TTI::OK_UniformConstantValue,
1872          isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1873          : TTI::OP_None};
1874 
1875       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1876       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1877       InstructionCost Cost = 0;
1878       Cost +=
1879           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1880       Cost +=
1881           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1882       Cost += thisT()->getArithmeticInstrCost(
1883           BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1884           {OpInfoZ.Kind, TTI::OP_None});
1885       Cost += thisT()->getArithmeticInstrCost(
1886           BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1887           {OpInfoZ.Kind, TTI::OP_None});
1888       // Non-constant shift amounts require a modulo.
1889       if (!OpInfoZ.isConstant())
1890         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1891                                                 CostKind, OpInfoZ, OpInfoBW);
1892       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
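           // When (Z % BW) == 0 the complementary shift amount equals BW, which
           // is poison for a plain shift, so the expansion must select the
           // unshifted input; e.g. fshl(X, Y, 0) == X.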
1893       if (X != Y) {
1894         Type *CondTy = RetTy->getWithNewBitWidth(1);
1895         Cost +=
1896             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1897                                         CmpInst::ICMP_EQ, CostKind);
1898         Cost +=
1899             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1900                                         CmpInst::ICMP_EQ, CostKind);
1901       }
1902       return Cost;
1903     }
1904     case Intrinsic::get_active_lane_mask: {
1905       EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1906       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1907 
1908       // If we're not expanding the intrinsic then we assume this is cheap
1909       // to implement.
1910       if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1911         return getTypeLegalizationCost(RetTy).first;
1912       }
1913 
1914       // Create the expanded types that will be used to calculate the uadd_sat
1915       // operation.
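           // Roughly: uadd.sat(splat(%base), stepvector) produces the lane
           // indices, and icmp ult against splat(%n) produces the mask.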
1916       Type *ExpRetTy = VectorType::get(
1917           ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1918       IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1919       InstructionCost Cost =
1920           thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1921       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1922                                           CmpInst::ICMP_ULT, CostKind);
1923       return Cost;
1924     }
1925     case Intrinsic::experimental_cttz_elts: {
1926       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1927 
1928       // If we're not expanding the intrinsic then we assume this is cheap
1929       // to implement.
1930       if (!getTLI()->shouldExpandCttzElements(ArgType))
1931         return getTypeLegalizationCost(RetTy).first;
1932 
1933       // TODO: The costs below reflect the expansion code in
1934       // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1935       // favour of compile time.
1936 
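           // Roughly: a stepvector and a sub give each lane i the value
           // (VL - i); sext + and zero out unset mask lanes; a umax reduction
           // then yields VL - FirstSetIndex, and a final sub recovers the index.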
1937       // Find the smallest "sensible" element type to use for the expansion.
1938       bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1939       ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1940       if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1941         VScaleRange = getVScaleRange(I->getCaller(), 64);
1942 
1943       unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1944           RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1945       Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1946 
1947       // Create the new vector type and get the vector length.
1948       Type *NewVecTy = VectorType::get(
1949           NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1950 
1951       IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
1952                                            FMF);
1953       InstructionCost Cost =
1954           thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1955 
1956       Cost +=
1957           thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1958       Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1959                                         Args[0]->getType(),
1960                                         TTI::CastContextHint::None, CostKind);
1961       Cost +=
1962           thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1963 
1964       IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1965                                          NewEltTy, NewVecTy, FMF, I, 1);
1966       Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1967       Cost +=
1968           thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1969 
1970       return Cost;
1971     }
1972     case Intrinsic::experimental_vector_match:
1973       return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1974     }
1975 
1976     // Assume that we need to scalarize this intrinsic.
1977     // Compute the scalarization overhead based on Args for a vector
1978     // intrinsic.
1979     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1980     if (RetVF.isVector() && !RetVF.isScalable()) {
1981       ScalarizationCost = 0;
1982       if (!RetTy->isVoidTy())
1983         ScalarizationCost += getScalarizationOverhead(
1984             cast<VectorType>(RetTy),
1985             /*Insert*/ true, /*Extract*/ false, CostKind);
1986       ScalarizationCost +=
1987           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1988     }
1989 
1990     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1991                                   ScalarizationCost);
1992     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1993   }
1994 
1995   /// Get intrinsic cost based on argument types.
1996   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1997   /// cost of scalarizing the arguments and the return value will be computed
1998   /// based on types.
1999   InstructionCost
2000   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2001                                  TTI::TargetCostKind CostKind) {
2002     Intrinsic::ID IID = ICA.getID();
2003     Type *RetTy = ICA.getReturnType();
2004     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2005     FastMathFlags FMF = ICA.getFlags();
2006     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2007     bool SkipScalarizationCost = ICA.skipScalarizationCost();
2008 
2009     VectorType *VecOpTy = nullptr;
2010     if (!Tys.empty()) {
2011       // The vector reduction operand is operand 0 except for fadd/fmul.
2012       // Their operand 0 is a scalar start value, so the vector op is operand 1.
2013       unsigned VecTyIndex = 0;
2014       if (IID == Intrinsic::vector_reduce_fadd ||
2015           IID == Intrinsic::vector_reduce_fmul)
2016         VecTyIndex = 1;
2017       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
2018       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
2019     }
2020 
2021     // Library call cost: except when optimizing for code size, make it expensive.
2022     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
2023     unsigned ISD = 0;
2024     switch (IID) {
2025     default: {
2026       // Scalable vectors cannot be scalarized, so return Invalid.
2027       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2028             return isa<ScalableVectorType>(Ty);
2029           }))
2030         return InstructionCost::getInvalid();
2031 
2032       // Assume that we need to scalarize this intrinsic.
2033       InstructionCost ScalarizationCost =
2034           SkipScalarizationCost ? ScalarizationCostPassed : 0;
2035       unsigned ScalarCalls = 1;
2036       Type *ScalarRetTy = RetTy;
2037       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2038         if (!SkipScalarizationCost)
2039           ScalarizationCost = getScalarizationOverhead(
2040               RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
2041         ScalarCalls = std::max(ScalarCalls,
2042                                cast<FixedVectorType>(RetVTy)->getNumElements());
2043         ScalarRetTy = RetTy->getScalarType();
2044       }
2045       SmallVector<Type *, 4> ScalarTys;
2046       for (Type *Ty : Tys) {
2047         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2048           if (!SkipScalarizationCost)
2049             ScalarizationCost += getScalarizationOverhead(
2050                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2051           ScalarCalls = std::max(ScalarCalls,
2052                                  cast<FixedVectorType>(VTy)->getNumElements());
2053           Ty = Ty->getScalarType();
2054         }
2055         ScalarTys.push_back(Ty);
2056       }
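           // E.g. a <4 x float> llvm.sin is costed as 4 scalar sin calls plus
           // the overhead of inserting the 4 results and extracting the 4
           // argument lanes.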
2057       if (ScalarCalls == 1)
2058         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
2059 
2060       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2061       InstructionCost ScalarCost =
2062           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2063 
2064       return ScalarCalls * ScalarCost + ScalarizationCost;
2065     }
2066     // Look for intrinsics that can be lowered directly or turned into a scalar
2067     // intrinsic call.
2068     case Intrinsic::sqrt:
2069       ISD = ISD::FSQRT;
2070       break;
2071     case Intrinsic::sin:
2072       ISD = ISD::FSIN;
2073       break;
2074     case Intrinsic::cos:
2075       ISD = ISD::FCOS;
2076       break;
2077     case Intrinsic::sincos:
2078       ISD = ISD::FSINCOS;
2079       break;
2080     case Intrinsic::tan:
2081       ISD = ISD::FTAN;
2082       break;
2083     case Intrinsic::asin:
2084       ISD = ISD::FASIN;
2085       break;
2086     case Intrinsic::acos:
2087       ISD = ISD::FACOS;
2088       break;
2089     case Intrinsic::atan:
2090       ISD = ISD::FATAN;
2091       break;
2092     case Intrinsic::atan2:
2093       ISD = ISD::FATAN2;
2094       break;
2095     case Intrinsic::sinh:
2096       ISD = ISD::FSINH;
2097       break;
2098     case Intrinsic::cosh:
2099       ISD = ISD::FCOSH;
2100       break;
2101     case Intrinsic::tanh:
2102       ISD = ISD::FTANH;
2103       break;
2104     case Intrinsic::exp:
2105       ISD = ISD::FEXP;
2106       break;
2107     case Intrinsic::exp2:
2108       ISD = ISD::FEXP2;
2109       break;
2110     case Intrinsic::exp10:
2111       ISD = ISD::FEXP10;
2112       break;
2113     case Intrinsic::log:
2114       ISD = ISD::FLOG;
2115       break;
2116     case Intrinsic::log10:
2117       ISD = ISD::FLOG10;
2118       break;
2119     case Intrinsic::log2:
2120       ISD = ISD::FLOG2;
2121       break;
2122     case Intrinsic::fabs:
2123       ISD = ISD::FABS;
2124       break;
2125     case Intrinsic::canonicalize:
2126       ISD = ISD::FCANONICALIZE;
2127       break;
2128     case Intrinsic::minnum:
2129       ISD = ISD::FMINNUM;
2130       break;
2131     case Intrinsic::maxnum:
2132       ISD = ISD::FMAXNUM;
2133       break;
2134     case Intrinsic::minimum:
2135       ISD = ISD::FMINIMUM;
2136       break;
2137     case Intrinsic::maximum:
2138       ISD = ISD::FMAXIMUM;
2139       break;
2140     case Intrinsic::minimumnum:
2141       ISD = ISD::FMINIMUMNUM;
2142       break;
2143     case Intrinsic::maximumnum:
2144       ISD = ISD::FMAXIMUMNUM;
2145       break;
2146     case Intrinsic::copysign:
2147       ISD = ISD::FCOPYSIGN;
2148       break;
2149     case Intrinsic::floor:
2150       ISD = ISD::FFLOOR;
2151       break;
2152     case Intrinsic::ceil:
2153       ISD = ISD::FCEIL;
2154       break;
2155     case Intrinsic::trunc:
2156       ISD = ISD::FTRUNC;
2157       break;
2158     case Intrinsic::nearbyint:
2159       ISD = ISD::FNEARBYINT;
2160       break;
2161     case Intrinsic::rint:
2162       ISD = ISD::FRINT;
2163       break;
2164     case Intrinsic::lrint:
2165       ISD = ISD::LRINT;
2166       break;
2167     case Intrinsic::llrint:
2168       ISD = ISD::LLRINT;
2169       break;
2170     case Intrinsic::round:
2171       ISD = ISD::FROUND;
2172       break;
2173     case Intrinsic::roundeven:
2174       ISD = ISD::FROUNDEVEN;
2175       break;
2176     case Intrinsic::pow:
2177       ISD = ISD::FPOW;
2178       break;
2179     case Intrinsic::fma:
2180       ISD = ISD::FMA;
2181       break;
2182     case Intrinsic::fmuladd:
2183       ISD = ISD::FMA;
2184       break;
2185     case Intrinsic::experimental_constrained_fmuladd:
2186       ISD = ISD::STRICT_FMA;
2187       break;
2188     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2189     case Intrinsic::lifetime_start:
2190     case Intrinsic::lifetime_end:
2191     case Intrinsic::sideeffect:
2192     case Intrinsic::pseudoprobe:
2193     case Intrinsic::arithmetic_fence:
2194       return 0;
2195     case Intrinsic::masked_store: {
2196       Type *Ty = Tys[0];
2197       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2198       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2199                                             CostKind);
2200     }
2201     case Intrinsic::masked_load: {
2202       Type *Ty = RetTy;
2203       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2204       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2205                                             CostKind);
2206     }
2207     case Intrinsic::vector_reduce_add:
2208     case Intrinsic::vector_reduce_mul:
2209     case Intrinsic::vector_reduce_and:
2210     case Intrinsic::vector_reduce_or:
2211     case Intrinsic::vector_reduce_xor:
2212       return thisT()->getArithmeticReductionCost(
2213           getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2214           CostKind);
2215     case Intrinsic::vector_reduce_fadd:
2216     case Intrinsic::vector_reduce_fmul:
2217       return thisT()->getArithmeticReductionCost(
2218           getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2219     case Intrinsic::vector_reduce_smax:
2220     case Intrinsic::vector_reduce_smin:
2221     case Intrinsic::vector_reduce_umax:
2222     case Intrinsic::vector_reduce_umin:
2223     case Intrinsic::vector_reduce_fmax:
2224     case Intrinsic::vector_reduce_fmin:
2225     case Intrinsic::vector_reduce_fmaximum:
2226     case Intrinsic::vector_reduce_fminimum:
2227       return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2228                                              VecOpTy, ICA.getFlags(), CostKind);
2229     case Intrinsic::experimental_vector_match: {
2230       auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2231       auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2232       unsigned SearchSize = NeedleTy->getNumElements();
2233 
2234       // If we're not expanding the intrinsic then we assume this is cheap to
2235       // implement.
2236       EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2237       if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2238         return getTypeLegalizationCost(RetTy).first;
2239 
2240       // Approximate the cost based on the expansion code in
2241       // SelectionDAGBuilder.
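           // For each needle element: extract it, splat it across the search
           // vector (insertelement + broadcast shuffle), compare with icmp eq,
           // and OR into the accumulated result; finally AND with the mask.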
2242       InstructionCost Cost = 0;
2243       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2244                                           CostKind, 1, nullptr, nullptr);
2245       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2246                                           CostKind, 0, nullptr, nullptr);
2247       Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
2248                                       CostKind, 0, nullptr);
2249       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2250                                           CmpInst::ICMP_EQ, CostKind);
2251       Cost +=
2252           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2253       Cost *= SearchSize;
2254       Cost +=
2255           thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2256       return Cost;
2257     }
2258     case Intrinsic::abs:
2259       ISD = ISD::ABS;
2260       break;
2261     case Intrinsic::fshl:
2262       ISD = ISD::FSHL;
2263       break;
2264     case Intrinsic::fshr:
2265       ISD = ISD::FSHR;
2266       break;
2267     case Intrinsic::smax:
2268       ISD = ISD::SMAX;
2269       break;
2270     case Intrinsic::smin:
2271       ISD = ISD::SMIN;
2272       break;
2273     case Intrinsic::umax:
2274       ISD = ISD::UMAX;
2275       break;
2276     case Intrinsic::umin:
2277       ISD = ISD::UMIN;
2278       break;
2279     case Intrinsic::sadd_sat:
2280       ISD = ISD::SADDSAT;
2281       break;
2282     case Intrinsic::ssub_sat:
2283       ISD = ISD::SSUBSAT;
2284       break;
2285     case Intrinsic::uadd_sat:
2286       ISD = ISD::UADDSAT;
2287       break;
2288     case Intrinsic::usub_sat:
2289       ISD = ISD::USUBSAT;
2290       break;
2291     case Intrinsic::smul_fix:
2292       ISD = ISD::SMULFIX;
2293       break;
2294     case Intrinsic::umul_fix:
2295       ISD = ISD::UMULFIX;
2296       break;
2297     case Intrinsic::sadd_with_overflow:
2298       ISD = ISD::SADDO;
2299       break;
2300     case Intrinsic::ssub_with_overflow:
2301       ISD = ISD::SSUBO;
2302       break;
2303     case Intrinsic::uadd_with_overflow:
2304       ISD = ISD::UADDO;
2305       break;
2306     case Intrinsic::usub_with_overflow:
2307       ISD = ISD::USUBO;
2308       break;
2309     case Intrinsic::smul_with_overflow:
2310       ISD = ISD::SMULO;
2311       break;
2312     case Intrinsic::umul_with_overflow:
2313       ISD = ISD::UMULO;
2314       break;
2315     case Intrinsic::fptosi_sat:
2316       ISD = ISD::FP_TO_SINT_SAT;
2317       break;
2318     case Intrinsic::fptoui_sat:
2319       ISD = ISD::FP_TO_UINT_SAT;
2320       break;
2321     case Intrinsic::ctpop:
2322       ISD = ISD::CTPOP;
2323       // In case of legalization use TCC_Expensive. This is cheaper than a
2324       // library call but still not a cheap instruction.
2325       SingleCallCost = TargetTransformInfo::TCC_Expensive;
2326       break;
2327     case Intrinsic::ctlz:
2328       ISD = ISD::CTLZ;
2329       break;
2330     case Intrinsic::cttz:
2331       ISD = ISD::CTTZ;
2332       break;
2333     case Intrinsic::bswap:
2334       ISD = ISD::BSWAP;
2335       break;
2336     case Intrinsic::bitreverse:
2337       ISD = ISD::BITREVERSE;
2338       break;
2339     case Intrinsic::ucmp:
2340       ISD = ISD::UCMP;
2341       break;
2342     case Intrinsic::scmp:
2343       ISD = ISD::SCMP;
2344       break;
2345     }
2346 
2347     auto *ST = dyn_cast<StructType>(RetTy);
2348     Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2349     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2350 
2351     const TargetLoweringBase *TLI = getTLI();
2352 
2353     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2354       if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2355           TLI->isFAbsFree(LT.second)) {
2356         return 0;
2357       }
2358 
2359       // The operation is legal. Assume it costs 1.
2360       // If the type is split to multiple registers, assume that there is some
2361       // overhead to this.
2362       // TODO: Once we have extract/insert subvector cost we need to use them.
2363       if (LT.first > 1)
2364         return (LT.first * 2);
2365       else
2366         return (LT.first * 1);
2367     } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2368       // If the operation is custom lowered, then assume
2369       // that the code is twice as expensive.
2370       return (LT.first * 2);
2371     }
2372 
2373     switch (IID) {
2374     case Intrinsic::fmuladd: {
2375       // If we can't lower fmuladd into an FMA, estimate the cost as a
2376       // floating-point mul followed by an add.
2377 
2378       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2379                                              CostKind) +
2380              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2381                                              CostKind);
2382     }
2383     case Intrinsic::experimental_constrained_fmuladd: {
2384       IntrinsicCostAttributes FMulAttrs(
2385         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2386       IntrinsicCostAttributes FAddAttrs(
2387         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2388       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2389              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2390     }
2391     case Intrinsic::smin:
2392     case Intrinsic::smax:
2393     case Intrinsic::umin:
2394     case Intrinsic::umax: {
2395       // minmax(X,Y) = select(icmp(X,Y),X,Y)
2396       Type *CondTy = RetTy->getWithNewBitWidth(1);
2397       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2398       CmpInst::Predicate Pred =
2399           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2400       InstructionCost Cost = 0;
2401       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2402                                           Pred, CostKind);
2403       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2404                                           Pred, CostKind);
2405       return Cost;
2406     }
2407     case Intrinsic::sadd_with_overflow:
2408     case Intrinsic::ssub_with_overflow: {
2409       Type *SumTy = RetTy->getContainedType(0);
2410       Type *OverflowTy = RetTy->getContainedType(1);
2411       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2412                             ? BinaryOperator::Add
2413                             : BinaryOperator::Sub;
2414 
2415       //   Add:
2416       //   Overflow -> (Result < LHS) ^ (RHS < 0)
2417       //   Sub:
2418       //   Overflow -> (Result < LHS) ^ (RHS > 0)
2419       InstructionCost Cost = 0;
2420       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2421       Cost +=
2422           2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2423                                           CmpInst::ICMP_SGT, CostKind);
2424       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2425                                               CostKind);
2426       return Cost;
2427     }
2428     case Intrinsic::uadd_with_overflow:
2429     case Intrinsic::usub_with_overflow: {
2430       Type *SumTy = RetTy->getContainedType(0);
2431       Type *OverflowTy = RetTy->getContainedType(1);
2432       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2433                             ? BinaryOperator::Add
2434                             : BinaryOperator::Sub;
2435       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2436                                     ? CmpInst::ICMP_ULT
2437                                     : CmpInst::ICMP_UGT;
2438 
2439       InstructionCost Cost = 0;
2440       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2441       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2442                                           OverflowTy, Pred, CostKind);
2443       return Cost;
2444     }
2445     case Intrinsic::smul_with_overflow:
2446     case Intrinsic::umul_with_overflow: {
2447       Type *MulTy = RetTy->getContainedType(0);
2448       Type *OverflowTy = RetTy->getContainedType(1);
2449       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2450       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2451       bool IsSigned = IID == Intrinsic::smul_with_overflow;
2452 
2453       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2454       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2455 
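           // Model the default expansion: widen both operands to twice the bit
           // width, multiply, and inspect the high half. Unsigned overflow is
           // (hi != 0); signed overflow is (hi != (lo s>> (BW - 1))).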
2456       InstructionCost Cost = 0;
2457       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2458       Cost +=
2459           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2460       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2461                                             CCH, CostKind);
2462       Cost += thisT()->getArithmeticInstrCost(
2463           Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2464           {TTI::OK_UniformConstantValue, TTI::OP_None});
2465 
2466       if (IsSigned)
2467         Cost += thisT()->getArithmeticInstrCost(
2468             Instruction::AShr, MulTy, CostKind,
2469             {TTI::OK_AnyValue, TTI::OP_None},
2470             {TTI::OK_UniformConstantValue, TTI::OP_None});
2471 
2472       Cost += thisT()->getCmpSelInstrCost(
2473           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2474       return Cost;
2475     }
2476     case Intrinsic::sadd_sat:
2477     case Intrinsic::ssub_sat: {
2478       // Assume a default expansion.
2479       Type *CondTy = RetTy->getWithNewBitWidth(1);
2480 
2481       Type *OpTy = StructType::create({RetTy, CondTy});
2482       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2483                                      ? Intrinsic::sadd_with_overflow
2484                                      : Intrinsic::ssub_with_overflow;
2485       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2486 
2487       // SatMax -> Overflow && SumDiff < 0
2488       // SatMin -> Overflow && SumDiff >= 0
2489       InstructionCost Cost = 0;
2490       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2491                                     nullptr, ScalarizationCostPassed);
2492       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2493       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2494                                           Pred, CostKind);
2495       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2496                                               CondTy, Pred, CostKind);
2497       return Cost;
2498     }
2499     case Intrinsic::uadd_sat:
2500     case Intrinsic::usub_sat: {
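           // Assume a default expansion via the overflow intrinsics:
           //   uadd.sat(X, Y) -> select(Ovf, -1, Sum)
           //   usub.sat(X, Y) -> select(Ovf,  0, Diff)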
2501       Type *CondTy = RetTy->getWithNewBitWidth(1);
2502 
2503       Type *OpTy = StructType::create({RetTy, CondTy});
2504       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2505                                      ? Intrinsic::uadd_with_overflow
2506                                      : Intrinsic::usub_with_overflow;
2507 
2508       InstructionCost Cost = 0;
2509       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2510                                     nullptr, ScalarizationCostPassed);
2511       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2512       Cost +=
2513           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2514                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2515       return Cost;
2516     }
2517     case Intrinsic::smul_fix:
2518     case Intrinsic::umul_fix: {
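           // Assume a widening expansion: trunc((ext(X) * ext(Y)) >> Scale),
           // reassembling the result from the low and high halves with
           // lshr/shl/or.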
2519       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2520       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2521 
2522       unsigned ExtOp =
2523           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2524       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2525 
2526       InstructionCost Cost = 0;
2527       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2528       Cost +=
2529           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2530       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2531                                             CCH, CostKind);
2532       Cost += thisT()->getArithmeticInstrCost(
2533           Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2534           {TTI::OK_UniformConstantValue, TTI::OP_None});
2535       Cost += thisT()->getArithmeticInstrCost(
2536           Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2537           {TTI::OK_UniformConstantValue, TTI::OP_None});
2538       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2539       return Cost;
2540     }
2541     case Intrinsic::abs: {
2542       // abs(X) = select(icmp(X,0),X,sub(0,X))
2543       Type *CondTy = RetTy->getWithNewBitWidth(1);
2544       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2545       InstructionCost Cost = 0;
2546       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2547                                           Pred, CostKind);
2548       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2549                                           Pred, CostKind);
2550       // TODO: Should we add an OperandValueProperties::OP_Zero property?
2551       Cost += thisT()->getArithmeticInstrCost(
2552           BinaryOperator::Sub, RetTy, CostKind,
2553           {TTI::OK_UniformConstantValue, TTI::OP_None});
2554       return Cost;
2555     }
2556     case Intrinsic::fshl:
2557     case Intrinsic::fshr: {
2558       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
2559       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
2560       Type *CondTy = RetTy->getWithNewBitWidth(1);
2561       InstructionCost Cost = 0;
2562       Cost +=
2563           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2564       Cost +=
2565           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
2566       Cost +=
2567           thisT()->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
2568       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
2569                                               CostKind);
2570       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
2571                                               CostKind);
2572       // Shift-by-zero handling: select the unshifted input when Z % BW == 0.
2573       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2574                                           CmpInst::ICMP_EQ, CostKind);
2575       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2576                                           CmpInst::ICMP_EQ, CostKind);
2577       return Cost;
2578     }
2579     case Intrinsic::fptosi_sat:
2580     case Intrinsic::fptoui_sat: {
2581       if (Tys.empty())
2582         break;
2583       Type *FromTy = Tys[0];
2584       bool IsSigned = IID == Intrinsic::fptosi_sat;
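           // Modelled as clamping the input to the representable range with
           // minnum/maxnum, converting with fptosi/fptoui, and, for the
           // signed case, selecting zero when the input is NaN.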
2585 
2586       InstructionCost Cost = 0;
2587       IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2588                                      {FromTy, FromTy});
2589       Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2590       IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2591                                      {FromTy, FromTy});
2592       Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2593       Cost += thisT()->getCastInstrCost(
2594           IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2595           TTI::CastContextHint::None, CostKind);
2596       if (IsSigned) {
2597         Type *CondTy = RetTy->getWithNewBitWidth(1);
2598         Cost += thisT()->getCmpSelInstrCost(
2599             BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2600         Cost += thisT()->getCmpSelInstrCost(
2601             BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2602       }
2603       return Cost;
2604     }
2605     case Intrinsic::ucmp:
2606     case Intrinsic::scmp: {
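           // ucmp/scmp(x, y) returns -1, 0 or 1. Both expansions below start
           // from the two compares x > y and x < y and then either select the
           // result or compute zext(x > y) - zext(x < y).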
2607       Type *CmpTy = Tys[0];
2608       Type *CondTy = RetTy->getWithNewBitWidth(1);
2609       InstructionCost Cost =
2610           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2611                                       CmpIntrinsic::getGTPredicate(IID),
2612                                       CostKind) +
2613           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2614                                       CmpIntrinsic::getLTPredicate(IID),
2615                                       CostKind);
2616 
2617       EVT VT = TLI->getValueType(DL, CmpTy, true);
2618       if (TLI->shouldExpandCmpUsingSelects(VT)) {
2619         // x < y ? -1 : (x > y ? 1 : 0)
2620         Cost += 2 * thisT()->getCmpSelInstrCost(
2621                         BinaryOperator::Select, RetTy, CondTy,
2622                         ICmpInst::BAD_ICMP_PREDICATE, CostKind);
2623       } else {
2624         // zext(x > y) - zext(x < y)
2625         Cost +=
2626             2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
2627                                           TTI::CastContextHint::None, CostKind);
2628         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2629                                                 CostKind);
2630       }
2631       return Cost;
2632     }
2633     default:
2634       break;
2635     }
2636 
2637     // Else, assume that we need to scalarize this intrinsic. For math builtins
2638     // this will emit a costly libcall, adding call overhead and spills. Make it
2639     // very expensive.
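         // For example, a <4 x float> intrinsic with one <4 x float> argument
         // is costed as 4 scalar calls plus the overhead of inserting the 4
         // result lanes and extracting the 4 argument lanes.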
2640     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2641       // Scalable vectors cannot be scalarized, so return Invalid.
2642       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2643             return isa<ScalableVectorType>(Ty);
2644           }))
2645         return InstructionCost::getInvalid();
2646 
2647       InstructionCost ScalarizationCost =
2648           SkipScalarizationCost
2649               ? ScalarizationCostPassed
2650               : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2651                                          /*Extract*/ false, CostKind);
2652 
2653       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2654       SmallVector<Type *, 4> ScalarTys;
2655       for (Type *Ty : Tys) {
2656         if (Ty->isVectorTy())
2657           Ty = Ty->getScalarType();
2658         ScalarTys.push_back(Ty);
2659       }
2660       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2661       InstructionCost ScalarCost =
2662           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2663       for (Type *Ty : Tys) {
2664         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2665           if (!ICA.skipScalarizationCost())
2666             ScalarizationCost += getScalarizationOverhead(
2667                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2668           ScalarCalls = std::max(ScalarCalls,
2669                                  cast<FixedVectorType>(VTy)->getNumElements());
2670         }
2671       }
2672       return ScalarCalls * ScalarCost + ScalarizationCost;
2673     }
2674 
2675     // This is going to be turned into a library call, make it expensive.
2676     return SingleCallCost;
2677   }
2678 
2679   /// Compute a cost of the given call instruction.
2680   ///
2681   /// Compute the cost of calling function F with return type RetTy and
2682   /// argument types Tys. F might be nullptr, in this case the cost of an
2683   /// arbitrary call with the specified signature will be returned.
2684   /// This is used, for instance, when we estimate the cost of calling a
2685   /// vector counterpart of the given function.
2686   /// \param F Called function, might be nullptr.
2687   /// \param RetTy Return value types.
2688   /// \param Tys Argument types.
2689   /// \returns The cost of Call instruction.
2690   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2691                                    ArrayRef<Type *> Tys,
2692                                    TTI::TargetCostKind CostKind) {
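         // A deliberately high default: without target information, assume
         // the call survives to codegen and pays full call overhead.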
2693     return 10;
2694   }
2695 
2696   unsigned getNumberOfParts(Type *Tp) {
2697     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2698     if (!LT.first.isValid())
2699       return 0;
2700     // Try to find actual number of parts for non-power-of-2 elements as
2701     // ceil(num-of-elements/num-of-subtype-elements).
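         // For example (illustrative), <9 x i32> on a target whose widest
         // legal vector is <4 x i32> legalizes by widening to <16 x i32>,
         // i.e. 4 parts, but only ceil(9 / 4) = 3 parts are actually needed.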
2702     if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
2703         FTp && LT.second.isFixedLengthVector() &&
2704         !has_single_bit(FTp->getNumElements())) {
2705       if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
2706               EVT(LT.second).getTypeForEVT(Tp->getContext()));
2707           SubTp && SubTp->getElementType() == FTp->getElementType())
2708         return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
2709     }
2710     return *LT.first.getValue();
2711   }
2712 
2713   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2714                                             const SCEV *) {
2715     return 0;
2716   }
2717 
2718   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2719   /// We assume that the reduction operation is performed in the following way:
2720   ///
2721   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2722   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef>
2723   ///            \----------------v-------------/  \----------v------------/
2724   ///                            n/2 elements               n/2 elements
2725   /// %red1 = op <n x t> %val, <n x t> %val1
2726   /// After this operation we have a vector %red1 where only the first n/2
2727   /// elements are meaningful, the second n/2 elements are undefined and can be
2728   /// dropped. All other operations are actually working with the vector of
2729   /// length n/2, not n, though the real vector length is still n.
2730   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2731   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef>
2732   ///            \----------------v-------------/  \----------v------------/
2733   ///                            n/4 elements               3*n/4 elements
2734   /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2735   /// length n/2, the resulting vector has length n/4 etc.
2736   ///
2737   /// The cost model should take into account that the actual length of the
2738   /// vector is reduced on each iteration.
2739   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2740                                        TTI::TargetCostKind CostKind) {
2741     // Targets must implement a default value for the scalable case, since
2742     // we don't know how many lanes the vector has.
2743     if (isa<ScalableVectorType>(Ty))
2744       return InstructionCost::getInvalid();
2745 
2746     Type *ScalarTy = Ty->getElementType();
2747     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2748     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2749         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2750         NumVecElts >= 2) {
2751       // Or reduction for i1 is represented as:
2752       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2753       // %res = cmp ne iReduxWidth %val, 0
2754       // And reduction for i1 is represented as:
2755       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2756       // %res = cmp eq iReduxWidth %val, -1 (all ones)
2757       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2758       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2759                                        TTI::CastContextHint::None, CostKind) +
2760              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2761                                          CmpInst::makeCmpResultType(ValTy),
2762                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2763     }
2764     unsigned NumReduxLevels = Log2_32(NumVecElts);
2765     InstructionCost ArithCost = 0;
2766     InstructionCost ShuffleCost = 0;
2767     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2768     unsigned LongVectorCount = 0;
2769     unsigned MVTLen =
2770         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
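         // While the vector is wider than a legal register, halve it: each
         // step pays one extract-subvector shuffle plus one vector op on the
         // low half.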
2771     while (NumVecElts > MVTLen) {
2772       NumVecElts /= 2;
2773       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2774       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2775                                              CostKind, NumVecElts, SubTy);
2776       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2777       Ty = SubTy;
2778       ++LongVectorCount;
2779     }
2780 
2781     NumReduxLevels -= LongVectorCount;
2782 
2783     // The minimal length of the vector is limited by the real length of vector
2784     // operations performed on the current platform. That's why several final
2785     // reduction operations are performed on the vectors with the same
2786     // architecture-dependent length.
2787 
2788     // By default reductions need one shuffle per reduction level.
2789     ShuffleCost +=
2790         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2791                                                  {}, CostKind, 0, Ty);
2792     ArithCost +=
2793         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2794     return ShuffleCost + ArithCost +
2795            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2796                                        CostKind, 0, nullptr, nullptr);
2797   }
2798 
2799   /// Try to calculate the cost of performing strict (in-order) reductions,
2800   /// which involves doing a sequence of floating point additions in lane
2801   /// order, starting with an initial value. For example, consider a scalar
2802   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2803   ///
2804   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2805   ///
2806   ///   %add1 = %InitVal + %v0
2807   ///   %add2 = %add1 + %v1
2808   ///   %add3 = %add2 + %v2
2809   ///   %add4 = %add3 + %v3
2810   ///
2811   /// As a simple estimate we can say the cost of such a reduction is 4 times
2812   /// the cost of a scalar FP addition. We can only estimate the costs for
2813   /// fixed-width vectors here because for scalable vectors we do not know the
2814   /// runtime number of operations.
2815   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2816                                           TTI::TargetCostKind CostKind) {
2817     // Targets must implement a default value for the scalable case, since
2818     // we don't know how many lanes the vector has.
2819     if (isa<ScalableVectorType>(Ty))
2820       return InstructionCost::getInvalid();
2821 
2822     auto *VTy = cast<FixedVectorType>(Ty);
2823     InstructionCost ExtractCost = getScalarizationOverhead(
2824         VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2825     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2826         Opcode, VTy->getElementType(), CostKind);
2827     ArithCost *= VTy->getNumElements();
2828 
2829     return ExtractCost + ArithCost;
2830   }
2831 
2832   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2833                                              std::optional<FastMathFlags> FMF,
2834                                              TTI::TargetCostKind CostKind) {
2835     assert(Ty && "Unknown reduction vector type");
2836     if (TTI::requiresOrderedReduction(FMF))
2837       return getOrderedReductionCost(Opcode, Ty, CostKind);
2838     return getTreeReductionCost(Opcode, Ty, CostKind);
2839   }
2840 
2841   /// Try to calculate op costs for min/max reduction operations.
2842   /// \param CondTy Conditional type for the Select instruction.
2843   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2844                                          FastMathFlags FMF,
2845                                          TTI::TargetCostKind CostKind) {
2846     // Targets must implement a default value for the scalable case, since
2847     // we don't know how many lanes the vector has.
2848     if (isa<ScalableVectorType>(Ty))
2849       return InstructionCost::getInvalid();
2850 
2851     Type *ScalarTy = Ty->getElementType();
2852     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2853     unsigned NumReduxLevels = Log2_32(NumVecElts);
2854     InstructionCost MinMaxCost = 0;
2855     InstructionCost ShuffleCost = 0;
2856     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2857     unsigned LongVectorCount = 0;
2858     unsigned MVTLen =
2859         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2860     while (NumVecElts > MVTLen) {
2861       NumVecElts /= 2;
2862       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2863 
2864       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2865                                              CostKind, NumVecElts, SubTy);
2866 
2867       IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2868       MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2869       Ty = SubTy;
2870       ++LongVectorCount;
2871     }
2872 
2873     NumReduxLevels -= LongVectorCount;
2874 
2875     // The minimal length of the vector is limited by the real length of vector
2876     // operations performed on the current platform. That's why several final
2877     // reduction operations are performed on the vectors with the same
2878     // architecture-dependent length.
2879     ShuffleCost +=
2880         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2881                                                  {}, CostKind, 0, Ty);
2882     IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2883     MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2884     // The last min/max should be in vector registers and we counted it above.
2885     // So we just need a single extractelement.
2886     return ShuffleCost + MinMaxCost +
2887            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2888                                        CostKind, 0, nullptr, nullptr);
2889   }
2890 
2891   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2892                                            Type *ResTy, VectorType *Ty,
2893                                            FastMathFlags FMF,
2894                                            TTI::TargetCostKind CostKind) {
2895     if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
2896         FTy && IsUnsigned && Opcode == Instruction::Add &&
2897         FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
2898       // Represent vector_reduce_add(ZExt(<n x i1>)) as
2899       // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
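           // E.g. (illustrative) reducing zext(<8 x i1> %m) to i32 becomes
           //   zext(ctpop(bitcast <8 x i1> %m to i8)) to i32,
           // a single population count plus a bitcast and an extend.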
2900       auto *IntTy =
2901           IntegerType::get(ResTy->getContext(), FTy->getNumElements());
2902       IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
2903       return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
2904                                        TTI::CastContextHint::None, CostKind) +
2905              thisT()->getIntrinsicInstrCost(ICA, CostKind);
2906     }
2907     // Without any native support, this is equivalent to the cost of
2908     // vecreduce.opcode(ext(Ty A)).
2909     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2910     InstructionCost RedCost =
2911         thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2912     InstructionCost ExtCost = thisT()->getCastInstrCost(
2913         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2914         TTI::CastContextHint::None, CostKind);
2915 
2916     return RedCost + ExtCost;
2917   }
2918 
2919   InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2920                                          VectorType *Ty,
2921                                          TTI::TargetCostKind CostKind) {
2922     // Without any native support, this is equivalent to the cost of
2923     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2924     // vecreduce.add(mul(A, B)).
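         // This is the pattern behind dot-product style reductions, e.g. an
         // i32 vecreduce.add over mul(zext(<16 x i8> %a), zext(<16 x i8> %b)).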
2925     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2926     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2927         Instruction::Add, ExtTy, std::nullopt, CostKind);
2928     InstructionCost ExtCost = thisT()->getCastInstrCost(
2929         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2930         TTI::CastContextHint::None, CostKind);
2931 
2932     InstructionCost MulCost =
2933         thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2934 
2935     return RedCost + MulCost + 2 * ExtCost;
2936   }
2937 
2938   InstructionCost getVectorSplitCost() { return 1; }
2939 
2940   /// @}
2941 };
2942 
2943 /// Concrete BasicTTIImpl that can be used if no further customization
2944 /// is needed.
2945 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2946   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2947 
2948   friend class BasicTTIImplBase<BasicTTIImpl>;
2949 
2950   const TargetSubtargetInfo *ST;
2951   const TargetLoweringBase *TLI;
2952 
2953   const TargetSubtargetInfo *getST() const { return ST; }
2954   const TargetLoweringBase *getTLI() const { return TLI; }
2955 
2956 public:
2957   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2958 };
2959 
2960 } // end namespace llvm
2961 
2962 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2963