xref: /netbsd-src/external/apache2/llvm/dist/llvm/include/llvm/CodeGen/BasicTTIImpl.h (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering
12 /// interfaces.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
18 
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/Analysis/TargetTransformInfoImpl.h"
27 #include "llvm/CodeGen/ISDOpcodes.h"
28 #include "llvm/CodeGen/TargetLowering.h"
29 #include "llvm/CodeGen/TargetSubtargetInfo.h"
30 #include "llvm/CodeGen/ValueTypes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/Constant.h"
33 #include "llvm/IR/Constants.h"
34 #include "llvm/IR/DataLayout.h"
35 #include "llvm/IR/DerivedTypes.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/Intrinsics.h"
40 #include "llvm/IR/Operator.h"
41 #include "llvm/IR/Type.h"
42 #include "llvm/IR/Value.h"
43 #include "llvm/Support/Casting.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/ErrorHandling.h"
46 #include "llvm/Support/MachineValueType.h"
47 #include "llvm/Support/MathExtras.h"
48 #include "llvm/Target/TargetMachine.h"
49 #include <algorithm>
50 #include <cassert>
51 #include <cstdint>
52 #include <limits>
53 #include <utility>
54 
55 namespace llvm {
56 
57 class Function;
58 class GlobalValue;
59 class LLVMContext;
60 class ScalarEvolution;
61 class SCEV;
62 class TargetMachine;
63 
64 extern cl::opt<unsigned> PartialUnrollingThreshold;
65 
66 /// Base class which can be used to help build a TTI implementation.
67 ///
68 /// This class provides as much implementation of the TTI interface as is
69 /// possible using the target independent parts of the code generator.
70 ///
71 /// In order to subclass it, your class must implement a getST() method to
72 /// return the subtarget, and a getTLI() method to return the target lowering.
73 /// We need these methods implemented in the derived class so that this class
74 /// doesn't have to duplicate storage for them.
75 template <typename T>
76 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
77 private:
78   using BaseT = TargetTransformInfoImplCRTPBase<T>;
79   using TTI = TargetTransformInfo;
80 
81   /// Helper function to access this as a T.
82   T *thisT() { return static_cast<T *>(this); }
83 
84   /// Estimate a cost of Broadcast as an extract and sequence of insert
85   /// operations.
86   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
87     InstructionCost Cost = 0;
88     // Broadcast cost is equal to the cost of extracting the zero'th element
89     // plus the cost of inserting it into every element of the result vector.
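    // For example, broadcasting a <4 x i32> value is modeled as one
    // extractelement (lane 0) plus four insertelements; assuming unit cost for
    // each per-element operation, the estimate would be 5.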
90     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
91 
92     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
93       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
94     }
95     return Cost;
96   }
97 
98   /// Estimate a cost of shuffle as a sequence of extract and insert
99   /// operations.
100   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
101     InstructionCost Cost = 0;
102     // Shuffle cost is equal to the cost of extracting each element from its
103     // source argument plus the cost of inserting it into the result vector.
104
105     // E.g. a <4 x float> shuffle with mask <0,5,2,7> extracts from index 0 of
106     // the first vector, index 1 of the second vector, index 2 of the first
107     // vector and finally index 3 of the second vector, and inserts them at
108     // indices <0,1,2,3> of the result vector.
109     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
110       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
111       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
112     }
113     return Cost;
114   }
115 
116   /// Estimate a cost of subvector extraction as a sequence of extract and
117   /// insert operations.
118   InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
119                                        FixedVectorType *SubVTy) {
120     assert(VTy && SubVTy &&
121            "Can only extract subvectors from vectors");
122     int NumSubElts = SubVTy->getNumElements();
123     assert((!isa<FixedVectorType>(VTy) ||
124             (Index + NumSubElts) <=
125                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
126            "SK_ExtractSubvector index out of range");
127 
128     InstructionCost Cost = 0;
129     // Subvector extraction cost is equal to the cost of extracting each element
130     // from the source vector plus the cost of inserting it into the result
131     // vector.
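    // E.g. extracting a <2 x i32> subvector at index 2 from an <8 x i32> vector
    // is modeled as two extracts (lanes 2 and 3) plus two inserts; with unit
    // per-element costs that gives an estimate of 4.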
132     for (int i = 0; i != NumSubElts; ++i) {
133       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
134                                           i + Index);
135       Cost +=
136           thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
137     }
138     return Cost;
139   }
140 
141   /// Estimate a cost of subvector insertion as a sequence of extract and
142   /// insert operations.
143   InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
144                                       FixedVectorType *SubVTy) {
145     assert(VTy && SubVTy &&
146            "Can only insert subvectors into vectors");
147     int NumSubElts = SubVTy->getNumElements();
148     assert((!isa<FixedVectorType>(VTy) ||
149             (Index + NumSubElts) <=
150                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
151            "SK_InsertSubvector index out of range");
152 
153     InstructionCost Cost = 0;
154     // Subvector insertion cost is equal to the cost of extracting each element
155     // from the subvector plus the cost of inserting it into the destination
156     // vector.
157     for (int i = 0; i != NumSubElts; ++i) {
158       Cost +=
159           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
160       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
161                                           i + Index);
162     }
163     return Cost;
164   }
165 
166   /// Local query method delegates up to T which *must* implement this!
167   const TargetSubtargetInfo *getST() const {
168     return static_cast<const T *>(this)->getST();
169   }
170 
171   /// Local query method delegates up to T which *must* implement this!
172   const TargetLoweringBase *getTLI() const {
173     return static_cast<const T *>(this)->getTLI();
174   }
175 
176   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
177     switch (M) {
178       case TTI::MIM_Unindexed:
179         return ISD::UNINDEXED;
180       case TTI::MIM_PreInc:
181         return ISD::PRE_INC;
182       case TTI::MIM_PreDec:
183         return ISD::PRE_DEC;
184       case TTI::MIM_PostInc:
185         return ISD::POST_INC;
186       case TTI::MIM_PostDec:
187         return ISD::POST_DEC;
188     }
189     llvm_unreachable("Unexpected MemIndexedMode");
190   }
191 
192   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
193                                               Align Alignment,
194                                               bool VariableMask,
195                                               bool IsGatherScatter,
196                                               TTI::TargetCostKind CostKind) {
197     auto *VT = cast<FixedVectorType>(DataTy);
198     // Assume the target does not have support for gather/scatter operations
199     // and provide a rough estimate.
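    // As a rough example, a masked gather of <4 x i32> with a variable mask is
    // modeled as 4 x (pointer extract + scalar load), plus the cost of packing
    // the four results into a vector, plus 4 x (mask-bit extract + branch + phi)
    // for the conditional execution, with each piece priced by the corresponding
    // hooks (getVectorInstrCost, getMemoryOpCost, getCFInstrCost).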
200     //
201     // First, compute the cost of the individual memory operations.
202     InstructionCost AddrExtractCost =
203         IsGatherScatter
204             ? getVectorInstrCost(Instruction::ExtractElement,
205                                  FixedVectorType::get(
206                                      PointerType::get(VT->getElementType(), 0),
207                                      VT->getNumElements()),
208                                  -1)
209             : 0;
210     InstructionCost LoadCost =
211         VT->getNumElements() *
212         (AddrExtractCost +
213          getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
214 
215     // Next, compute the cost of packing the result in a vector.
216     InstructionCost PackingCost = getScalarizationOverhead(
217         VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
218 
219     InstructionCost ConditionalCost = 0;
220     if (VariableMask) {
221       // Compute the cost of conditionally executing the memory operations with
222       // variable masks. This includes extracting the individual conditions, the
223       // branches, and the PHIs to combine the results.
224       // NOTE: Estimating the cost of conditionally executing the memory
225       // operations accurately is quite difficult and the current solution
226       // provides a very rough estimate only.
227       ConditionalCost =
228           VT->getNumElements() *
229           (getVectorInstrCost(
230                Instruction::ExtractElement,
231                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
232                                     VT->getNumElements()),
233                -1) +
234            getCFInstrCost(Instruction::Br, CostKind) +
235            getCFInstrCost(Instruction::PHI, CostKind));
236     }
237 
238     return LoadCost + PackingCost + ConditionalCost;
239   }
240 
241 protected:
242   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
243       : BaseT(DL) {}
244   virtual ~BasicTTIImplBase() = default;
245 
246   using TargetTransformInfoImplBase::DL;
247 
248 public:
249   /// \name Scalar TTI Implementations
250   /// @{
251   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
252                                       unsigned AddressSpace, Align Alignment,
253                                       bool *Fast) const {
254     EVT E = EVT::getIntegerVT(Context, BitWidth);
255     return getTLI()->allowsMisalignedMemoryAccesses(
256         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
257   }
258 
259   bool hasBranchDivergence() { return false; }
260 
261   bool useGPUDivergenceAnalysis() { return false; }
262 
263   bool isSourceOfDivergence(const Value *V) { return false; }
264 
265   bool isAlwaysUniform(const Value *V) { return false; }
266 
267   unsigned getFlatAddressSpace() {
268     // Return an invalid address space.
269     return -1;
270   }
271 
272   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
273                                   Intrinsic::ID IID) const {
274     return false;
275   }
276 
277   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
278     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
279   }
280 
281   unsigned getAssumedAddrSpace(const Value *V) const {
282     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
283   }
284 
285   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
286                                           Value *NewV) const {
287     return nullptr;
288   }
289 
290   bool isLegalAddImmediate(int64_t imm) {
291     return getTLI()->isLegalAddImmediate(imm);
292   }
293 
294   bool isLegalICmpImmediate(int64_t imm) {
295     return getTLI()->isLegalICmpImmediate(imm);
296   }
297 
298   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
299                              bool HasBaseReg, int64_t Scale,
300                              unsigned AddrSpace, Instruction *I = nullptr) {
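    // E.g. an addressing mode of the form BaseReg + 4*IndexReg + 16 is queried
    // with HasBaseReg = true, Scale = 4 and BaseOffset = 16 (BaseGV left null).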
301     TargetLoweringBase::AddrMode AM;
302     AM.BaseGV = BaseGV;
303     AM.BaseOffs = BaseOffset;
304     AM.HasBaseReg = HasBaseReg;
305     AM.Scale = Scale;
306     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
307   }
308 
309   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
310                           const DataLayout &DL) const {
311     EVT VT = getTLI()->getValueType(DL, Ty);
312     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
313   }
314 
315   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
316                            const DataLayout &DL) const {
317     EVT VT = getTLI()->getValueType(DL, Ty);
318     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
319   }
320 
321   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
322     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
323   }
324 
325   bool isNumRegsMajorCostOfLSR() {
326     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
327   }
328 
329   bool isProfitableLSRChainElement(Instruction *I) {
330     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
331   }
332 
333   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
334                                        int64_t BaseOffset, bool HasBaseReg,
335                                        int64_t Scale, unsigned AddrSpace) {
336     TargetLoweringBase::AddrMode AM;
337     AM.BaseGV = BaseGV;
338     AM.BaseOffs = BaseOffset;
339     AM.HasBaseReg = HasBaseReg;
340     AM.Scale = Scale;
341     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
342   }
343 
344   bool isTruncateFree(Type *Ty1, Type *Ty2) {
345     return getTLI()->isTruncateFree(Ty1, Ty2);
346   }
347 
348   bool isProfitableToHoist(Instruction *I) {
349     return getTLI()->isProfitableToHoist(I);
350   }
351 
352   bool useAA() const { return getST()->useAA(); }
353 
354   bool isTypeLegal(Type *Ty) {
355     EVT VT = getTLI()->getValueType(DL, Ty);
356     return getTLI()->isTypeLegal(VT);
357   }
358 
359   InstructionCost getRegUsageForType(Type *Ty) {
360     InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
361     assert(Val >= 0 && "Negative cost!");
362     return Val;
363   }
364 
365   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
366                              ArrayRef<const Value *> Operands) {
367     return BaseT::getGEPCost(PointeeType, Ptr, Operands);
368   }
369 
370   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
371                                             unsigned &JumpTableSize,
372                                             ProfileSummaryInfo *PSI,
373                                             BlockFrequencyInfo *BFI) {
374     /// Try to find the estimated number of clusters. Note that the number of
375     /// clusters identified in this function could be different from the actual
376     /// numbers found in lowering. This function ignores switches that are
377     /// lowered with a mix of jump table / bit test / BTree. This function was
378     /// initially intended for estimating the cost of a switch in the inline
379     /// cost heuristic, but it is a generic cost model that can be used in
380     /// other places as well (e.g., in loop unrolling).
381     unsigned N = SI.getNumCases();
382     const TargetLoweringBase *TLI = getTLI();
383     const DataLayout &DL = this->getDataLayout();
384 
385     JumpTableSize = 0;
386     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
387 
388     // Early exit if neither a jump table nor a bit test is possible.
389     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
390       return N;
391 
392     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
393     APInt MinCaseVal = MaxCaseVal;
394     for (auto CI : SI.cases()) {
395       const APInt &CaseVal = CI.getCaseValue()->getValue();
396       if (CaseVal.sgt(MaxCaseVal))
397         MaxCaseVal = CaseVal;
398       if (CaseVal.slt(MinCaseVal))
399         MinCaseVal = CaseVal;
400     }
401 
402     // Check if suitable for a bit test
403     if (N <= DL.getIndexSizeInBits(0u)) {
404       SmallPtrSet<const BasicBlock *, 4> Dests;
405       for (auto I : SI.cases())
406         Dests.insert(I.getCaseSuccessor());
407 
408       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
409                                      DL))
410         return 1;
411     }
412 
413     // Check if suitable for a jump table.
414     if (IsJTAllowed) {
415       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
416         return N;
417       uint64_t Range =
418           (MaxCaseVal - MinCaseVal)
419               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
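      // E.g. case values {2, 5, 9} give Range = 9 - 2 + 1 = 8.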
420       // Check whether a range of clusters is dense enough for a jump table
421       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
422         JumpTableSize = Range;
423         return 1;
424       }
425     }
426     return N;
427   }
428 
429   bool shouldBuildLookupTables() {
430     const TargetLoweringBase *TLI = getTLI();
431     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
432            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
433   }
434 
435   bool shouldBuildRelLookupTables() const {
436     const TargetMachine &TM = getTLI()->getTargetMachine();
437     // If not in PIC mode, do not generate a relative lookup table.
438     if (!TM.isPositionIndependent())
439       return false;
440 
441     /// Relative lookup table entries consist of 32-bit offsets.
442     /// Do not generate relative lookup tables for large code models
443     /// in 64-bit architectures where 32-bit offsets might not be enough.
444     if (TM.getCodeModel() == CodeModel::Medium ||
445         TM.getCodeModel() == CodeModel::Large)
446       return false;
447 
448     Triple TargetTriple = TM.getTargetTriple();
449     if (!TargetTriple.isArch64Bit())
450       return false;
451 
452     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
453     // there.
454     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
455       return false;
456 
457     return true;
458   }
459 
460   bool haveFastSqrt(Type *Ty) {
461     const TargetLoweringBase *TLI = getTLI();
462     EVT VT = TLI->getValueType(DL, Ty);
463     return TLI->isTypeLegal(VT) &&
464            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
465   }
466 
467   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
468     return true;
469   }
470 
471   InstructionCost getFPOpCost(Type *Ty) {
472     // Check whether FADD is available, as a proxy for floating-point in
473     // general.
474     const TargetLoweringBase *TLI = getTLI();
475     EVT VT = TLI->getValueType(DL, Ty);
476     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
477       return TargetTransformInfo::TCC_Basic;
478     return TargetTransformInfo::TCC_Expensive;
479   }
480 
481   unsigned getInliningThresholdMultiplier() { return 1; }
482   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
483 
484   int getInlinerVectorBonusPercent() { return 150; }
485 
486   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
487                                TTI::UnrollingPreferences &UP) {
488     // This unrolling functionality is target independent, but to provide some
489     // motivation for its intended use, for x86:
490 
491     // According to the Intel 64 and IA-32 Architectures Optimization Reference
492     // Manual, Intel Core models and later have a loop stream detector (and
493     // associated uop queue) that can benefit from partial unrolling.
494     // The relevant requirements are:
495     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
496     //    taken, and none of them may be calls.
497     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
498 
499     // According to the Software Optimization Guide for AMD Family 15h
500     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
501     // and loop buffer which can benefit from partial unrolling.
502     // The relevant requirements are:
503     //  - The loop must have fewer than 16 branches
504     //  - The loop must have less than 40 uops in all executed loop branches
505 
506     // The number of taken branches in a loop is hard to estimate here, and
507     // benchmarking has revealed that it is better not to be conservative when
508     // estimating the branch count. As a result, we'll ignore the branch limits
509     // until someone finds a case where it matters in practice.
510 
511     unsigned MaxOps;
512     const TargetSubtargetInfo *ST = getST();
513     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
514       MaxOps = PartialUnrollingThreshold;
515     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
516       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
517     else
518       return;
519 
520     // Scan the loop: don't unroll loops with calls.
521     for (BasicBlock *BB : L->blocks()) {
522       for (Instruction &I : *BB) {
523         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
524           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
525             if (!thisT()->isLoweredToCall(F))
526               continue;
527           }
528 
529           return;
530         }
531       }
532     }
533 
534     // Enable runtime and partial unrolling up to the specified size.
535     // Enable using trip count upper bound to unroll loops.
536     UP.Partial = UP.Runtime = UP.UpperBound = true;
537     UP.PartialThreshold = MaxOps;
538 
539     // Avoid unrolling when optimizing for size.
540     UP.OptSizeThreshold = 0;
541     UP.PartialOptSizeThreshold = 0;
542 
543     // Set the number of instructions optimized away when the "back edge"
544     // becomes a "fall through" to its default value of 2.
545     UP.BEInsns = 2;
546   }
547 
548   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
549                              TTI::PeelingPreferences &PP) {
550     PP.PeelCount = 0;
551     PP.AllowPeeling = true;
552     PP.AllowLoopNestsPeeling = false;
553     PP.PeelProfiledIterations = true;
554   }
555 
556   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
557                                 AssumptionCache &AC,
558                                 TargetLibraryInfo *LibInfo,
559                                 HardwareLoopInfo &HWLoopInfo) {
560     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
561   }
562 
563   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
564                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
565                                    DominatorTree *DT,
566                                    const LoopAccessInfo *LAI) {
567     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
568   }
569 
570   bool emitGetActiveLaneMask() {
571     return BaseT::emitGetActiveLaneMask();
572   }
573 
574   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
575                                                IntrinsicInst &II) {
576     return BaseT::instCombineIntrinsic(IC, II);
577   }
578 
579   Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
580                                                      IntrinsicInst &II,
581                                                      APInt DemandedMask,
582                                                      KnownBits &Known,
583                                                      bool &KnownBitsComputed) {
584     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
585                                                    KnownBitsComputed);
586   }
587 
588   Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
589       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
590       APInt &UndefElts2, APInt &UndefElts3,
591       std::function<void(Instruction *, unsigned, APInt, APInt &)>
592           SimplifyAndSetOp) {
593     return BaseT::simplifyDemandedVectorEltsIntrinsic(
594         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
595         SimplifyAndSetOp);
596   }
597 
598   InstructionCost getInstructionLatency(const Instruction *I) {
599     if (isa<LoadInst>(I))
600       return getST()->getSchedModel().DefaultLoadLatency;
601 
602     return BaseT::getInstructionLatency(I);
603   }
604 
605   virtual Optional<unsigned>
606   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
607     return Optional<unsigned>(
608       getST()->getCacheSize(static_cast<unsigned>(Level)));
609   }
610 
611   virtual Optional<unsigned>
612   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
613     Optional<unsigned> TargetResult =
614         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
615 
616     if (TargetResult)
617       return TargetResult;
618 
619     return BaseT::getCacheAssociativity(Level);
620   }
621 
622   virtual unsigned getCacheLineSize() const {
623     return getST()->getCacheLineSize();
624   }
625 
626   virtual unsigned getPrefetchDistance() const {
627     return getST()->getPrefetchDistance();
628   }
629 
630   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
631                                         unsigned NumStridedMemAccesses,
632                                         unsigned NumPrefetches,
633                                         bool HasCall) const {
634     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
635                                          NumPrefetches, HasCall);
636   }
637 
638   virtual unsigned getMaxPrefetchIterationsAhead() const {
639     return getST()->getMaxPrefetchIterationsAhead();
640   }
641 
642   virtual bool enableWritePrefetching() const {
643     return getST()->enableWritePrefetching();
644   }
645 
646   /// @}
647 
648   /// \name Vector TTI Implementations
649   /// @{
650 
651   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
652     return TypeSize::getFixed(32);
653   }
654 
655   Optional<unsigned> getMaxVScale() const { return None; }
656 
657   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
658   /// are set if the demanded result elements need to be inserted and/or
659   /// extracted from vectors.
660   InstructionCost getScalarizationOverhead(VectorType *InTy,
661                                            const APInt &DemandedElts,
662                                            bool Insert, bool Extract) {
663     /// FIXME: a bitfield is not a reasonable abstraction for talking about
664     /// which elements are needed from a scalable vector
665     auto *Ty = cast<FixedVectorType>(InTy);
666 
667     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
668            "Vector size mismatch");
669 
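    // E.g. for a <4 x i32> vector with DemandedElts = 0b0101 and Insert = true,
    // Extract = false, only the insertelement costs for lanes 0 and 2 are
    // accumulated below.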
670     InstructionCost Cost = 0;
671 
672     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
673       if (!DemandedElts[i])
674         continue;
675       if (Insert)
676         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
677       if (Extract)
678         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
679     }
680 
681     return Cost;
682   }
683 
684   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
685   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
686                                            bool Extract) {
687     auto *Ty = cast<FixedVectorType>(InTy);
688 
689     APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
690     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
691   }
692 
693   /// Estimate the overhead of scalarizing an instruction's unique
694   /// non-constant operands. The (potentially vector) types to use for each
695   /// argument are passed via Tys.
696   InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
697                                                    ArrayRef<Type *> Tys) {
698     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
699 
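    // E.g. for a call like foo(%v, %v, 7) where %v is a vector, only the single
    // unique non-constant vector operand %v contributes an extract-per-element
    // cost; the repeated use and the constant are skipped below.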
700     InstructionCost Cost = 0;
701     SmallPtrSet<const Value*, 4> UniqueOperands;
702     for (int I = 0, E = Args.size(); I != E; I++) {
703       // Disregard things like metadata arguments.
704       const Value *A = Args[I];
705       Type *Ty = Tys[I];
706       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
707           !Ty->isPtrOrPtrVectorTy())
708         continue;
709 
710       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
711         if (auto *VecTy = dyn_cast<VectorType>(Ty))
712           Cost += getScalarizationOverhead(VecTy, false, true);
713       }
714     }
715 
716     return Cost;
717   }
718 
719   /// Estimate the overhead of scalarizing the inputs and outputs of an
720   /// instruction, with return type RetTy and arguments Args of type Tys. If
721   /// Args are unknown (empty), then the cost associated with one argument is
722   /// added as a heuristic.
723   InstructionCost getScalarizationOverhead(VectorType *RetTy,
724                                            ArrayRef<const Value *> Args,
725                                            ArrayRef<Type *> Tys) {
726     InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
727     if (!Args.empty())
728       Cost += getOperandsScalarizationOverhead(Args, Tys);
729     else
730       // When no information on arguments is provided, we add the cost
731       // associated with one argument as a heuristic.
732       Cost += getScalarizationOverhead(RetTy, false, true);
733 
734     return Cost;
735   }
736 
737   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
738 
739   InstructionCost getArithmeticInstrCost(
740       unsigned Opcode, Type *Ty,
741       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
742       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
743       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
744       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
745       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
746       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
747       const Instruction *CxtI = nullptr) {
748     // Check if any of the operands are vector operands.
749     const TargetLoweringBase *TLI = getTLI();
750     int ISD = TLI->InstructionOpcodeToISD(Opcode);
751     assert(ISD && "Invalid opcode");
752 
753     // TODO: Handle more cost kinds.
754     if (CostKind != TTI::TCK_RecipThroughput)
755       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
756                                            Opd1Info, Opd2Info,
757                                            Opd1PropInfo, Opd2PropInfo,
758                                            Args, CxtI);
759 
760     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
761 
762     bool IsFloat = Ty->isFPOrFPVectorTy();
763     // Assume that floating point arithmetic operations cost twice as much as
764     // integer operations.
765     InstructionCost OpCost = (IsFloat ? 2 : 1);
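    // E.g. on a hypothetical target where v4i32 is the widest legal vector, a
    // legal i32 add is roughly 1, a legal float fadd roughly 2, and an
    // <8 x i32> add that is legalized into two v4i32 ops costs
    // LT.first * OpCost = 2 * 1 = 2.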
766 
767     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
768       // The operation is legal. Assume it costs 1.
769       // TODO: Once we have extract/insert subvector cost we need to use them.
770       return LT.first * OpCost;
771     }
772 
773     if (!TLI->isOperationExpand(ISD, LT.second)) {
774       // If the operation is custom lowered, then assume that the code is twice
775       // as expensive.
776       return LT.first * 2 * OpCost;
777     }
778 
779     // Else, assume that we need to scalarize this op.
780     // TODO: If one of the types gets legalized by splitting, handle this
781     // similarly to what getCastInstrCost() does.
782     if (auto *VTy = dyn_cast<VectorType>(Ty)) {
783       unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
784       InstructionCost Cost = thisT()->getArithmeticInstrCost(
785           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
786           Opd1PropInfo, Opd2PropInfo, Args, CxtI);
787       // Return the cost of multiple scalar invocations plus the cost of
788       // inserting and extracting the values.
789       SmallVector<Type *> Tys(Args.size(), Ty);
790       return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
791     }
792 
793     // We don't know anything about this scalar instruction.
794     return OpCost;
795   }
796 
797   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
798                                               ArrayRef<int> Mask) const {
799     int Limit = Mask.size() * 2;
800     if (Mask.empty() ||
801         // Extra check required by isSingleSourceMaskImpl function (called by
802         // ShuffleVectorInst::isSingleSourceMask).
803         any_of(Mask, [Limit](int I) { return I >= Limit; }))
804       return Kind;
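    // E.g. a SK_PermuteSingleSrc mask of <3,2,1,0> is really a reverse and a
    // mask of <0,0,0,0> is a broadcast; refining the kind lets targets report
    // the cheaper cost for those patterns.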
805     switch (Kind) {
806     case TTI::SK_PermuteSingleSrc:
807       if (ShuffleVectorInst::isReverseMask(Mask))
808         return TTI::SK_Reverse;
809       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
810         return TTI::SK_Broadcast;
811       break;
812     case TTI::SK_PermuteTwoSrc:
813       if (ShuffleVectorInst::isSelectMask(Mask))
814         return TTI::SK_Select;
815       if (ShuffleVectorInst::isTransposeMask(Mask))
816         return TTI::SK_Transpose;
817       break;
818     case TTI::SK_Select:
819     case TTI::SK_Reverse:
820     case TTI::SK_Broadcast:
821     case TTI::SK_Transpose:
822     case TTI::SK_InsertSubvector:
823     case TTI::SK_ExtractSubvector:
824       break;
825     }
826     return Kind;
827   }
828 
829   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
830                                  ArrayRef<int> Mask, int Index,
831                                  VectorType *SubTp) {
832 
833     switch (improveShuffleKindFromMask(Kind, Mask)) {
834     case TTI::SK_Broadcast:
835       return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
836     case TTI::SK_Select:
837     case TTI::SK_Reverse:
838     case TTI::SK_Transpose:
839     case TTI::SK_PermuteSingleSrc:
840     case TTI::SK_PermuteTwoSrc:
841       return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
842     case TTI::SK_ExtractSubvector:
843       return getExtractSubvectorOverhead(Tp, Index,
844                                          cast<FixedVectorType>(SubTp));
845     case TTI::SK_InsertSubvector:
846       return getInsertSubvectorOverhead(Tp, Index,
847                                         cast<FixedVectorType>(SubTp));
848     }
849     llvm_unreachable("Unknown TTI::ShuffleKind");
850   }
851 
852   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
853                                    TTI::CastContextHint CCH,
854                                    TTI::TargetCostKind CostKind,
855                                    const Instruction *I = nullptr) {
856     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
857       return 0;
858 
859     const TargetLoweringBase *TLI = getTLI();
860     int ISD = TLI->InstructionOpcodeToISD(Opcode);
861     assert(ISD && "Invalid opcode");
862     std::pair<InstructionCost, MVT> SrcLT =
863         TLI->getTypeLegalizationCost(DL, Src);
864     std::pair<InstructionCost, MVT> DstLT =
865         TLI->getTypeLegalizationCost(DL, Dst);
866 
867     TypeSize SrcSize = SrcLT.second.getSizeInBits();
868     TypeSize DstSize = DstLT.second.getSizeInBits();
869     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
870     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
871 
872     switch (Opcode) {
873     default:
874       break;
875     case Instruction::Trunc:
876       // Check for NOOP conversions.
877       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
878         return 0;
879       LLVM_FALLTHROUGH;
880     case Instruction::BitCast:
881       // Bitcasts between types that are legalized to the same type are free, and
882       // we assume an int to/from ptr cast of the same size is also free.
883       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
884           SrcSize == DstSize)
885         return 0;
886       break;
887     case Instruction::FPExt:
888       if (I && getTLI()->isExtFree(I))
889         return 0;
890       break;
891     case Instruction::ZExt:
892       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
893         return 0;
894       LLVM_FALLTHROUGH;
895     case Instruction::SExt:
896       if (I && getTLI()->isExtFree(I))
897         return 0;
898 
899       // If this is a zext/sext of a load, return 0 if the corresponding
900       // extending load exists on the target and the result type is legal.
901       if (CCH == TTI::CastContextHint::Normal) {
902         EVT ExtVT = EVT::getEVT(Dst);
903         EVT LoadVT = EVT::getEVT(Src);
904         unsigned LType =
905           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
906         if (DstLT.first == SrcLT.first &&
907             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
908           return 0;
909       }
910       break;
911     case Instruction::AddrSpaceCast:
912       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
913                                    Dst->getPointerAddressSpace()))
914         return 0;
915       break;
916     }
917 
918     auto *SrcVTy = dyn_cast<VectorType>(Src);
919     auto *DstVTy = dyn_cast<VectorType>(Dst);
920 
921     // If the cast is marked as legal (or promote) then assume low cost.
922     if (SrcLT.first == DstLT.first &&
923         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
924       return SrcLT.first;
925 
926     // Handle scalar conversions.
927     if (!SrcVTy && !DstVTy) {
928       // Just check the op cost. If the operation is legal then assume it costs
929       // 1.
930       if (!TLI->isOperationExpand(ISD, DstLT.second))
931         return 1;
932 
933       // Assume that illegal scalar instructions are expensive.
934       return 4;
935     }
936 
937     // Check vector-to-vector casts.
938     if (DstVTy && SrcVTy) {
939       // If the cast is between same-sized registers, then the check is simple.
940       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
941 
942         // Assume that Zext is done using AND.
943         if (Opcode == Instruction::ZExt)
944           return SrcLT.first;
945 
946         // Assume that sext is done using SHL and SRA.
947         if (Opcode == Instruction::SExt)
948           return SrcLT.first * 2;
949 
950         // Just check the op cost. If the operation is legal then assume it
951         // costs 1 and multiply by the type-legalization overhead.
953         if (!TLI->isOperationExpand(ISD, DstLT.second))
954           return SrcLT.first * 1;
955       }
956 
957       // If we are legalizing by splitting, query the concrete TTI for the cost
958       // of casting the original vector twice. We also need to factor in the
959       // cost of the split itself. Count that as 1, to be consistent with
960       // TLI->getTypeLegalizationCost().
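      // E.g. on a hypothetical target with 128-bit vectors where <8 x i16> is
      // legal but <8 x i32> must be split, a sext from <8 x i16> to <8 x i32>
      // is costed as one split (counted as 1, per the comment above) plus twice
      // the cost of the <4 x i16> -> <4 x i32> cast.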
961       bool SplitSrc =
962           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
963           TargetLowering::TypeSplitVector;
964       bool SplitDst =
965           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
966           TargetLowering::TypeSplitVector;
967       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
968           DstVTy->getElementCount().isVector()) {
969         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
970         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
971         T *TTI = static_cast<T *>(this);
972         // If both types need to be split then the split is free.
973         InstructionCost SplitCost =
974             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
975         return SplitCost +
976                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
977                                           CostKind, I));
978       }
979 
980       // In other cases where the source or destination are illegal, assume
981       // the operation will get scalarized.
982       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
983       InstructionCost Cost = thisT()->getCastInstrCost(
984           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
985 
986       // Return the cost of multiple scalar invocation plus the cost of
987       // Return the cost of multiple scalar invocations plus the cost of
988       return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
989     }
990 
991     // We already handled vector-to-vector and scalar-to-scalar conversions.
992     // This is where we handle bitcasts between vectors and scalars. We need to
993     // assume that the conversion is scalarized in one way or another.
995     if (Opcode == Instruction::BitCast) {
996       // Illegal bitcasts are done by storing and loading from a stack slot.
997       return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
998              (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
999     }
1000 
1001     llvm_unreachable("Unhandled cast");
1002   }
1003 
1004   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1005                                            VectorType *VecTy, unsigned Index) {
1006     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1007                                        Index) +
1008            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1009                                      TTI::CastContextHint::None,
1010                                      TTI::TCK_RecipThroughput);
1011   }
1012 
1013   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1014                                  const Instruction *I = nullptr) {
1015     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1016   }
1017 
1018   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1019                                      CmpInst::Predicate VecPred,
1020                                      TTI::TargetCostKind CostKind,
1021                                      const Instruction *I = nullptr) {
1022     const TargetLoweringBase *TLI = getTLI();
1023     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1024     assert(ISD && "Invalid opcode");
1025 
1026     // TODO: Handle other cost kinds.
1027     if (CostKind != TTI::TCK_RecipThroughput)
1028       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1029                                        I);
1030 
1031     // Selects on vectors are actually vector selects.
1032     if (ISD == ISD::SELECT) {
1033       assert(CondTy && "CondTy must exist");
1034       if (CondTy->isVectorTy())
1035         ISD = ISD::VSELECT;
1036     }
1037     std::pair<InstructionCost, MVT> LT =
1038         TLI->getTypeLegalizationCost(DL, ValTy);
1039 
1040     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1041         !TLI->isOperationExpand(ISD, LT.second)) {
1042       // The operation is legal. Assume it costs 1. Multiply
1043       // by the type-legalization overhead.
1044       return LT.first * 1;
1045     }
1046 
1047     // Otherwise, assume that the operation is scalarized.
1048     // TODO: If one of the types gets legalized by splitting, handle this
1049     // similarly to what getCastInstrCost() does.
1050     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1051       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1052       if (CondTy)
1053         CondTy = CondTy->getScalarType();
1054       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1055           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1056 
1057       // Return the cost of multiple scalar invocations plus the cost of
1058       // inserting and extracting the values.
1059       return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1060     }
1061 
1062     // Unknown scalar opcode.
1063     return 1;
1064   }
1065 
1066   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1067                                      unsigned Index) {
1068     std::pair<InstructionCost, MVT> LT =
1069         getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1070 
1071     return LT.first;
1072   }
1073 
1074   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1075                                   MaybeAlign Alignment, unsigned AddressSpace,
1076                                   TTI::TargetCostKind CostKind,
1077                                   const Instruction *I = nullptr) {
1078     assert(!Src->isVoidTy() && "Invalid type");
1079     // Assume types, such as structs, are expensive.
1080     if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1081       return 4;
1082     std::pair<InstructionCost, MVT> LT =
1083         getTLI()->getTypeLegalizationCost(DL, Src);
1084 
1085     // Assuming that all loads of legal types cost 1.
1086     InstructionCost Cost = LT.first;
1087     if (CostKind != TTI::TCK_RecipThroughput)
1088       return Cost;
1089 
1090     if (Src->isVectorTy() &&
1091         // In practice it's not currently possible to have a change in lane
1092         // length for extending loads or truncating stores so both types should
1093         // have the same scalable property.
1094         TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1095                             LT.second.getSizeInBits())) {
1096       // This is a vector load that legalizes to a larger type than the vector
1097       // itself. Unless the corresponding extending load or truncating store is
1098       // legal, then this will scalarize.
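      // E.g. a load of <4 x i8> on a hypothetical target that legalizes it to
      // v4i32: unless an extending vector load from v4i8 is legal or custom,
      // the estimate below adds the cost of inserting the four loaded elements
      // back into a vector.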
1099       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1100       EVT MemVT = getTLI()->getValueType(DL, Src);
1101       if (Opcode == Instruction::Store)
1102         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1103       else
1104         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1105 
1106       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1107         // This is a vector load/store for some illegal type that is scalarized.
1108         // We must account for the cost of building or decomposing the vector.
1109         Cost += getScalarizationOverhead(cast<VectorType>(Src),
1110                                          Opcode != Instruction::Store,
1111                                          Opcode == Instruction::Store);
1112       }
1113     }
1114 
1115     return Cost;
1116   }
1117 
1118   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1119                                         Align Alignment, unsigned AddressSpace,
1120                                         TTI::TargetCostKind CostKind) {
1121     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1122                                        CostKind);
1123   }
1124 
1125   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1126                                          const Value *Ptr, bool VariableMask,
1127                                          Align Alignment,
1128                                          TTI::TargetCostKind CostKind,
1129                                          const Instruction *I = nullptr) {
1130     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1131                                        true, CostKind);
1132   }
1133 
1134   InstructionCost getInterleavedMemoryOpCost(
1135       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1136       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1137       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1138     auto *VT = cast<FixedVectorType>(VecTy);
1139 
1140     unsigned NumElts = VT->getNumElements();
1141     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1142 
1143     unsigned NumSubElts = NumElts / Factor;
1144     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1145 
1146     // First, the cost of the load/store operation.
1147     InstructionCost Cost;
1148     if (UseMaskForCond || UseMaskForGaps)
1149       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1150                                             AddressSpace, CostKind);
1151     else
1152       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1153                                       CostKind);
1154 
1155     // Legalize the vector type, and get the legalized and unlegalized type
1156     // sizes.
1157     MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1158     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1159     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1160 
1161     // Scale the cost of the memory operation by the fraction of legalized
1162     // instructions that will actually be used. We shouldn't account for the
1163     // cost of dead instructions since they will be removed.
1164     //
1165     // E.g., An interleaved load of factor 8:
1166     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1167     //       %v0 = shufflevector %vec, undef, <0, 8>
1168     //
1169     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1170     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1171     // type). The other loads are unused.
1172     //
1173     // We only scale the cost of loads since interleaved store groups aren't
1174     // allowed to have gaps.
1175     if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
1176       // The number of loads of a legal type it will take to represent a load
1177       // of the unlegalized vector type.
1178       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1179 
1180       // The number of elements of the unlegalized type that correspond to a
1181       // single legal instruction.
1182       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1183 
1184       // Determine which legal instructions will be used.
1185       BitVector UsedInsts(NumLegalInsts, false);
1186       for (unsigned Index : Indices)
1187         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1188           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1189 
1190       // Scale the cost of the load by the fraction of legal instructions that
1191       // will be used.
1192       Cost *= UsedInsts.count() / NumLegalInsts;
1193     }
1194 
1195     // Then add the cost of the interleave operation.
1196     if (Opcode == Instruction::Load) {
1197       // The interleave cost is similar to extracting the sub vectors' elements
1198       // from the wide vector and inserting them into the sub vectors.
1199       //
1200       // E.g. An interleaved load of factor 2 (with one member of index 0):
1201       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1202       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1203       // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1204       // <8 x i32> vector and inserting them into a <4 x i32> vector.
1205 
1206       assert(Indices.size() <= Factor &&
1207              "Interleaved memory op has too many members");
1208 
1209       for (unsigned Index : Indices) {
1210         assert(Index < Factor && "Invalid index for interleaved memory op");
1211 
1212         // Extract elements from loaded vector for each sub vector.
1213         for (unsigned i = 0; i < NumSubElts; i++)
1214           Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1215                                               Index + i * Factor);
1216       }
1217 
1218       InstructionCost InsSubCost = 0;
1219       for (unsigned i = 0; i < NumSubElts; i++)
1220         InsSubCost +=
1221             thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1222 
1223       Cost += Indices.size() * InsSubCost;
1224     } else {
1225       // The interleave cost is extracting all elements from the sub vectors
1226       // and inserting them into the wide vector.
1227       //
1228       // E.g. An interleaved store of factor 2:
1229       //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1230       //      store <8 x i32> %v0_v1, <8 x i32>* %ptr
1231       // The cost is estimated as extracting all elements from both <4 x i32>
1232       // vectors and inserting them into the <8 x i32> vector.
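           // For this factor-2 store the modeled cost is therefore 2 * 4
           // extractelement operations (every element of both sub vectors)
           // plus 8 insertelement operations into the wide <8 x i32> vector.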
1233 
1234       InstructionCost ExtSubCost = 0;
1235       for (unsigned i = 0; i < NumSubElts; i++)
1236         ExtSubCost +=
1237             thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1238       Cost += ExtSubCost * Factor;
1239 
1240       for (unsigned i = 0; i < NumElts; i++)
1241         Cost +=
1242             thisT()->getVectorInstrCost(Instruction::InsertElement, VT, i);
1243     }
1244 
1245     if (!UseMaskForCond)
1246       return Cost;
1247 
1248     Type *I8Type = Type::getInt8Ty(VT->getContext());
1249     auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1250     SubVT = FixedVectorType::get(I8Type, NumSubElts);
1251 
1252     // The Mask shuffling cost is extracting all the elements of the Mask
1253     // and inserting each of them Factor times into the wide vector:
1254     //
1255     // E.g. an interleaved group with factor 3:
1256     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1257     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1258     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1259     // The cost is estimated as extracting all mask elements from the <8 x i1>
1260     // mask vector and inserting them Factor times into the <24 x i1> shuffled
1261     // mask vector.
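         // For the factor-3 example that is 8 extractelement operations from
         // the mask sub vector plus 24 insertelement operations into the
         // interleaved mask (the code below prices these on i8 vector types
         // rather than on i1).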
1262     for (unsigned i = 0; i < NumSubElts; i++)
1263       Cost +=
1264           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1265 
1266     for (unsigned i = 0; i < NumElts; i++)
1267       Cost +=
1268           thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1269 
1270     // The Gaps mask is invariant and created outside the loop, therefore the
1271     // cost of creating it is not accounted for here. However if we have both
1272     // a MaskForGaps and some other mask that guards the execution of the
1273     // memory access, we need to account for the cost of And-ing the two masks
1274     // inside the loop.
1275     if (UseMaskForGaps)
1276       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1277                                               CostKind);
1278 
1279     return Cost;
1280   }
1281 
1282   /// Get intrinsic cost based on arguments.
1283   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1284                                         TTI::TargetCostKind CostKind) {
1285     // Check for generically free intrinsics.
1286     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1287       return 0;
1288 
1289     // Assume that target intrinsics are cheap.
1290     Intrinsic::ID IID = ICA.getID();
1291     if (Function::isTargetIntrinsic(IID))
1292       return TargetTransformInfo::TCC_Basic;
1293 
1294     if (ICA.isTypeBasedOnly())
1295       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1296 
1297     Type *RetTy = ICA.getReturnType();
1298 
1299     ElementCount RetVF =
1300         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1301                              : ElementCount::getFixed(1));
1302     const IntrinsicInst *I = ICA.getInst();
1303     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1304     FastMathFlags FMF = ICA.getFlags();
1305     switch (IID) {
1306     default:
1307       break;
1308 
1309     case Intrinsic::cttz:
1310       // FIXME: If necessary, this should go in target-specific overrides.
1311       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1312         return TargetTransformInfo::TCC_Basic;
1313       break;
1314 
1315     case Intrinsic::ctlz:
1316       // FIXME: If necessary, this should go in target-specific overrides.
1317       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1318         return TargetTransformInfo::TCC_Basic;
1319       break;
1320 
1321     case Intrinsic::memcpy:
1322       return thisT()->getMemcpyCost(ICA.getInst());
1323 
1324     case Intrinsic::masked_scatter: {
1325       const Value *Mask = Args[3];
1326       bool VarMask = !isa<Constant>(Mask);
1327       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1328       return thisT()->getGatherScatterOpCost(Instruction::Store,
1329                                              ICA.getArgTypes()[0], Args[1],
1330                                              VarMask, Alignment, CostKind, I);
1331     }
1332     case Intrinsic::masked_gather: {
1333       const Value *Mask = Args[2];
1334       bool VarMask = !isa<Constant>(Mask);
1335       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1336       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1337                                              VarMask, Alignment, CostKind, I);
1338     }
1339     case Intrinsic::experimental_stepvector: {
1340       if (isa<ScalableVectorType>(RetTy))
1341         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1342       // The cost of materialising a constant integer vector.
1343       return TargetTransformInfo::TCC_Basic;
1344     }
1345     case Intrinsic::experimental_vector_extract: {
1346       // FIXME: Handle case where a scalable vector is extracted from a scalable
1347       // vector
1348       if (isa<ScalableVectorType>(RetTy))
1349         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1350       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1351       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1352                                      cast<VectorType>(Args[0]->getType()), None,
1353                                      Index, cast<VectorType>(RetTy));
1354     }
1355     case Intrinsic::experimental_vector_insert: {
1356       // FIXME: Handle case where a scalable vector is inserted into a scalable
1357       // vector
1358       if (isa<ScalableVectorType>(Args[1]->getType()))
1359         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1360       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1361       return thisT()->getShuffleCost(
1362           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1363           Index, cast<VectorType>(Args[1]->getType()));
1364     }
1365     case Intrinsic::experimental_vector_reverse: {
1366       return thisT()->getShuffleCost(TTI::SK_Reverse,
1367                                      cast<VectorType>(Args[0]->getType()), None,
1368                                      0, cast<VectorType>(RetTy));
1369     }
1370     case Intrinsic::vector_reduce_add:
1371     case Intrinsic::vector_reduce_mul:
1372     case Intrinsic::vector_reduce_and:
1373     case Intrinsic::vector_reduce_or:
1374     case Intrinsic::vector_reduce_xor:
1375     case Intrinsic::vector_reduce_smax:
1376     case Intrinsic::vector_reduce_smin:
1377     case Intrinsic::vector_reduce_fmax:
1378     case Intrinsic::vector_reduce_fmin:
1379     case Intrinsic::vector_reduce_umax:
1380     case Intrinsic::vector_reduce_umin: {
1381       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1382       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1383     }
1384     case Intrinsic::vector_reduce_fadd:
1385     case Intrinsic::vector_reduce_fmul: {
1386       IntrinsicCostAttributes Attrs(
1387           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1388       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1389     }
1390     case Intrinsic::fshl:
1391     case Intrinsic::fshr: {
1392       if (isa<ScalableVectorType>(RetTy))
1393         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1394       const Value *X = Args[0];
1395       const Value *Y = Args[1];
1396       const Value *Z = Args[2];
1397       TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1398       TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1399       TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1400       TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1401       TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1402       OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1403                                                               : TTI::OP_None;
1404       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1405       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
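           // The expansion is therefore priced as an or, a sub, a shl and a
           // lshr, plus a urem when the shift amount is not constant, plus an
           // icmp/select pair guarding shift-by-zero when this is not a
           // rotate (X != Y).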
1406       InstructionCost Cost = 0;
1407       Cost +=
1408           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1409       Cost +=
1410           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1411       Cost += thisT()->getArithmeticInstrCost(
1412           BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1413       Cost += thisT()->getArithmeticInstrCost(
1414           BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1415       // Non-constant shift amounts require a modulo.
1416       if (OpKindZ != TTI::OK_UniformConstantValue &&
1417           OpKindZ != TTI::OK_NonUniformConstantValue)
1418         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1419                                                 CostKind, OpKindZ, OpKindBW,
1420                                                 OpPropsZ, OpPropsBW);
1421       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1422       if (X != Y) {
1423         Type *CondTy = RetTy->getWithNewBitWidth(1);
1424         Cost +=
1425             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1426                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
1427         Cost +=
1428             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1429                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
1430       }
1431       return Cost;
1432     }
1433     }
1434 
1435     // Assume that we need to scalarize this intrinsic.
1436     // Compute the scalarization overhead based on Args for a vector
1437     // intrinsic.
1438     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1439     if (RetVF.isVector() && !RetVF.isScalable()) {
1440       ScalarizationCost = 0;
1441       if (!RetTy->isVoidTy())
1442         ScalarizationCost +=
1443             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1444       ScalarizationCost +=
1445           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1446     }
1447 
1448     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1449                                   ScalarizationCost);
1450     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1451   }
1452 
1453   /// Get intrinsic cost based on argument types.
1454   /// If ScalarizationCostPassed is not valid, the
1455   /// cost of scalarizing the arguments and the return value will be computed
1456   /// based on types.
1457   InstructionCost
1458   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1459                                  TTI::TargetCostKind CostKind) {
1460     Intrinsic::ID IID = ICA.getID();
1461     Type *RetTy = ICA.getReturnType();
1462     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1463     FastMathFlags FMF = ICA.getFlags();
1464     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1465     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1466 
1467     VectorType *VecOpTy = nullptr;
1468     if (!Tys.empty()) {
1469       // The vector reduction operand is operand 0 except for fadd/fmul.
1470       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1471       unsigned VecTyIndex = 0;
1472       if (IID == Intrinsic::vector_reduce_fadd ||
1473           IID == Intrinsic::vector_reduce_fmul)
1474         VecTyIndex = 1;
1475       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1476       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1477     }
1478 
1479     // Library call cost - other than size, make it expensive.
1480     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1481     SmallVector<unsigned, 2> ISDs;
1482     switch (IID) {
1483     default: {
1484       // Scalable vectors cannot be scalarized, so return Invalid.
1485       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1486             return isa<ScalableVectorType>(Ty);
1487           }))
1488         return InstructionCost::getInvalid();
1489 
1490       // Assume that we need to scalarize this intrinsic.
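           // E.g. a <4 x float> intrinsic with <4 x float> operands is priced
           // as 4 scalar calls plus the overhead of extracting each operand
           // lane and inserting each result lane.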
1491       InstructionCost ScalarizationCost =
1492           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1493       unsigned ScalarCalls = 1;
1494       Type *ScalarRetTy = RetTy;
1495       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1496         if (!SkipScalarizationCost)
1497           ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1498         ScalarCalls = std::max(ScalarCalls,
1499                                cast<FixedVectorType>(RetVTy)->getNumElements());
1500         ScalarRetTy = RetTy->getScalarType();
1501       }
1502       SmallVector<Type *, 4> ScalarTys;
1503       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1504         Type *Ty = Tys[i];
1505         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1506           if (!SkipScalarizationCost)
1507             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1508           ScalarCalls = std::max(ScalarCalls,
1509                                  cast<FixedVectorType>(VTy)->getNumElements());
1510           Ty = Ty->getScalarType();
1511         }
1512         ScalarTys.push_back(Ty);
1513       }
1514       if (ScalarCalls == 1)
1515         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1516 
1517       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1518       InstructionCost ScalarCost =
1519           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1520 
1521       return ScalarCalls * ScalarCost + ScalarizationCost;
1522     }
1523     // Look for intrinsics that can be lowered directly or turned into a scalar
1524     // intrinsic call.
1525     case Intrinsic::sqrt:
1526       ISDs.push_back(ISD::FSQRT);
1527       break;
1528     case Intrinsic::sin:
1529       ISDs.push_back(ISD::FSIN);
1530       break;
1531     case Intrinsic::cos:
1532       ISDs.push_back(ISD::FCOS);
1533       break;
1534     case Intrinsic::exp:
1535       ISDs.push_back(ISD::FEXP);
1536       break;
1537     case Intrinsic::exp2:
1538       ISDs.push_back(ISD::FEXP2);
1539       break;
1540     case Intrinsic::log:
1541       ISDs.push_back(ISD::FLOG);
1542       break;
1543     case Intrinsic::log10:
1544       ISDs.push_back(ISD::FLOG10);
1545       break;
1546     case Intrinsic::log2:
1547       ISDs.push_back(ISD::FLOG2);
1548       break;
1549     case Intrinsic::fabs:
1550       ISDs.push_back(ISD::FABS);
1551       break;
1552     case Intrinsic::canonicalize:
1553       ISDs.push_back(ISD::FCANONICALIZE);
1554       break;
1555     case Intrinsic::minnum:
1556       ISDs.push_back(ISD::FMINNUM);
1557       break;
1558     case Intrinsic::maxnum:
1559       ISDs.push_back(ISD::FMAXNUM);
1560       break;
1561     case Intrinsic::minimum:
1562       ISDs.push_back(ISD::FMINIMUM);
1563       break;
1564     case Intrinsic::maximum:
1565       ISDs.push_back(ISD::FMAXIMUM);
1566       break;
1567     case Intrinsic::copysign:
1568       ISDs.push_back(ISD::FCOPYSIGN);
1569       break;
1570     case Intrinsic::floor:
1571       ISDs.push_back(ISD::FFLOOR);
1572       break;
1573     case Intrinsic::ceil:
1574       ISDs.push_back(ISD::FCEIL);
1575       break;
1576     case Intrinsic::trunc:
1577       ISDs.push_back(ISD::FTRUNC);
1578       break;
1579     case Intrinsic::nearbyint:
1580       ISDs.push_back(ISD::FNEARBYINT);
1581       break;
1582     case Intrinsic::rint:
1583       ISDs.push_back(ISD::FRINT);
1584       break;
1585     case Intrinsic::round:
1586       ISDs.push_back(ISD::FROUND);
1587       break;
1588     case Intrinsic::roundeven:
1589       ISDs.push_back(ISD::FROUNDEVEN);
1590       break;
1591     case Intrinsic::pow:
1592       ISDs.push_back(ISD::FPOW);
1593       break;
1594     case Intrinsic::fma:
1595       ISDs.push_back(ISD::FMA);
1596       break;
1597     case Intrinsic::fmuladd:
1598       ISDs.push_back(ISD::FMA);
1599       break;
1600     case Intrinsic::experimental_constrained_fmuladd:
1601       ISDs.push_back(ISD::STRICT_FMA);
1602       break;
1603     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1604     case Intrinsic::lifetime_start:
1605     case Intrinsic::lifetime_end:
1606     case Intrinsic::sideeffect:
1607     case Intrinsic::pseudoprobe:
1608       return 0;
1609     case Intrinsic::masked_store: {
1610       Type *Ty = Tys[0];
1611       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1612       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1613                                             CostKind);
1614     }
1615     case Intrinsic::masked_load: {
1616       Type *Ty = RetTy;
1617       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1618       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1619                                             CostKind);
1620     }
1621     case Intrinsic::vector_reduce_add:
1622       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1623                                                  /*IsPairwiseForm=*/false,
1624                                                  CostKind);
1625     case Intrinsic::vector_reduce_mul:
1626       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1627                                                  /*IsPairwiseForm=*/false,
1628                                                  CostKind);
1629     case Intrinsic::vector_reduce_and:
1630       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1631                                                  /*IsPairwiseForm=*/false,
1632                                                  CostKind);
1633     case Intrinsic::vector_reduce_or:
1634       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1635                                                  /*IsPairwiseForm=*/false,
1636                                                  CostKind);
1637     case Intrinsic::vector_reduce_xor:
1638       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1639                                                  /*IsPairwiseForm=*/false,
1640                                                  CostKind);
1641     case Intrinsic::vector_reduce_fadd:
1642       // FIXME: Add new flag for cost of strict reductions.
1643       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1644                                                  /*IsPairwiseForm=*/false,
1645                                                  CostKind);
1646     case Intrinsic::vector_reduce_fmul:
1647       // FIXME: Add new flag for cost of strict reductions.
1648       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1649                                                  /*IsPairwiseForm=*/false,
1650                                                  CostKind);
1651     case Intrinsic::vector_reduce_smax:
1652     case Intrinsic::vector_reduce_smin:
1653     case Intrinsic::vector_reduce_fmax:
1654     case Intrinsic::vector_reduce_fmin:
1655       return thisT()->getMinMaxReductionCost(
1656           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1657           /*IsPairwiseForm=*/false,
1658           /*IsUnsigned=*/false, CostKind);
1659     case Intrinsic::vector_reduce_umax:
1660     case Intrinsic::vector_reduce_umin:
1661       return thisT()->getMinMaxReductionCost(
1662           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1663           /*IsPairwiseForm=*/false,
1664           /*IsUnsigned=*/true, CostKind);
1665     case Intrinsic::abs:
1666     case Intrinsic::smax:
1667     case Intrinsic::smin:
1668     case Intrinsic::umax:
1669     case Intrinsic::umin: {
1670       // abs(X) = select(icmp(X,0),X,sub(0,X))
1671       // minmax(X,Y) = select(icmp(X,Y),X,Y)
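           // E.g. abs on <4 x i32> is priced as one icmp, one select and one
           // sub with a constant zero operand; smax/smin/umax/umin keep just
           // the icmp + select pair.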
1672       Type *CondTy = RetTy->getWithNewBitWidth(1);
1673       InstructionCost Cost = 0;
1674       // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1675       Cost +=
1676           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1677                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1678       Cost +=
1679           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1680                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1681       // TODO: Should we add an OperandValueProperties::OP_Zero property?
1682       if (IID == Intrinsic::abs)
1683         Cost += thisT()->getArithmeticInstrCost(
1684             BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1685       return Cost;
1686     }
1687     case Intrinsic::sadd_sat:
1688     case Intrinsic::ssub_sat: {
1689       Type *CondTy = RetTy->getWithNewBitWidth(1);
1690 
1691       Type *OpTy = StructType::create({RetTy, CondTy});
1692       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1693                                      ? Intrinsic::sadd_with_overflow
1694                                      : Intrinsic::ssub_with_overflow;
1695 
1696       // SatMax -> Overflow && SumDiff < 0
1697       // SatMin -> Overflow && SumDiff >= 0
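           // The saturation is priced as the sadd/ssub.with.overflow call, an
           // icmp on the sign of the sum/difference, and two selects that
           // pick between the result, SatMax and SatMin.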
1698       InstructionCost Cost = 0;
1699       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1700                                     nullptr, ScalarizationCostPassed);
1701       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1702       Cost +=
1703           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1704                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1705       Cost += 2 * thisT()->getCmpSelInstrCost(
1706                       BinaryOperator::Select, RetTy, CondTy,
1707                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1708       return Cost;
1709     }
1710     case Intrinsic::uadd_sat:
1711     case Intrinsic::usub_sat: {
1712       Type *CondTy = RetTy->getWithNewBitWidth(1);
1713 
1714       Type *OpTy = StructType::create({RetTy, CondTy});
1715       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1716                                      ? Intrinsic::uadd_with_overflow
1717                                      : Intrinsic::usub_with_overflow;
1718 
1719       InstructionCost Cost = 0;
1720       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1721                                     nullptr, ScalarizationCostPassed);
1722       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1723       Cost +=
1724           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1725                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1726       return Cost;
1727     }
1728     case Intrinsic::smul_fix:
1729     case Intrinsic::umul_fix: {
1730       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1731       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1732 
1733       unsigned ExtOp =
1734           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1735       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1736 
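           // Modeled expansion: sign- or zero-extend both operands, multiply
           // in the double-width type, then rebuild the fixed-point result
           // from the high and low halves of the product with lshr/shl/or and
           // truncate back.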
1737       InstructionCost Cost = 0;
1738       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1739       Cost +=
1740           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1741       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1742                                             CCH, CostKind);
1743       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1744                                               CostKind, TTI::OK_AnyValue,
1745                                               TTI::OK_UniformConstantValue);
1746       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1747                                               TTI::OK_AnyValue,
1748                                               TTI::OK_UniformConstantValue);
1749       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1750       return Cost;
1751     }
1752     case Intrinsic::sadd_with_overflow:
1753     case Intrinsic::ssub_with_overflow: {
1754       Type *SumTy = RetTy->getContainedType(0);
1755       Type *OverflowTy = RetTy->getContainedType(1);
1756       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1757                             ? BinaryOperator::Add
1758                             : BinaryOperator::Sub;
1759 
1760       //   LHSSign -> LHS >= 0
1761       //   RHSSign -> RHS >= 0
1762       //   SumSign -> Sum >= 0
1763       //
1764       //   Add:
1765       //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1766       //   Sub:
1767       //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
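           // This is priced below as the add/sub itself, three sign compares,
           // two selects and a final and that combine the overflow conditions.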
1768       InstructionCost Cost = 0;
1769       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1770       Cost += 3 * thisT()->getCmpSelInstrCost(
1771                       Instruction::ICmp, SumTy, OverflowTy,
1772                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1773       Cost += 2 * thisT()->getCmpSelInstrCost(
1774                       Instruction::Select, OverflowTy, OverflowTy,
1775                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1776       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1777                                               CostKind);
1778       return Cost;
1779     }
1780     case Intrinsic::uadd_with_overflow:
1781     case Intrinsic::usub_with_overflow: {
1782       Type *SumTy = RetTy->getContainedType(0);
1783       Type *OverflowTy = RetTy->getContainedType(1);
1784       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1785                             ? BinaryOperator::Add
1786                             : BinaryOperator::Sub;
1787 
1788       InstructionCost Cost = 0;
1789       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1790       Cost +=
1791           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1792                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1793       return Cost;
1794     }
1795     case Intrinsic::smul_with_overflow:
1796     case Intrinsic::umul_with_overflow: {
1797       Type *MulTy = RetTy->getContainedType(0);
1798       Type *OverflowTy = RetTy->getContainedType(1);
1799       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1800       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1801 
1802       unsigned ExtOp = IID == Intrinsic::smul_with_overflow
1803                            ? Instruction::SExt : Instruction::ZExt;
1804       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1805 
1806       InstructionCost Cost = 0;
1807       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1808       Cost +=
1809           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1810       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1811                                             CCH, CostKind);
1812       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1813                                               CostKind, TTI::OK_AnyValue,
1814                                               TTI::OK_UniformConstantValue);
1815 
1816       if (IID == Intrinsic::smul_with_overflow)
1817         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1818                                                 CostKind, TTI::OK_AnyValue,
1819                                                 TTI::OK_UniformConstantValue);
1820 
1821       Cost +=
1822           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1823                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1824       return Cost;
1825     }
1826     case Intrinsic::ctpop:
1827       ISDs.push_back(ISD::CTPOP);
1828       // If the operation ends up being expanded, use TCC_Expensive. This is
1829       // cheaper than a library call but still not a cheap instruction.
1830       SingleCallCost = TargetTransformInfo::TCC_Expensive;
1831       break;
1832     case Intrinsic::ctlz:
1833       ISDs.push_back(ISD::CTLZ);
1834       break;
1835     case Intrinsic::cttz:
1836       ISDs.push_back(ISD::CTTZ);
1837       break;
1838     case Intrinsic::bswap:
1839       ISDs.push_back(ISD::BSWAP);
1840       break;
1841     case Intrinsic::bitreverse:
1842       ISDs.push_back(ISD::BITREVERSE);
1843       break;
1844     }
1845 
1846     const TargetLoweringBase *TLI = getTLI();
1847     std::pair<InstructionCost, MVT> LT =
1848         TLI->getTypeLegalizationCost(DL, RetTy);
1849 
1850     SmallVector<InstructionCost, 2> LegalCost;
1851     SmallVector<InstructionCost, 2> CustomCost;
1852     for (unsigned ISD : ISDs) {
1853       if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1854         if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1855             TLI->isFAbsFree(LT.second)) {
1856           return 0;
1857         }
1858 
1859         // The operation is legal. Assume it costs 1.
1860         // If the type is split to multiple registers, assume that there is some
1861         // overhead to this.
1862         // TODO: Once we have extract/insert subvector cost we need to use them.
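             // E.g. on a target where ISD::FSQRT is legal for the split type
             // but an <8 x float> sqrt needs two registers, LT.first == 2 and
             // the cost returned here is 4.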
1863         if (LT.first > 1)
1864           LegalCost.push_back(LT.first * 2);
1865         else
1866           LegalCost.push_back(LT.first * 1);
1867       } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1868         // If the operation is custom lowered then assume
1869         // that the code is twice as expensive.
1870         CustomCost.push_back(LT.first * 2);
1871       }
1872     }
1873 
1874     auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1875     if (MinLegalCostI != LegalCost.end())
1876       return *MinLegalCostI;
1877 
1878     auto MinCustomCostI =
1879         std::min_element(CustomCost.begin(), CustomCost.end());
1880     if (MinCustomCostI != CustomCost.end())
1881       return *MinCustomCostI;
1882 
1883     // If we can't lower fmuladd into an FMA, estimate the cost as a floating
1884     // point mul followed by an add.
1885     if (IID == Intrinsic::fmuladd)
1886       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1887                                              CostKind) +
1888              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1889                                              CostKind);
1890     if (IID == Intrinsic::experimental_constrained_fmuladd) {
1891       IntrinsicCostAttributes FMulAttrs(
1892         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1893       IntrinsicCostAttributes FAddAttrs(
1894         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1895       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1896              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1897     }
1898 
1899     // Else, assume that we need to scalarize this intrinsic. For math builtins
1900     // this will emit a costly libcall, adding call overhead and spills. Make it
1901     // very expensive.
1902     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1903       // Scalable vectors cannot be scalarized, so return Invalid.
1904       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1905             return isa<ScalableVectorType>(Ty);
1906           }))
1907         return InstructionCost::getInvalid();
1908 
1909       InstructionCost ScalarizationCost =
1910           SkipScalarizationCost ? ScalarizationCostPassed
1911                                 : getScalarizationOverhead(RetVTy, true, false);
1912 
1913       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1914       SmallVector<Type *, 4> ScalarTys;
1915       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1916         Type *Ty = Tys[i];
1917         if (Ty->isVectorTy())
1918           Ty = Ty->getScalarType();
1919         ScalarTys.push_back(Ty);
1920       }
1921       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1922       InstructionCost ScalarCost =
1923           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1924       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1925         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1926           if (!ICA.skipScalarizationCost())
1927             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1928           ScalarCalls = std::max(ScalarCalls,
1929                                  cast<FixedVectorType>(VTy)->getNumElements());
1930         }
1931       }
1932       return ScalarCalls * ScalarCost + ScalarizationCost;
1933     }
1934 
1935     // This is going to be turned into a library call, make it expensive.
1936     return SingleCallCost;
1937   }
1938 
1939   /// Compute a cost of the given call instruction.
1940   ///
1941   /// Compute the cost of calling function F with return type RetTy and
1942   /// argument types Tys. F might be nullptr, in this case the cost of an
1943   /// argument types Tys. F might be nullptr; in this case, the cost of an
1944   /// arbitrary call with the specified signature will be returned. This is
1945   /// used, for instance, when we estimate the cost of calling a vector
1946   /// \param F Called function, might be nullptr.
1947   /// \param RetTy Return value type.
1948   /// \param Tys Argument types.
1949   /// \returns The cost of Call instruction.
1950   InstructionCost
1951   getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1952                    TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1953     return 10;
1954   }
1955 
1956   unsigned getNumberOfParts(Type *Tp) {
1957     std::pair<InstructionCost, MVT> LT =
1958         getTLI()->getTypeLegalizationCost(DL, Tp);
1959     return *LT.first.getValue();
1960   }
1961 
1962   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1963                                             const SCEV *) {
1964     return 0;
1965   }
1966 
1967   /// Try to calculate arithmetic and shuffle op costs for reduction operations.
1968   /// We assume that reduction operations are performed in the following way:
1969   /// 1. Non-pairwise reduction
1970   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1971   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1972   ///            \----------------v-------------/  \----------v------------/
1973   ///                            n/2 elements               n/2 elements
1974   /// %red1 = op <n x t> %val, <n x t> %val1
1975   /// After this operation we have a vector %red1 where only the first n/2
1976   /// elements are meaningful, the second n/2 elements are undefined and can be
1977   /// dropped. All other operations are actually working with the vector of
1978   /// length n/2, not n, though the real vector length is still n.
1979   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
1980   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
1981   ///            \----------------v-------------/  \----------v------------/
1982   ///                            n/4 elements               3*n/4 elements
1983   /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
1984   /// length n/2, the resulting vector has length n/4 etc.
1985   /// 2. Pairwise reduction:
1986   /// Everything is the same except for an additional shuffle operation which
1987   /// is used to produce operands for pairwise kind of reductions.
1988   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1989   /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
1990   ///            \-------------v----------/  \----------v------------/
1991   ///                   n/2 elements               n/2 elements
1992   /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
1993   /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
1994   ///            \-------------v----------/  \----------v------------/
1995   ///                   n/2 elements               n/2 elements
1996   /// %red1 = op <n x t> %val1, <n x t> %val2
1997   /// Again, the operation is performed on <n x t> vector, but the resulting
1998   /// vector %red1 is <n/2 x t> vector.
1999   ///
2000   /// The cost model should take into account that the actual length of the
2001   /// vector is reduced on each iteration.
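       ///
       /// For example, an add reduction of <8 x i32> needs log2(8) = 3
       /// shuffle-and-op levels plus a final extractelement of lane 0. While
       /// the vector is still wider than the widest legal vector type, each
       /// halving step below is instead priced as a subvector extraction plus
       /// an arithmetic op on the narrower type.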
2002   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2003                                              bool IsPairwise,
2004                                              TTI::TargetCostKind CostKind) {
2005     Type *ScalarTy = Ty->getElementType();
2006     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2007     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2008         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2009         NumVecElts >= 2) {
2010       // Or reduction for i1 is represented as:
2011       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2012       // %res = cmp ne iReduxWidth %val, 0
2013       // And reduction for i1 is represented as:
2014       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2015       // %res = cmp eq iReduxWidth %val, -1 (all ones)
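           // E.g. an and-reduction of <8 x i1> becomes a bitcast to i8
           // followed by an icmp eq against all-ones.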
2016       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2017       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2018                                        TTI::CastContextHint::None, CostKind) +
2019              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2020                                          CmpInst::makeCmpResultType(ValTy),
2021                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2022     }
2023     unsigned NumReduxLevels = Log2_32(NumVecElts);
2024     InstructionCost ArithCost = 0;
2025     InstructionCost ShuffleCost = 0;
2026     std::pair<InstructionCost, MVT> LT =
2027         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2028     unsigned LongVectorCount = 0;
2029     unsigned MVTLen =
2030         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2031     while (NumVecElts > MVTLen) {
2032       NumVecElts /= 2;
2033       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2034       // Assume the pairwise shuffles add a cost.
2035       ShuffleCost += (IsPairwise + 1) *
2036                      thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2037                                              NumVecElts, SubTy);
2038       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2039       Ty = SubTy;
2040       ++LongVectorCount;
2041     }
2042 
2043     NumReduxLevels -= LongVectorCount;
2044 
2045     // The minimal length of the vector is limited by the real length of vector
2046     // operations performed on the current platform. That's why several final
2047     // reduction operations are performed on the vectors with the same
2048     // architecture-dependent length.
2049 
2050     // Non-pairwise reductions need one shuffle per reduction level. Pairwise
2051     // reductions need two shuffles on every level but the last one; on that
2052     // level one of the shuffles is <0, u, u, ...>, which is an identity shuffle.
2053     unsigned NumShuffles = NumReduxLevels;
2054     if (IsPairwise && NumReduxLevels >= 1)
2055       NumShuffles += NumReduxLevels - 1;
2056     ShuffleCost += NumShuffles * thisT()->getShuffleCost(
2057                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2058     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2059     return ShuffleCost + ArithCost +
2060            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2061   }
2062 
2063   /// Try to calculate op costs for min/max reduction operations.
2064   /// \param CondTy Conditional type for the Select instruction.
2065   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2066                                          bool IsPairwise, bool IsUnsigned,
2067                                          TTI::TargetCostKind CostKind) {
2068     Type *ScalarTy = Ty->getElementType();
2069     Type *ScalarCondTy = CondTy->getElementType();
2070     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2071     unsigned NumReduxLevels = Log2_32(NumVecElts);
2072     unsigned CmpOpcode;
2073     if (Ty->isFPOrFPVectorTy()) {
2074       CmpOpcode = Instruction::FCmp;
2075     } else {
2076       assert(Ty->isIntOrIntVectorTy() &&
2077              "expecting floating point or integer type for min/max reduction");
2078       CmpOpcode = Instruction::ICmp;
2079     }
2080     InstructionCost MinMaxCost = 0;
2081     InstructionCost ShuffleCost = 0;
2082     std::pair<InstructionCost, MVT> LT =
2083         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2084     unsigned LongVectorCount = 0;
2085     unsigned MVTLen =
2086         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2087     while (NumVecElts > MVTLen) {
2088       NumVecElts /= 2;
2089       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2090       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2091 
2092       // Assume the pairwise shuffles add a cost.
2093       ShuffleCost += (IsPairwise + 1) *
2094                      thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2095                                              NumVecElts, SubTy);
2096       MinMaxCost +=
2097           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2098                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2099           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2100                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2101       Ty = SubTy;
2102       ++LongVectorCount;
2103     }
2104 
2105     NumReduxLevels -= LongVectorCount;
2106 
2107     // The minimal length of the vector is limited by the real length of vector
2108     // operations performed on the current platform. That's why several final
2109     // reduction operations are performed on the vectors with the same
2110     // architecture-dependent length.
2111 
2112     // Non-pairwise reductions need one shuffle per reduction level. Pairwise
2113     // reductions need two shuffles on every level but the last one; on that
2114     // level one of the shuffles is <0, u, u, ...>, which is an identity shuffle.
2115     unsigned NumShuffles = NumReduxLevels;
2116     if (IsPairwise && NumReduxLevels >= 1)
2117       NumShuffles += NumReduxLevels - 1;
2118     ShuffleCost += NumShuffles * thisT()->getShuffleCost(
2119                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2120     MinMaxCost +=
2121         NumReduxLevels *
2122         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2123                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2124          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2125                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
2126     // The last min/max should be in vector registers and we counted it above.
2127     // So we just need a single extractelement.
2128     return ShuffleCost + MinMaxCost +
2129            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2130   }
2131 
2132   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2133                                               Type *ResTy, VectorType *Ty,
2134                                               TTI::TargetCostKind CostKind) {
2135     // Without any native support, this is equivalent to the cost of
2136     // vecreduce.add(ext) or, if IsMLA, vecreduce.add(mul(ext, ext)).
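         // E.g. for a <8 x i16> input extended to i32 with IsMLA set, this
         // models something like:
         //   %ea = sext <8 x i16> %a to <8 x i32>
         //   %eb = sext <8 x i16> %b to <8 x i32>
         //   %m  = mul <8 x i32> %ea, %eb
         //   %r  = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)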
2137     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2138     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2139         Instruction::Add, ExtTy, false, CostKind);
2140     InstructionCost MulCost = 0;
2141     InstructionCost ExtCost = thisT()->getCastInstrCost(
2142         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2143         TTI::CastContextHint::None, CostKind);
2144     if (IsMLA) {
2145       MulCost =
2146           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2147       ExtCost *= 2;
2148     }
2149 
2150     return RedCost + MulCost + ExtCost;
2151   }
2152 
2153   InstructionCost getVectorSplitCost() { return 1; }
2154 
2155   /// @}
2156 };
2157 
2158 /// Concrete BasicTTIImpl that can be used if no further customization
2159 /// is needed.
2160 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2161   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2162 
2163   friend class BasicTTIImplBase<BasicTTIImpl>;
2164 
2165   const TargetSubtargetInfo *ST;
2166   const TargetLoweringBase *TLI;
2167 
2168   const TargetSubtargetInfo *getST() const { return ST; }
2169   const TargetLoweringBase *getTLI() const { return TLI; }
2170 
2171 public:
2172   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2173 };
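
     // Illustrative usage sketch (hypothetical target; the "MyTarget" names
     // below are made up and not part of LLVM): a backend TTI follows the same
     // pattern as BasicTTIImpl above, deriving from BasicTTIImplBase and
     // handing back its own subtarget and lowering objects:
     //
     //   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
     //     using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
     //     friend BaseT;
     //
     //     const MyTargetSubtargetInfo *ST;
     //     const MyTargetLowering *TLI;
     //
     //     const MyTargetSubtargetInfo *getST() const { return ST; }
     //     const MyTargetLowering *getTLI() const { return TLI; }
     //
     //   public:
     //     explicit MyTargetTTIImpl(const TargetMachine *TM, const Function &F);
     //   };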
2174 
2175 } // end namespace llvm
2176 
2177 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2178