//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file describes a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

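/// Common, subtarget-agnostic TTI implementation for the AMDGPU backend.
/// GCNTTIImpl embeds an instance of this class as its CommonTTI member and
/// can forward shared queries, such as the loop unrolling and peeling
/// preferences, to it.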
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};

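/// TTI implementation for GCN subtargets. A minimal usage sketch, assuming a
/// pass already holds a TargetTransformInfo computed for a GCN function
/// (illustrative only; these hooks are normally reached through the generic
/// TargetTransformInfo wrappers):
/// \code
///   unsigned FlatAS = TTI.getFlatAddressSpace();   // AMDGPUAS::FLAT_ADDRESS,
///                                                  // or -1 for graphics.
///   bool Divergent = TTI.isSourceOfDivergence(&V); // lane-varying value?
/// \endcode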
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr int InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }
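  // For reference (illustrative): with TCC_Basic == 1, the helpers above give
  // reciprocal-throughput costs of 1 (full rate), 2 (half rate) and 4 (quarter
  // rate), and a code-size cost of 2 for both half- and quarter-rate
  // instructions.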

  // On some parts, normal fp64 operations are half rate, while on others they
  // are quarter rate. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const;
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
  Type *
  getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                            unsigned SrcAddrSpace, unsigned DestAddrSpace,
                            Align SrcAlign, Align DestAlign,
                            std::optional<uint32_t> AtomicElementSize) const;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const;
  unsigned getMaxInterleaveFactor(ElementCount VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr);

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1);

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    // Address space casts must cast between different address spaces.
    if (FromAS == ToAS)
      return false;

    if (FromAS == AMDGPUAS::FLAT_ADDRESS)
      return AMDGPU::isExtendedGlobalAddrSpace(ToAS) ||
             ToAS == AMDGPUAS::LOCAL_ADDRESS ||
             ToAS == AMDGPUAS::PRIVATE_ADDRESS;

    if (AMDGPU::isExtendedGlobalAddrSpace(FromAS))
      return AMDGPU::isFlatGlobalAddrSpace(ToAS) ||
             ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

    if (FromAS == AMDGPUAS::LOCAL_ADDRESS ||
        FromAS == AMDGPUAS::PRIVATE_ADDRESS)
      return ToAS == AMDGPUAS::FLAT_ADDRESS;

    return false;
  }
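  // The checks above imply, for example, that flat -> global, flat -> local,
  // and global -> flat casts are valid, while local -> private is not: from
  // the local and private address spaces, only a cast back to flat is
  // accepted.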

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const {
    // Don't bother running the InferAddressSpaces pass on graphics shaders,
    // which don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }
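  // Note: returning -1 (i.e. ~0u) is the TTI convention for "no flat address
  // space"; InferAddressSpaces gives up early when it sees that value.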

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }
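  // This reflects that LDS (local), region (GDS) and private (scratch)
  // variables cannot carry meaningful static initializers on AMDGPU, whereas
  // global and constant address-space variables can.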

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneArgIdx) const;

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  InstructionCost getVectorSplitCost() { return 0; }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = {},
                                 const Instruction *CxtI = nullptr);

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

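  // Note: inlining is weighted aggressively here; calls are comparatively
  // expensive on AMDGPU, so the threshold multiplier below is large and no
  // extra vector bonus is applied.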
  int getInliningLastCallToStaticBonus() const;
  unsigned getInliningThresholdMultiplier() const { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const;
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;

  int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }

  InstructionCost getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind);

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);

  /// Data cache line size for the LoopDataPrefetch pass. Not used before
  /// GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load the prefetch instruction should be placed,
  /// currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H