//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file describes a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr int InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The encoding is usually 8 bytes, but the instruction takes 4x as
  // many cycles. Maybe the cost should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some subtargets, normal fp64 operations are half rate, and on others
  // quarter rate. This also applies to some integer operations.
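  //
  // Illustrative numbers (assuming the helpers above and TCC_Basic == 1): a
  // throughput query sees costs of 1 (full rate), 2 (half rate), and 4
  // (quarter rate), while a code-size query sees 1, 2, and 2 respectively.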
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const;
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
  Type *
  getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                            unsigned SrcAddrSpace, unsigned DestAddrSpace,
                            Align SrcAlign, Align DestAlign,
                            std::optional<uint32_t> AtomicElementSize) const;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const;
  unsigned getMaxInterleaveFactor(ElementCount VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr);

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1);

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    // Address space casts must cast between different address spaces.
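    //
    // Informally (a summary of the checks below): flat may be cast to the
    // global, constant, local, or private spaces; the global/constant spaces
    // may be cast to flat or among themselves; local and private may only be
    // cast back to flat.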
    if (FromAS == ToAS)
      return false;

    if (FromAS == AMDGPUAS::FLAT_ADDRESS)
      return AMDGPU::isExtendedGlobalAddrSpace(ToAS) ||
             ToAS == AMDGPUAS::LOCAL_ADDRESS ||
             ToAS == AMDGPUAS::PRIVATE_ADDRESS;

    if (AMDGPU::isExtendedGlobalAddrSpace(FromAS))
      return AMDGPU::isFlatGlobalAddrSpace(ToAS) ||
             ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

    if (FromAS == AMDGPUAS::LOCAL_ADDRESS ||
        FromAS == AMDGPUAS::PRIVATE_ADDRESS)
      return ToAS == AMDGPUAS::FLAT_ADDRESS;

    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const {
    // Don't bother running the InferAddressSpaces pass on graphics shaders,
    // which don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;

  bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                   unsigned LaneArgIdx) const;

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  InstructionCost getVectorSplitCost() { return 0; }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = {},
                                 const Instruction *CxtI = nullptr);

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const;

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  int getInliningLastCallToStaticBonus() const;
  unsigned getInliningThresholdMultiplier() const { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const;
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;

  int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }

  InstructionCost getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind);

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);

  /// Data cache line size for the LoopDataPrefetch pass. Unused before GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How far ahead of a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
  void collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H