1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Base class for AMDGPU specific classes of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 17 #include "llvm/IR/CallingConv.h" 18 #include "llvm/Support/Alignment.h" 19 #include "llvm/TargetParser/Triple.h" 20 21 namespace llvm { 22 23 enum AMDGPUDwarfFlavour : unsigned; 24 class Function; 25 class Instruction; 26 class MachineFunction; 27 class TargetMachine; 28 29 class AMDGPUSubtarget { 30 public: 31 enum Generation { 32 INVALID = 0, 33 R600 = 1, 34 R700 = 2, 35 EVERGREEN = 3, 36 NORTHERN_ISLANDS = 4, 37 SOUTHERN_ISLANDS = 5, 38 SEA_ISLANDS = 6, 39 VOLCANIC_ISLANDS = 7, 40 GFX9 = 8, 41 GFX10 = 9, 42 GFX11 = 10, 43 GFX12 = 11, 44 }; 45 46 private: 47 Triple TargetTriple; 48 49 protected: 50 bool GCN3Encoding = false; 51 bool Has16BitInsts = false; 52 bool HasTrue16BitInsts = false; 53 bool HasFP8ConversionScaleInsts = false; 54 bool HasBF8ConversionScaleInsts = false; 55 bool HasFP4ConversionScaleInsts = false; 56 bool HasFP6BF6ConversionScaleInsts = false; 57 bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; 58 bool HasCvtPkF16F32Inst = false; 59 bool HasF32ToF16BF16ConversionSRInsts = false; 60 bool EnableRealTrue16Insts = false; 61 bool HasBF16ConversionInsts = false; 62 bool HasMadMixInsts = false; 63 bool HasMadMacF32Insts = false; 64 bool HasDsSrc2Insts = false; 65 bool HasSDWA = false; 66 bool HasVOP3PInsts = false; 67 bool HasMulI24 = true; 68 bool HasMulU24 = true; 69 bool HasSMulHi = false; 70 bool HasInv2PiInlineImm = false; 71 bool HasFminFmaxLegacy = true; 72 bool EnablePromoteAlloca = false; 73 bool HasTrigReducedRange = false; 74 bool FastFMAF32 = false; 75 unsigned EUsPerCU = 4; 76 unsigned MaxWavesPerEU = 10; 77 unsigned LocalMemorySize = 0; 78 unsigned AddressableLocalMemorySize = 0; 79 char WavefrontSizeLog2 = 0; 80 81 public: 82 AMDGPUSubtarget(Triple TT); 83 84 static const AMDGPUSubtarget &get(const MachineFunction &MF); 85 static const AMDGPUSubtarget &get(const TargetMachine &TM, 86 const Function &F); 87 88 /// \returns Default range flat work group size for a calling convention. 89 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 90 91 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 92 /// for function \p F, or minimum/maximum flat work group sizes explicitly 93 /// requested using "amdgpu-flat-work-group-size" attribute attached to 94 /// function \p F. 95 /// 96 /// \returns Subtarget's default values if explicitly requested values cannot 97 /// be converted to integer, or violate subtarget's specifications. 98 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 99 100 /// \returns Subtarget's default pair of minimum/maximum number of waves per 101 /// execution unit for function \p F, or minimum/maximum number of waves per 102 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 103 /// attached to function \p F. 104 /// 105 /// \returns Subtarget's default values if explicitly requested values cannot 106 /// be converted to integer, violate subtarget's specifications, or are not 107 /// compatible with minimum/maximum number of waves limited by flat work group 108 /// size, register usage, and/or lds usage. 109 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const { 110 // Default/requested minimum/maximum flat work group sizes. 111 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 112 return getWavesPerEU(F, FlatWorkGroupSizes); 113 } 114 115 /// Overload which uses the specified values for the flat work group sizes, 116 /// rather than querying the function itself. \p FlatWorkGroupSizes Should 117 /// correspond to the function's value for getFlatWorkGroupSizes. 118 std::pair<unsigned, unsigned> 119 getWavesPerEU(const Function &F, 120 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; 121 std::pair<unsigned, unsigned> getEffectiveWavesPerEU( 122 std::pair<unsigned, unsigned> WavesPerEU, 123 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; 124 125 /// Return the amount of LDS that can be used that will not restrict the 126 /// occupancy lower than WaveCount. 127 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 128 const Function &) const; 129 130 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can 131 /// be achieved when the only function running on a CU is \p F and each 132 /// workgroup running the function requires \p LDSBytes bytes of LDS space. 133 /// This notably depends on the range of allowed flat group sizes for the 134 /// function and hardware characteristics. 135 std::pair<unsigned, unsigned> 136 getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; 137 138 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can 139 /// be achieved when the only function running on a CU is \p MF. This notably 140 /// depends on the range of allowed flat group sizes for the function, the 141 /// amount of per-workgroup LDS space required by the function, and hardware 142 /// characteristics. 143 std::pair<unsigned, unsigned> 144 getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const; 145 146 bool isAmdHsaOS() const { 147 return TargetTriple.getOS() == Triple::AMDHSA; 148 } 149 150 bool isAmdPalOS() const { 151 return TargetTriple.getOS() == Triple::AMDPAL; 152 } 153 154 bool isMesa3DOS() const { 155 return TargetTriple.getOS() == Triple::Mesa3D; 156 } 157 158 bool isMesaKernel(const Function &F) const; 159 160 bool isAmdHsaOrMesa(const Function &F) const { 161 return isAmdHsaOS() || isMesaKernel(F); 162 } 163 164 bool isGCN() const { 165 return TargetTriple.getArch() == Triple::amdgcn; 166 } 167 168 bool isGCN3Encoding() const { 169 return GCN3Encoding; 170 } 171 172 bool has16BitInsts() const { 173 return Has16BitInsts; 174 } 175 176 /// Return true if the subtarget supports True16 instructions. 177 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } 178 179 /// Return true if real (non-fake) variants of True16 instructions using 180 /// 16-bit registers should be code-generated. Fake True16 instructions are 181 /// identical to non-fake ones except that they take 32-bit registers as 182 /// operands and always use their low halves. 183 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully 184 // supported and the support for fake True16 instructions is removed. 185 bool useRealTrue16Insts() const; 186 187 bool hasBF16ConversionInsts() const { 188 return HasBF16ConversionInsts; 189 } 190 191 bool hasMadMixInsts() const { 192 return HasMadMixInsts; 193 } 194 195 bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; } 196 197 bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; } 198 199 bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; } 200 201 bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; } 202 203 bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; } 204 205 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; } 206 207 bool hasF32ToF16BF16ConversionSRInsts() const { 208 return HasF32ToF16BF16ConversionSRInsts; 209 } 210 211 bool hasMadMacF32Insts() const { 212 return HasMadMacF32Insts || !isGCN(); 213 } 214 215 bool hasDsSrc2Insts() const { 216 return HasDsSrc2Insts; 217 } 218 219 bool hasSDWA() const { 220 return HasSDWA; 221 } 222 223 bool hasVOP3PInsts() const { 224 return HasVOP3PInsts; 225 } 226 227 bool hasMulI24() const { 228 return HasMulI24; 229 } 230 231 bool hasMulU24() const { 232 return HasMulU24; 233 } 234 235 bool hasSMulHi() const { 236 return HasSMulHi; 237 } 238 239 bool hasInv2PiInlineImm() const { 240 return HasInv2PiInlineImm; 241 } 242 243 bool hasFminFmaxLegacy() const { 244 return HasFminFmaxLegacy; 245 } 246 247 bool hasTrigReducedRange() const { 248 return HasTrigReducedRange; 249 } 250 251 bool hasFastFMAF32() const { 252 return FastFMAF32; 253 } 254 255 bool isPromoteAllocaEnabled() const { 256 return EnablePromoteAlloca; 257 } 258 259 unsigned getWavefrontSize() const { 260 return 1 << WavefrontSizeLog2; 261 } 262 263 unsigned getWavefrontSizeLog2() const { 264 return WavefrontSizeLog2; 265 } 266 267 /// Return the maximum number of bytes of LDS available for all workgroups 268 /// running on the same WGP or CU. 269 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is 270 /// limited to 64k. 271 unsigned getLocalMemorySize() const { 272 return LocalMemorySize; 273 } 274 275 /// Return the maximum number of bytes of LDS that can be allocated to a 276 /// single workgroup. 277 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has 278 /// 128k in total. 279 unsigned getAddressableLocalMemorySize() const { 280 return AddressableLocalMemorySize; 281 } 282 283 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the 284 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs. 285 /// CU mode into account. 286 unsigned getEUsPerCU() const { return EUsPerCU; } 287 288 Align getAlignmentForImplicitArgPtr() const { 289 return isAmdHsaOS() ? Align(8) : Align(4); 290 } 291 292 /// Returns the offset in bytes from the start of the input buffer 293 /// of the first explicit kernel argument. 294 unsigned getExplicitKernelArgOffset() const { 295 switch (TargetTriple.getOS()) { 296 case Triple::AMDHSA: 297 case Triple::AMDPAL: 298 case Triple::Mesa3D: 299 return 0; 300 case Triple::UnknownOS: 301 default: 302 // For legacy reasons unknown/other is treated as a different version of 303 // mesa. 304 return 36; 305 } 306 307 llvm_unreachable("invalid triple OS"); 308 } 309 310 /// \returns Maximum number of work groups per compute unit supported by the 311 /// subtarget and limited by given \p FlatWorkGroupSize. 312 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 313 314 /// \returns Minimum flat work group size supported by the subtarget. 315 virtual unsigned getMinFlatWorkGroupSize() const = 0; 316 317 /// \returns Maximum flat work group size supported by the subtarget. 318 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 319 320 /// \returns Number of waves per execution unit required to support the given 321 /// \p FlatWorkGroupSize. 322 virtual unsigned 323 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; 324 325 /// \returns Minimum number of waves per execution unit supported by the 326 /// subtarget. 327 virtual unsigned getMinWavesPerEU() const = 0; 328 329 /// \returns Maximum number of waves per execution unit supported by the 330 /// subtarget without any kind of limitation. 331 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } 332 333 /// Return the maximum workitem ID value in the function, for the given (0, 1, 334 /// 2) dimension. 335 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; 336 337 /// Return the number of work groups for the function. 338 SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const; 339 340 /// Return true if only a single workitem can be active in a wave. 341 bool isSingleLaneExecution(const Function &Kernel) const; 342 343 /// Creates value range metadata on an workitemid.* intrinsic call or load. 344 bool makeLIDRangeMetadata(Instruction *I) const; 345 346 /// \returns Number of bytes of arguments that are passed to a shader or 347 /// kernel in addition to the explicit ones declared for the function. 348 unsigned getImplicitArgNumBytes(const Function &F) const; 349 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; 350 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; 351 352 /// \returns Corresponding DWARF register number mapping flavour for the 353 /// \p WavefrontSize. 354 AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; 355 356 virtual ~AMDGPUSubtarget() = default; 357 }; 358 359 } // end namespace llvm 360 361 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 362