xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Base class for AMDGPU specific classes of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 
17 #include "llvm/IR/CallingConv.h"
18 #include "llvm/Support/Alignment.h"
19 #include "llvm/TargetParser/Triple.h"
20 
21 namespace llvm {
22 
23 enum AMDGPUDwarfFlavour : unsigned;
24 class Function;
25 class Instruction;
26 class MachineFunction;
27 class TargetMachine;
28 
29 class AMDGPUSubtarget {
30 public:
31   enum Generation {
32     INVALID = 0,
33     R600 = 1,
34     R700 = 2,
35     EVERGREEN = 3,
36     NORTHERN_ISLANDS = 4,
37     SOUTHERN_ISLANDS = 5,
38     SEA_ISLANDS = 6,
39     VOLCANIC_ISLANDS = 7,
40     GFX9 = 8,
41     GFX10 = 9,
42     GFX11 = 10,
43     GFX12 = 11,
44   };
45 
46 private:
47   Triple TargetTriple;
48 
49 protected:
50   bool GCN3Encoding = false;
51   bool Has16BitInsts = false;
52   bool HasTrue16BitInsts = false;
53   bool HasFP8ConversionScaleInsts = false;
54   bool HasBF8ConversionScaleInsts = false;
55   bool HasFP4ConversionScaleInsts = false;
56   bool HasFP6BF6ConversionScaleInsts = false;
57   bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
58   bool HasCvtPkF16F32Inst = false;
59   bool HasF32ToF16BF16ConversionSRInsts = false;
60   bool EnableRealTrue16Insts = false;
61   bool HasBF16ConversionInsts = false;
62   bool HasMadMixInsts = false;
63   bool HasMadMacF32Insts = false;
64   bool HasDsSrc2Insts = false;
65   bool HasSDWA = false;
66   bool HasVOP3PInsts = false;
67   bool HasMulI24 = true;
68   bool HasMulU24 = true;
69   bool HasSMulHi = false;
70   bool HasInv2PiInlineImm = false;
71   bool HasFminFmaxLegacy = true;
72   bool EnablePromoteAlloca = false;
73   bool HasTrigReducedRange = false;
74   bool FastFMAF32 = false;
75   unsigned EUsPerCU = 4;
76   unsigned MaxWavesPerEU = 10;
77   unsigned LocalMemorySize = 0;
78   unsigned AddressableLocalMemorySize = 0;
79   char WavefrontSizeLog2 = 0;
80 
81 public:
82   AMDGPUSubtarget(Triple TT);
83 
84   static const AMDGPUSubtarget &get(const MachineFunction &MF);
85   static const AMDGPUSubtarget &get(const TargetMachine &TM,
86                                     const Function &F);
87 
88   /// \returns Default range flat work group size for a calling convention.
89   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
90 
91   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
92   /// for function \p F, or minimum/maximum flat work group sizes explicitly
93   /// requested using "amdgpu-flat-work-group-size" attribute attached to
94   /// function \p F.
95   ///
96   /// \returns Subtarget's default values if explicitly requested values cannot
97   /// be converted to integer, or violate subtarget's specifications.
98   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
99 
100   /// \returns Subtarget's default pair of minimum/maximum number of waves per
101   /// execution unit for function \p F, or minimum/maximum number of waves per
102   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
103   /// attached to function \p F.
104   ///
105   /// \returns Subtarget's default values if explicitly requested values cannot
106   /// be converted to integer, violate subtarget's specifications, or are not
107   /// compatible with minimum/maximum number of waves limited by flat work group
108   /// size, register usage, and/or lds usage.
109   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
110     // Default/requested minimum/maximum flat work group sizes.
111     std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
112     return getWavesPerEU(F, FlatWorkGroupSizes);
113   }
114 
115   /// Overload which uses the specified values for the flat work group sizes,
116   /// rather than querying the function itself. \p FlatWorkGroupSizes Should
117   /// correspond to the function's value for getFlatWorkGroupSizes.
118   std::pair<unsigned, unsigned>
119   getWavesPerEU(const Function &F,
120                 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
121   std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
122       std::pair<unsigned, unsigned> WavesPerEU,
123       std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
124 
125   /// Return the amount of LDS that can be used that will not restrict the
126   /// occupancy lower than WaveCount.
127   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
128                                            const Function &) const;
129 
130   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
131   /// be achieved when the only function running on a CU is \p F and each
132   /// workgroup running the function requires \p LDSBytes bytes of LDS space.
133   /// This notably depends on the range of allowed flat group sizes for the
134   /// function and hardware characteristics.
135   std::pair<unsigned, unsigned>
136   getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
137 
138   /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
139   /// be achieved when the only function running on a CU is \p MF. This notably
140   /// depends on the range of allowed flat group sizes for the function, the
141   /// amount of per-workgroup LDS space required by the function, and hardware
142   /// characteristics.
143   std::pair<unsigned, unsigned>
144   getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
145 
146   bool isAmdHsaOS() const {
147     return TargetTriple.getOS() == Triple::AMDHSA;
148   }
149 
150   bool isAmdPalOS() const {
151     return TargetTriple.getOS() == Triple::AMDPAL;
152   }
153 
154   bool isMesa3DOS() const {
155     return TargetTriple.getOS() == Triple::Mesa3D;
156   }
157 
158   bool isMesaKernel(const Function &F) const;
159 
160   bool isAmdHsaOrMesa(const Function &F) const {
161     return isAmdHsaOS() || isMesaKernel(F);
162   }
163 
164   bool isGCN() const {
165     return TargetTriple.getArch() == Triple::amdgcn;
166   }
167 
168   bool isGCN3Encoding() const {
169     return GCN3Encoding;
170   }
171 
172   bool has16BitInsts() const {
173     return Has16BitInsts;
174   }
175 
176   /// Return true if the subtarget supports True16 instructions.
177   bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
178 
179   /// Return true if real (non-fake) variants of True16 instructions using
180   /// 16-bit registers should be code-generated. Fake True16 instructions are
181   /// identical to non-fake ones except that they take 32-bit registers as
182   /// operands and always use their low halves.
183   // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
184   // supported and the support for fake True16 instructions is removed.
185   bool useRealTrue16Insts() const;
186 
187   bool hasBF16ConversionInsts() const {
188     return HasBF16ConversionInsts;
189   }
190 
191   bool hasMadMixInsts() const {
192     return HasMadMixInsts;
193   }
194 
195   bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }
196 
197   bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }
198 
199   bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
200 
201   bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; }
202 
203   bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }
204 
205   bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
206 
207   bool hasF32ToF16BF16ConversionSRInsts() const {
208     return HasF32ToF16BF16ConversionSRInsts;
209   }
210 
211   bool hasMadMacF32Insts() const {
212     return HasMadMacF32Insts || !isGCN();
213   }
214 
215   bool hasDsSrc2Insts() const {
216     return HasDsSrc2Insts;
217   }
218 
219   bool hasSDWA() const {
220     return HasSDWA;
221   }
222 
223   bool hasVOP3PInsts() const {
224     return HasVOP3PInsts;
225   }
226 
227   bool hasMulI24() const {
228     return HasMulI24;
229   }
230 
231   bool hasMulU24() const {
232     return HasMulU24;
233   }
234 
235   bool hasSMulHi() const {
236     return HasSMulHi;
237   }
238 
239   bool hasInv2PiInlineImm() const {
240     return HasInv2PiInlineImm;
241   }
242 
243   bool hasFminFmaxLegacy() const {
244     return HasFminFmaxLegacy;
245   }
246 
247   bool hasTrigReducedRange() const {
248     return HasTrigReducedRange;
249   }
250 
251   bool hasFastFMAF32() const {
252     return FastFMAF32;
253   }
254 
255   bool isPromoteAllocaEnabled() const {
256     return EnablePromoteAlloca;
257   }
258 
259   unsigned getWavefrontSize() const {
260     return 1 << WavefrontSizeLog2;
261   }
262 
263   unsigned getWavefrontSizeLog2() const {
264     return WavefrontSizeLog2;
265   }
266 
267   /// Return the maximum number of bytes of LDS available for all workgroups
268   /// running on the same WGP or CU.
269   /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
270   /// limited to 64k.
271   unsigned getLocalMemorySize() const {
272     return LocalMemorySize;
273   }
274 
275   /// Return the maximum number of bytes of LDS that can be allocated to a
276   /// single workgroup.
277   /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
278   /// 128k in total.
279   unsigned getAddressableLocalMemorySize() const {
280     return AddressableLocalMemorySize;
281   }
282 
283   /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
284   /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
285   /// CU mode into account.
286   unsigned getEUsPerCU() const { return EUsPerCU; }
287 
288   Align getAlignmentForImplicitArgPtr() const {
289     return isAmdHsaOS() ? Align(8) : Align(4);
290   }
291 
292   /// Returns the offset in bytes from the start of the input buffer
293   ///        of the first explicit kernel argument.
294   unsigned getExplicitKernelArgOffset() const {
295     switch (TargetTriple.getOS()) {
296     case Triple::AMDHSA:
297     case Triple::AMDPAL:
298     case Triple::Mesa3D:
299       return 0;
300     case Triple::UnknownOS:
301     default:
302       // For legacy reasons unknown/other is treated as a different version of
303       // mesa.
304       return 36;
305     }
306 
307     llvm_unreachable("invalid triple OS");
308   }
309 
310   /// \returns Maximum number of work groups per compute unit supported by the
311   /// subtarget and limited by given \p FlatWorkGroupSize.
312   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
313 
314   /// \returns Minimum flat work group size supported by the subtarget.
315   virtual unsigned getMinFlatWorkGroupSize() const = 0;
316 
317   /// \returns Maximum flat work group size supported by the subtarget.
318   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
319 
320   /// \returns Number of waves per execution unit required to support the given
321   /// \p FlatWorkGroupSize.
322   virtual unsigned
323   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
324 
325   /// \returns Minimum number of waves per execution unit supported by the
326   /// subtarget.
327   virtual unsigned getMinWavesPerEU() const = 0;
328 
329   /// \returns Maximum number of waves per execution unit supported by the
330   /// subtarget without any kind of limitation.
331   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
332 
333   /// Return the maximum workitem ID value in the function, for the given (0, 1,
334   /// 2) dimension.
335   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
336 
337   /// Return the number of work groups for the function.
338   SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
339 
340   /// Return true if only a single workitem can be active in a wave.
341   bool isSingleLaneExecution(const Function &Kernel) const;
342 
343   /// Creates value range metadata on an workitemid.* intrinsic call or load.
344   bool makeLIDRangeMetadata(Instruction *I) const;
345 
346   /// \returns Number of bytes of arguments that are passed to a shader or
347   /// kernel in addition to the explicit ones declared for the function.
348   unsigned getImplicitArgNumBytes(const Function &F) const;
349   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
350   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
351 
352   /// \returns Corresponding DWARF register number mapping flavour for the
353   /// \p WavefrontSize.
354   AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
355 
356   virtual ~AMDGPUSubtarget() = default;
357 };
358 
359 } // end namespace llvm
360 
361 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
362