xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "R600Subtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
23 #include "llvm/CodeGen/MachineScheduler.h"
24 #include "llvm/CodeGen/TargetFrameLowering.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/IntrinsicsR600.h"
28 #include "llvm/IR/MDBuilder.h"
29 #include <algorithm>
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "amdgpu-subtarget"
34 
// Base-class constructor: records only the target triple. All feature flags
// are initialized by the derived subtarget (GCNSubtarget / R600Subtarget).
AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
36 
37 bool AMDGPUSubtarget::useRealTrue16Insts() const {
38   return hasTrue16BitInsts() && EnableRealTrue16Insts;
39 }
40 
41 // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
42 // allows the given function to achieve an occupancy of NWaves waves per
43 // SIMD / EU, taking into account only the function's *maximum* workgroup size.
44 unsigned
45 AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
46                                                  const Function &F) const {
47   const unsigned WaveSize = getWavefrontSize();
48   const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
49   const unsigned WavesPerWorkgroup =
50       std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
51 
52   const unsigned WorkGroupsPerCU =
53       std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
54 
55   return getLocalMemorySize() / WorkGroupsPerCU;
56 }
57 
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
                                                const Function &F) const {
  // Maximum number of workgroups that fit on a CU given their LDS usage.
  // FIXME: We should take into account the LDS allocation granularity.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);

  // Queried LDS size may be larger than available on a CU, in which case we
  // consider the only achievable occupancy to be 1, in line with what we
  // consider the occupancy to be when the number of requested registers in a
  // particular bank is higher than the number of available ones in that bank.
  if (!MaxWGsLDS)
    return {1, 1};

  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();

  // For a given flat workgroup size, compute (waves per workgroup,
  // concurrent workgroups per CU, concurrent waves per CU).
  auto PropsFromWGSize = [=](unsigned WGSize)
      -> std::tuple<const unsigned, const unsigned, unsigned> {
    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
  };

  // The maximum group size will generally yield the minimum number of
  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
  // generally true for the minimum group size. LDS or barrier resource
  // limitations can flip those minimums/maximums.
  const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

  // It is possible that we end up with flipped minimum and maximum number of
  // waves per CU when the number of minimum/maximum concurrent groups on the CU
  // is limited by LDS usage or barrier resources.
  if (MinWavesPerCU >= MaxWavesPerCU) {
    std::swap(MinWavesPerCU, MaxWavesPerCU);
  } else {
    // Total number of wave slots available on one CU.
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

    // Look for a potential smaller group size than the maximum which decreases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned MinWavesPerCUForWGSize =
        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
        // There may exist a smaller group size than the maximum that achieves
        // the minimum number of waves per CU. This group size is the largest
        // possible size that requires MaxWavesPerWG - E waves where E is
        // maximized under the following constraints.
        // 1. 0 <= E <= ExcessSlotsPerWG
        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
                                                MaxWavesPerWG - MinWavesPerWG);
      }
    }

    // Look for a potential larger group size than the minimum which increases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
      // There may exist a larger group size than the minimum that achieves the
      // maximum number of waves per CU. This group size is the smallest
      // possible size that requires MinWavesPerWG + L waves where L is
      // maximized under the following constraints.
      // 1. 0 <= L <= LeftoverSlotsPerWG
      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
                                              ((MaxWGSize - 1) / WaveSize) + 1 -
                                                  MinWavesPerWG);
    }
  }

  // Return the minimum/maximum number of waves on any EU, assuming that all
  // wavefronts are spread across all EUs as evenly as possible.
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
}
137 
138 std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
139     const MachineFunction &MF) const {
140   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
141   return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
142 }
143 
144 std::pair<unsigned, unsigned>
145 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
146   switch (CC) {
147   case CallingConv::AMDGPU_VS:
148   case CallingConv::AMDGPU_LS:
149   case CallingConv::AMDGPU_HS:
150   case CallingConv::AMDGPU_ES:
151   case CallingConv::AMDGPU_GS:
152   case CallingConv::AMDGPU_PS:
153     return std::pair(1, getWavefrontSize());
154   default:
155     return std::pair(1u, getMaxFlatWorkGroupSize());
156   }
157 }
158 
159 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
160   const Function &F) const {
161   // Default minimum/maximum flat work group sizes.
162   std::pair<unsigned, unsigned> Default =
163     getDefaultFlatWorkGroupSize(F.getCallingConv());
164 
165   // Requested minimum/maximum flat work group sizes.
166   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
167     F, "amdgpu-flat-work-group-size", Default);
168 
169   // Make sure requested minimum is less than requested maximum.
170   if (Requested.first > Requested.second)
171     return Default;
172 
173   // Make sure requested values do not violate subtarget's specifications.
174   if (Requested.first < getMinFlatWorkGroupSize())
175     return Default;
176   if (Requested.second > getMaxFlatWorkGroupSize())
177     return Default;
178 
179   return Requested;
180 }
181 
182 std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
183     std::pair<unsigned, unsigned> Requested,
184     std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
185   // Default minimum/maximum number of waves per execution unit.
186   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
187 
188   // If minimum/maximum flat work group sizes were explicitly requested using
189   // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
190   // number of waves per execution unit to values implied by requested
191   // minimum/maximum flat work group sizes.
192   unsigned MinImpliedByFlatWorkGroupSize =
193     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
194   Default.first = MinImpliedByFlatWorkGroupSize;
195 
196   // Make sure requested minimum is less than requested maximum.
197   if (Requested.second && Requested.first > Requested.second)
198     return Default;
199 
200   // Make sure requested values do not violate subtarget's specifications.
201   if (Requested.first < getMinWavesPerEU() ||
202       Requested.second > getMaxWavesPerEU())
203     return Default;
204 
205   // Make sure requested values are compatible with values implied by requested
206   // minimum/maximum flat work group sizes.
207   if (Requested.first < MinImpliedByFlatWorkGroupSize)
208     return Default;
209 
210   return Requested;
211 }
212 
213 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
214     const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
215   // Default minimum/maximum number of waves per execution unit.
216   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
217 
218   // Requested minimum/maximum number of waves per execution unit.
219   std::pair<unsigned, unsigned> Requested =
220       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
221   return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
222 }
223 
224 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
225   auto *Node = Kernel.getMetadata("reqd_work_group_size");
226   if (Node && Node->getNumOperands() == 3)
227     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
228   return std::numeric_limits<unsigned>::max();
229 }
230 
// A function counts as a Mesa kernel when targeting the Mesa 3D OS and its
// calling convention is not a graphics shader stage.
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}
234 
235 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
236                                            unsigned Dimension) const {
237   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
238   if (ReqdSize != std::numeric_limits<unsigned>::max())
239     return ReqdSize - 1;
240   return getFlatWorkGroupSizes(Kernel).second - 1;
241 }
242 
243 bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
244   for (int I = 0; I < 3; ++I) {
245     if (getMaxWorkitemID(Func, I) > 0)
246       return false;
247   }
248 
249   return true;
250 }
251 
// Attaches value-range information to a workitem-ID or local-size query
// instruction, derived from the kernel's flat workgroup size bounds (and
// narrowed by reqd_work_group_size metadata when present). Returns true if
// range information was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Map the intrinsic to its dimension; ID queries (as opposed to size
      // queries) are flagged so the upper bound can be adjusted below.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim is only ever 0, 1, or 2 here; UINT_MAX means "not a recognized
      // intrinsic" and skips the narrowing.
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  // A zero workgroup size gives no usable range.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  // Prefer a range return attribute on calls; fall back to !range metadata
  // for other instructions.
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}
319 
320 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
321   assert(AMDGPU::isKernel(F.getCallingConv()));
322 
323   // We don't allocate the segment if we know the implicit arguments weren't
324   // used, even if the ABI implies we need them.
325   if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
326     return 0;
327 
328   if (isMesaKernel(F))
329     return 16;
330 
331   // Assume all implicit inputs are used by default
332   const Module *M = F.getParent();
333   unsigned NBytes =
334       AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
335   return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
336                                          NBytes);
337 }
338 
339 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
340                                                  Align &MaxAlign) const {
341   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
342          F.getCallingConv() == CallingConv::SPIR_KERNEL);
343 
344   const DataLayout &DL = F.getDataLayout();
345   uint64_t ExplicitArgBytes = 0;
346   MaxAlign = Align(1);
347 
348   for (const Argument &Arg : F.args()) {
349     if (Arg.hasAttribute("amdgpu-hidden-argument"))
350       continue;
351 
352     const bool IsByRef = Arg.hasByRefAttr();
353     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
354     Align Alignment = DL.getValueOrABITypeAlignment(
355         IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
356     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
357     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
358     MaxAlign = std::max(MaxAlign, Alignment);
359   }
360 
361   return ExplicitArgBytes;
362 }
363 
// Total kernel argument segment size for F: explicit arguments (at their
// explicit offset), plus the implicit segment when present, rounded up to a
// multiple of 4. Returns 0 for non-kernel calling conventions. MaxAlign is
// updated to reflect the largest required alignment.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    // NOTE(review): this recomputes TotalSize from ExplicitArgBytes without
    // ExplicitOffset — presumably the implicit segment layout does not
    // include the explicit offset; confirm against the kernarg ABI.
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
385 
386 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
387   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
388                                   : AMDGPUDwarfFlavour::Wave64;
389 }
390 
391 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
392   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
393     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
394   return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
395 }
396 
397 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
398   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
399     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
400   return static_cast<const AMDGPUSubtarget &>(
401       TM.getSubtarget<R600Subtarget>(F));
402 }
403 
// FIXME: This has no reason to be in subtarget
// Reads the 3-element "amdgpu-max-num-workgroups" attribute from F; missing
// entries default to UINT32_MAX (effectively unlimited).
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}
410