//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill hardware registers with the
/// register usage, flat scratch usage, etc.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

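// Resolve the callee operand of a call pseudo to the called Function, looking
// through pointer casts and aliases. An immediate 0 operand (no resolvable
// global) yields nullptr, which the caller treats as an indirect call.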
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

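// Return true if \p Reg has any use other than an implicit operand of a FLAT
// instruction. Used to decide whether flat_scratch is really needed.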
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size: unless explicitly set on the command line, the
  // assumed stack sizes default to zero.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
                                      AssumedStackSizeForExternalCall);

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

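  // Flat scratch counts as used if either half of FLAT_SCR is referenced or
  // the preloaded flat-scratch-init SGPR argument is live in.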
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

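  // Start from the statically known frame size; it is padded below for
  // variable-sized objects and realigned stacks.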
  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

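  // With calls present, scan every register operand to find the highest
  // register index used in each register file, and accumulate stack and flag
  // requirements from the call sites.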
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

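        // Determine which register file the operand lives in (SGPR, VGPR or
        // AGPR) and how many 32-bit registers it spans.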
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

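      // For call instructions, record the callee and accumulate the stack and
      // flag requirements implied by the call.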
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

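        // Calls to declarations and calls whose callee cannot be resolved are
        // treated as indirect; their register usage is accounted for later.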
        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later.
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

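  // Register counts are the highest used register index plus one.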
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}