//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

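// Resolve the callee of a call pseudo from its callee operand. An immediate
// operand (always 0, per the assert) denotes an unknown callee; global
// operands are looked through aliases to the underlying function.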
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

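// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction, i.e. a use that genuinely requires the register (for example
// from inline assembly) rather than one implied by FLAT instructions.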
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

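// The total SGPR count adds the extra SGPRs the target reserves for VCC, flat
// scratch and XNACK on top of the explicitly used SGPRs.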
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

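// How AGPRs count against the VGPR budget differs between subtargets (e.g.
// gfx90a-style unified allocation), so defer to the shared target helper.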
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

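// Walk the call graph in post order so callees are analyzed before their
// callers, letting resource usage accumulate bottom-up.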
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size: unless overridden on the command line, the assumed
  // stack sizes for external calls and dynamically sized objects are zero.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

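  // The function contains calls, so conservatively scan every register operand
  // of every instruction and track the highest register index used per bank.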
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

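        // Classify the register by bank (SGPR, VGPR or AGPR) and by its width
        // in units of 32-bit registers, based on the register class.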
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

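      // For calls, fold the callee's resource usage into this function. Known
      // callees have already been analyzed (post-order traversal); unknown or
      // indirect callees fall back to conservative, module-wide assumptions.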
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}