//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill in the register usage, flat
/// usage, etc. that get programmed into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can target any function except
/// hardware entry points. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum over all non-entrypoint
/// functions in the module.
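///
/// For example (illustrative only): in a module with non-entry functions f
/// (30 VGPRs) and g (40 VGPRs), a function making an indirect call is assumed
/// to use max(30, 40) = 40 VGPRs, in addition to what it uses directly.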
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
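
// Both assumptions are tunable on the command line; an illustrative
// invocation (flag values chosen arbitrarily for the example):
//   llc -mtriple=amdgcn-amd-amdhsa \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=2048 input.ll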

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

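// Resolve the callee operand of a call pseudo to the llvm::Function it names,
// looking through one level of global alias, e.g. (illustrative IR):
//   @bar = alias void (), void ()* @foo
//   call void @bar()   ; resolves to the Function @foo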
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

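// On GFX90A, AGPRs are carved out of the same physical register file as
// VGPRs: the VGPR count is rounded up to a multiple of 4 before the AGPRs are
// appended. Worked example (illustrative): 5 VGPRs and 3 AGPRs give
// alignTo(5, 4) + 3 = 8 + 3 = 11 total VGPRs.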
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  if (ST.hasGFX90AInsts() && ArgNumAGPR)
    return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
  return std::max(ArgNumVGPR, ArgNumAGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}