//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
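// Both values can be overridden on the command line with the cl::opts below;
// for code object v5 and later (and AMDPAL), runOnModule() defaults them to
// zero unless the flags are given explicitly.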
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }
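
  // Visit the callgraph in post order so that callees are analyzed before
  // their callers; analyzeResourceUsage() can then fold the already-computed
  // callee counts into each caller.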
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }
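
  // There are calls, so a simple scan of used physical registers is not
  // enough. Walk every operand of every instruction, tracking the highest
  // register index touched in each register file, and fold in callee usage at
  // each call site.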
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
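
        // Classify any remaining register by its register class: which file
        // it lives in (SGPR, VGPR, or AGPR) and how many 32-bit registers the
        // class spans (e.g. SReg_96 covers three consecutive SGPRs).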
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
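
      // At call sites, fold the callee's already-computed resource usage into
      // this function; for indirect or external callees, fall back to
      // conservative assumptions.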
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
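
/// Any non-entrypoint function in the module could be the target of an
/// indirect call, so raise each indirect caller's register counts to the
/// module-wide maxima over all non-entrypoint functions.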
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}