//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
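
// Returns the statically known callee of a call pseudo's named callee operand,
// looking through pointer casts and aliases. An immediate operand (expected to
// be zero) means the callee is not statically known, i.e. the call is treated
// as indirect below.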
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
                                      AssumedStackSizeForExternalCall);

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
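      // Conservatively walk every register operand and record the highest
      // hardware register index touched in each register file (SGPR, VGPR,
      // AGPR).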
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
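        // Width is the operand's size in units of 32-bit registers, so the
        // highest hardware register index it touches is HWReg + Width - 1.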
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
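
      // Calls force conservative assumptions: a callee that is unknown or may
      // recurse is charged AssumedStackSizeForExternalCall (tail calls
      // excepted), and a call to unknown code (an indirect call or a call to
      // an external declaration) additionally assumes VCC usage, flat scratch
      // usage on targets with a flat address space, and a dynamically sized
      // stack.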
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}