//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to emit both textual assembly and binary
/// code. When passed an MCAsmStreamer it prints assembly, and when passed
/// an MCObjectStreamer it emits binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUMCResourceInfo.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "Utils/SIDefinesUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, leaves single precision at flush all, and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device, so it's
// probably best to just report no single precision denormals.
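// Packs the four two-bit mode fields in the layout the hardware MODE register
// uses (per the FP_*_MODE_* macros in SIDefines.h): SP round in bits [1:0],
// DP round in bits [3:2], SP denorm in bits [5:4], DP denorm in bits [7:6].
// With round-to-nearest (value 0) everywhere, only the denormal fields
// contribute set bits.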
static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
         FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer *AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  IsTargetStreamerInitialized = false;
}

void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  getTargetStreamer()->EmitDirectiveAMDGCNTarget();

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
        CodeObjectVersion);
    HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
  }

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);
}

void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Initialize the target streamer if it has not happened yet.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    getTargetStreamer()->EmitISAVersion();

  // Emit HSA Metadata (NT_AMD_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
    report_fatal_error(
        STM.getCPU() + " is only available on code object version 6 or better",
        /*gen_crash_diag*/ false);
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(*F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure the function's xnack settings are compatible with the module's
  // xnack settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() !=
          getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
                                   "' function does not match module xnack setting");
    return;
  }
  // Make sure the function's sramecc settings are compatible with the
  // module's sramecc settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() !=
          getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
                                   "' function does not match module sramecc setting");
    return;
  }

  if (!MFI.isEntryFunction())
    return;

  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    AMDGPUMCKernelCodeT KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    KernelCode.validate(&STM, MF->getContext());
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}

void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.pushSection();
  Streamer.switchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
  ReadOnlySection.ensureMinAlignment(Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      MCBinaryExpr::createSub(
          CurrentProgramInfo.NumSGPRsForWavesPerEU,
          AMDGPUMCExpr::createExtraSGPRs(
              CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
              getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
          Context),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}

void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
  Register RegNo = MI->getOperand(0).getReg();

  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "implicit-def: "
     << printReg(RegNo, MF->getSubtarget().getRegisterInfo());

  if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
    OS << " : SGPR spill to VGPR lane";

  OutStreamer->AddComment(OS.str());
  OutStreamer->addBlankLine();
}

void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(SymbolName,
                                              ELF::STT_AMDGPU_HSA_KERNEL);
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}

void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
      OutContext.reportError({},
                             Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
    Align Alignment = GV->getAlign().value_or(Align(4));

    emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    auto *TS = getTargetStreamer();
    TS->emitAMDGPULDS(GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doInitialization(Module &M) {
  CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    switch (CodeObjectVersion) {
    case AMDGPU::AMDHSA_COV4:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
      break;
    case AMDGPU::AMDHSA_COV5:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
      break;
    case AMDGPU::AMDHSA_COV6:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
      break;
    default:
      report_fatal_error("Unexpected code object version");
    }
  }

  return AsmPrinter::doInitialization(M);
}

void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
  if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
    return;

  using RIK = MCResourceInfo::ResourceInfoKind;
  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
  MCSymbol *FnSym = TM.getSymbol(&F);
  bool IsLocal = F.hasLocalLinkage();

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  const uint64_t MaxScratchPerWorkitem =
      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  MCSymbol *ScratchSizeSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
  uint64_t ScratchSize;
  if (ScratchSizeSymbol->isVariable() &&
      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
      ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
                                          DS_Error);
    F.getContext().diagnose(DiagStackSize);
  }

  // Validate addressable scalar registers (i.e., prior to added implicit
  // SGPRs).
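  // The implicit SGPRs (VCC, FLAT_SCRATCH, XNACK_MASK) are appended further
  // below via IsaInfo::getNumExtraSGPRs before the second, post-extra limit
  // check.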
  MCSymbol *NumSGPRSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (NumSGPRSymbol->isVariable() &&
        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      F.getContext().diagnose(Diag);
      return;
    }
  }

  MCSymbol *VCCUsedSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
  MCSymbol *FlatUsedSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
  uint64_t VCCUsed, FlatUsed, NumSgpr;

  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
      FlatUsedSymbol->isVariable() &&
      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {

    // Recomputes NumSgprs + implicit SGPRs, but all symbols should now be
    // resolvable.
    NumSgpr += IsaInfo::getNumExtraSGPRs(
        &STM, VCCUsed, FlatUsed,
        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
    if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
        STM.hasSGPRInitBug()) {
      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
      if (NumSgpr > MaxAddressableNumSGPRs) {
        DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
                                         MaxAddressableNumSGPRs, DS_Error,
                                         DK_ResourceLimit);
        F.getContext().diagnose(Diag);
        return;
      }
    }

    MCSymbol *NumVgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
    MCSymbol *NumAgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
    uint64_t NumVgpr, NumAgpr;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction *MF = MMI.getMachineFunction(F);
    if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
        TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
        TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
      const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
      unsigned MaxWaves = MFI.getMaxWavesPerEU();
      uint64_t TotalNumVgpr =
          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
      uint64_t NumVGPRsForWavesPerEU = std::max(
          {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
      uint64_t NumSGPRsForWavesPerEU = std::max(
          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
      const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
          STM.getOccupancyWithWorkGroupSizes(*MF).second,
          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
          OutContext);
      uint64_t Occupancy;

      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
          F, "amdgpu-waves-per-eu", {0, 0}, true);

      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
        DiagnosticInfoOptimizationFailure Diag(
            F, F.getSubprogram(),
            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
            "'" +
                F.getName()
+ "': desired occupancy was " + Twine(MinWEU) + 474 ", final occupancy is " + Twine(Occupancy)); 475 F.getContext().diagnose(Diag); 476 return; 477 } 478 } 479 } 480 } 481 482 bool AMDGPUAsmPrinter::doFinalization(Module &M) { 483 // Pad with s_code_end to help tools and guard against instruction prefetch 484 // causing stale data in caches. Arguably this should be done by the linker, 485 // which is why this isn't done for Mesa. 486 const MCSubtargetInfo &STI = *getGlobalSTI(); 487 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && 488 (STI.getTargetTriple().getOS() == Triple::AMDHSA || 489 STI.getTargetTriple().getOS() == Triple::AMDPAL)) { 490 OutStreamer->switchSection(getObjFileLowering().getTextSection()); 491 getTargetStreamer()->EmitCodeEnd(STI); 492 } 493 494 // Assign expressions which can only be resolved when all other functions are 495 // known. 496 RI.finalize(OutContext); 497 498 // Switch section and emit all GPR maximums within the processed module. 499 OutStreamer->pushSection(); 500 MCSectionELF *MaxGPRSection = 501 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0); 502 OutStreamer->switchSection(MaxGPRSection); 503 getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext), 504 RI.getMaxAGPRSymbol(OutContext), 505 RI.getMaxSGPRSymbol(OutContext)); 506 OutStreamer->popSection(); 507 508 for (Function &F : M.functions()) 509 validateMCResourceInfo(F); 510 511 RI.reset(); 512 513 return AsmPrinter::doFinalization(M); 514 } 515 516 SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { 517 SmallString<128> Str; 518 raw_svector_ostream OSS(Str); 519 auto &Streamer = getTargetStreamer()->getStreamer(); 520 auto &Context = Streamer.getContext(); 521 const MCExpr *New = foldAMDGPUMCExpr(Value, Context); 522 printAMDGPUMCExpr(New, OSS, MAI); 523 return Str; 524 } 525 526 // Print comments that apply to both callable functions and entry points. 
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
    const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
                              false);
  OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
  if (NumAGPR && TotalNumVGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
                              false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}

const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();
  uint16_t KernelCodeProperties = 0;
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (UserSGPRInfo.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (UserSGPRInfo.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (UserSGPRInfo.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (UserSGPRInfo.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  // CurrentProgramInfo.DynamicCallStack is an MCExpr and could be
  // un-evaluatable at this point, so it cannot be conditionally checked here.
  // Instead, we'll directly shift the possibly unknown MCExpr into its place
  // and bitwise-or it into KernelCodeProperties.
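  // In effect this builds the MCExpr:
  //   KernelCodeProperties | (DynamicCallStack << USES_DYNAMIC_STACK_SHIFT)
  // leaving the resolution of DynamicCallStack to the streamer.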
  const MCExpr *KernelCodePropExpr =
      MCConstantExpr::create(KernelCodeProperties, Ctx);
  const MCExpr *OrValue = MCConstantExpr::create(
      amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
  OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
                                    OrValue, Ctx);
  KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);

  return KernelCodePropExpr;
}

MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  int64_t PGRM_Rsrc3 = 1;
  bool EvaluatableRsrc3 =
      CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
  (void)PGRM_Rsrc3;
  (void)EvaluatableRsrc3;
  assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
         static_cast<uint64_t>(PGRM_Rsrc3) == 0);
  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(*MF.getFunction().getParent());

  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
  CurrentProgramInfo.reset(MF);

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  MCContext &Ctx = MF.getContext();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(ConfigSection);
  }

  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
      ResourceUsage->getResourceInfo();
  RI.gatherResourceInfo(MF, Info, OutContext);

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  }

  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer. This only works
    // with -filetype=obj.
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                           STM.hasMAIInsts());

  {
    using RIK = MCResourceInfo::ResourceInfoKind;
    getTargetStreamer()->EmitMCResourceInfo(
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
                     OutContext, IsLocal));
  }

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      using RIK = MCResourceInfo::ResourceInfoKind;
      OutStreamer->emitRawComment(" Function info:", false);

      emitCommonFunctionComments(
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
                       IsLocal)
              ->getVariableValue(),
          STM.hasMAIInsts()
              ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
                             OutContext, IsLocal)
                    ->getVariableValue()
              : nullptr,
          RI.createTotalNumVGPRs(MF, Ctx),
          RI.createTotalNumSGPRs(
              MF,
              MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
              Ctx),
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                       OutContext, IsLocal)
              ->getVariableValue(),
          getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ?
            CurrentProgramInfo.NumAccVGPR : nullptr,
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);

    OutStreamer->emitRawComment(
        " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
        false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
        false);

    if (STM.hasGFX90AInsts()) {
      const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
          CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
      AdjustedAccum = MCBinaryExpr::createMul(
          AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
      OutStreamer->emitRawComment(
          " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
    }

    OutStreamer->emitRawComment(
        " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
            getMCExprStr(CurrentProgramInfo.ScratchEnable),
        false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                    Twine(CurrentProgramInfo.UserSGPR),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                    Twine(CurrentProgramInfo.TGIdXEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                    Twine(CurrentProgramInfo.TGIdYEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                    Twine(CurrentProgramInfo.TGIdZEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                    Twine(CurrentProgramInfo.TIdIGCompCount),
                                false);

    [[maybe_unused]] int64_t PGMRSrc3;
    assert(STM.hasGFX90AInsts() ||
           (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
                PGMRSrc3) &&
            static_cast<uint64_t>(PGMRSrc3) == 0));
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
          false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
          false);
    }
  }

  if (DumpCodeInstEmitter) {
    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment =
"\n"; 853 if (!HexLines[i].empty()) { 854 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); 855 Comment += " ; " + HexLines[i] + "\n"; 856 } 857 858 OutStreamer->emitBytes(StringRef(DisasmLines[i])); 859 OutStreamer->emitBytes(StringRef(Comment)); 860 } 861 } 862 863 return false; 864 } 865 866 // TODO: Fold this into emitFunctionBodyStart. 867 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { 868 // In the beginning all features are either 'Any' or 'NotSupported', 869 // depending on global target features. This will cover empty modules. 870 getTargetStreamer()->initializeTargetID(*getGlobalSTI(), 871 getGlobalSTI()->getFeatureString()); 872 873 // If module is empty, we are done. 874 if (M.empty()) 875 return; 876 877 // If module is not empty, need to find first 'Off' or 'On' feature 878 // setting per feature from functions in module. 879 for (auto &F : M) { 880 auto &TSTargetID = getTargetStreamer()->getTargetID(); 881 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && 882 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) 883 break; 884 885 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); 886 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); 887 if (TSTargetID->isXnackSupported()) 888 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) 889 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); 890 if (TSTargetID->isSramEccSupported()) 891 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) 892 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); 893 } 894 } 895 896 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { 897 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 898 const SIInstrInfo *TII = STM.getInstrInfo(); 899 900 uint64_t CodeSize = 0; 901 902 for (const MachineBasicBlock &MBB : MF) { 903 for (const MachineInstr &MI : MBB) { 904 // TODO: CodeSize should account for multiple functions. 905 906 // TODO: Should we count size of debug info? 907 if (MI.isDebugInstr()) 908 continue; 909 910 CodeSize += TII->getInstSizeInBytes(MI); 911 } 912 } 913 914 return CodeSize; 915 } 916 917 // AccumOffset computed for the MCExpr equivalent of: 918 // alignTo(std::max(1, NumVGPR), 4) / 4 - 1; 919 static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) { 920 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx); 921 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx); 922 923 // Can't be lower than 1 for subsequent alignTo. 924 const MCExpr *MaximumTaken = 925 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx); 926 927 // Practically, it's computing divideCeil(MaximumTaken, 4). 
  const MCExpr *DivCeil = MCBinaryExpr::createDiv(
      AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
      Ctx);

  return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  MCContext &Ctx = MF.getContext();

  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym =
        RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
    return MCSymbolRefExpr::create(Sym, Ctx);
  };

  using RIK = MCResourceInfo::ResourceInfoKind;
  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);

  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are duplicated in part in
  // AMDGPUAsmParser::calculateGPRBlocks, and could be unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
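  // Roughly: 'inreg' arguments count against SGPRs and all other arguments
  // against VGPRs, at ceil(bit-width / 32) registers per argument (see the
  // loop over F.args() below).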
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

  if (isShader(F.getCallingConv())) {
    bool IsPixelShader =
        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

    // Calculate the number of VGPRs based on the SPI input registers.
    uint32_t InputEna = 0;
    uint32_t InputAddr = 0;
    unsigned LastEna = 0;

    if (IsPixelShader) {
      // Note for IsPixelShader:
      // By this stage, all enabled inputs are tagged in InputAddr as well.
      // We will use InputAddr to determine whether the input counts against the
      // vgpr total and only use the InputEnable to determine the last input
      // that is relevant - if extra arguments are used, then we have to honour
      // the InputAddr for any intermediate non-enabled inputs.
      InputEna = MFI->getPSInputEnable();
      InputAddr = MFI->getPSInputAddr();

      // We only need to consider input args up to the last used arg.
      assert((InputEna || InputAddr) &&
             "PSInputAddr and PSInputEnable should "
             "never both be 0 for AMDGPU_PS shaders");
      // There are some rare circumstances where InputAddr is non-zero and
      // InputEna can be set to 0. In this case we default to setting LastEna
      // to 1.
      LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
    }

    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getDataLayout();
    unsigned PSArgCount = 0;
    unsigned IntermediateVGPR = 0;
    for (auto &Arg : F.args()) {
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg)) {
        WaveDispatchNumSGPR += NumRegs;
      } else {
        // If this is a PS shader and we're processing the PS input args (first
        // 16 VGPRs), use the InputEna and InputAddr bits to define how many
        // VGPRs are actually used.
        // Any extra VGPR arguments are handled as normal arguments (and
        // contribute to the VGPR count whether they're used or not).
        if (IsPixelShader && PSArgCount < 16) {
          if ((1 << PSArgCount) & InputAddr) {
            if (PSArgCount < LastEna)
              WaveDispatchNumVGPR += NumRegs;
            else
              IntermediateVGPR += NumRegs;
          }
          PSArgCount++;
        } else {
          // If there are extra arguments, we have to include the allocation
          // for the non-used (but enabled via InputAddr) input arguments.
          if (IntermediateVGPR) {
            WaveDispatchNumVGPR += IntermediateVGPR;
            IntermediateVGPR = 0;
          }
          WaveDispatchNumVGPR += NumRegs;
        }
      }
    }
    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);

    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  } else if (isKernel(F.getCallingConv()) &&
             MFI->getNumKernargPreloadedSGPRs()) {
    // Consider cases where the total number of UserSGPRs with trailing
    // allocated preload SGPRs is greater than the number of explicitly
    // referenced SGPRs.
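    // i.e. take max(NumSGPR, NumUserSGPRs + ExtraSGPRs) so that preloaded but
    // otherwise unreferenced user SGPRs still count toward the total.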
    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
    ProgInfo.NumSGPR =
        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
  }

  // Adjust the number of registers used to meet the default/requested
  // minimum/maximum number of waves per execution unit.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
                              Ctx);

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() >
      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };

  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
                                        IsaInfo::getSGPREncodingGranule(&STM));
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(&STM));

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize the FP_ROUND and FP_DENORM parts of the mode
  // register.
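  // This packed byte feeds the FLOAT_MODE field of COMPUTE_PGM_RSRC1 (see
  // getFPMode above).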
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make the clamp modifier return 0 on NaN input.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
    // LDS is allocated in 320 dword blocks.
    LDSAlignShift = 11;
  } else if (STM.getFeatureBits().test(
                 FeatureAddressableLocalMemorySize65536)) {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  } else {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't
  // harm anything to disable it if we know the stack isn't used here. We may
  // still have emitted code reading it to initialize scratch, but if that's
  // unused reading garbage should be OK.
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
                             MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable =
      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ?
                         0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  if (STM.hasGFX90AInsts()) {
    // return ((Dst & ~Mask) | (Value << Shift))
    auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                          uint32_t Shift) {
      const auto *Shft = MCConstantExpr::create(Shift, Ctx);
      const auto *Msk = MCConstantExpr::create(Mask, Ctx);
      Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
      Dst = MCBinaryExpr::createOr(
          Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
      return Dst;
    };

    ProgInfo.ComputePGMRSrc3GFX90A =
        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3GFX90A =
        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);

  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: [[fallthrough]];
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
  MCContext &Ctx = MF.getContext();

  // (((Value) & Mask) << Shift)
  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
                                   shft, Ctx);
  };

  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val))
      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
    else
      OutStreamer->emitValue(Value, Size);
  };

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
                       /*Size=*/4);

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);

    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
        MF.getContext());
    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ?
                                    divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// Helper function to add the common PAL Metadata 3.0+ entries.
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST) {
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
  }

  MD->setHwStage(CC, ".lds_size",
                 (unsigned)(CurrentProgramInfo.LdsSize *
                            getLdsDwGranularity(ST) * sizeof(uint32_t)));
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto *MD = getTargetStreamer()->getPALMetadata();
  auto &Ctx = MF.getContext();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);

  // Only set AGPRs for supported devices.
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
    } else {
      const MCExpr *HasScratchBlocks =
          MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
                                 MCConstantExpr::create(0, Ctx), Ctx);
      auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
      MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
    }
  } else {
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
  }

  // ScratchSize is in bytes, 16-byte aligned.
  MD->setScratchSize(
      CC,
      AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
                                  MCConstantExpr::create(16, Ctx), Ctx),
      Ctx);

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(
          CC,
          MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
          Ctx);
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set the .spi_ps_input_ena and .spi_ps_input_addr fields from the
      // PSInputEna and PSInputAddr bits.
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena",    ".persp_center_ena",
          ".persp_centroid_ena",  ".persp_pull_model_ena",
          ".linear_sample_ena",   ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena",     ".pos_y_float_ena",
          ".pos_z_float_ena",     ".pos_w_float_ena",
          ".front_face_ena",      ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above, the wavefront size is already set in the
  // metadata.
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CallingConv::AMDGPU_CS,
        CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}

// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}
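// Note: amd_element_byte_size_t encodes the element size logarithmically
// rather than as a byte count (hence the comment above), so the switch
// translates each hardware-supported private element size (4, 8, or 16 bytes)
// to its enumerator and rejects anything else.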
void
AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                   const SIProgramInfo &CurrentProgramInfo,
                                   const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);

  Out.compute_pgm_resource1_registers =
      CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
  Out.compute_pgm_resource2_registers =
      CurrentProgramInfo.getComputePGMRSrc2(Ctx);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;

  AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (UserSGPRInfo.hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (UserSGPRInfo.hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (UserSGPRInfo.hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (UserSGPRInfo.hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (UserSGPRInfo.hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (UserSGPRInfo.hasPrivateSegmentSize())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}

bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }
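  // Note the inline-asm printing convention: returning false means the
  // operand was printed successfully, while returning true asks the generic
  // AsmPrinter code to diagnose an invalid operand or modifier.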
  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }
  if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}

void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AU.addRequired<MachineModuleInfoWrapperPass>();
  AU.addPreserved<MachineModuleInfoWrapperPass>();
  AsmPrinter::getAnalysisUsage(AU);
}

void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = "    ";

  // If the remark is not specifically enabled, do not output to yaml.
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
    return;

  // Currently non-kernel functions have no resources to emit.
  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
    return;

  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage values go with which kernel
    // since the kernel name will always be displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (RemarkName != "FunctionName")
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               MF.getFunction().getSubprogram(),
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
                          getMCExprStr(CurrentProgramInfo.NumSGPR));
  EmitResourceUsageRemark("NumVGPR", "VGPRs",
                          getMCExprStr(CurrentProgramInfo.NumArchVGPR));
  if (hasMAIInsts) {
    EmitResourceUsageRemark("NumAGPR", "AGPRs",
                            getMCExprStr(CurrentProgramInfo.NumAccVGPR));
  }
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          getMCExprStr(CurrentProgramInfo.ScratchSize));
  int64_t DynStack;
  bool DynStackEvaluatable =
      CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
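  // DynamicCallStack is an MCExpr that may still be symbolic here (e.g. when
  // it depends on callees whose stack usage is not yet resolved), so report
  // "True" only when it folds to a known non-zero constant.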
"True" : "False"; 1723 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr); 1724 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", 1725 getMCExprStr(CurrentProgramInfo.Occupancy)); 1726 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", 1727 CurrentProgramInfo.SGPRSpill); 1728 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", 1729 CurrentProgramInfo.VGPRSpill); 1730 if (isModuleEntryFunction) 1731 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]", 1732 CurrentProgramInfo.LDSSize); 1733 } 1734