xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 ///
/// The AMDGPUAsmPrinter is used to emit both textual assembly and binary
/// code.  When passed an MCAsmStreamer it prints assembly, and when passed
/// an MCObjectStreamer it outputs binary code.
14 //
15 //===----------------------------------------------------------------------===//
16 //
17 
18 #include "AMDGPUAsmPrinter.h"
19 #include "AMDGPU.h"
20 #include "AMDGPUHSAMetadataStreamer.h"
21 #include "AMDGPUMCResourceInfo.h"
22 #include "AMDGPUResourceUsageAnalysis.h"
23 #include "GCNSubtarget.h"
24 #include "MCTargetDesc/AMDGPUInstPrinter.h"
25 #include "MCTargetDesc/AMDGPUMCExpr.h"
26 #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
27 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
28 #include "R600AsmPrinter.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "TargetInfo/AMDGPUTargetInfo.h"
31 #include "Utils/AMDGPUBaseInfo.h"
32 #include "Utils/AMDKernelCodeTUtils.h"
33 #include "Utils/SIDefinesUtils.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/BinaryFormat/ELF.h"
36 #include "llvm/CodeGen/MachineFrameInfo.h"
37 #include "llvm/CodeGen/MachineModuleInfo.h"
38 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
39 #include "llvm/IR/DiagnosticInfo.h"
40 #include "llvm/MC/MCAssembler.h"
41 #include "llvm/MC/MCContext.h"
42 #include "llvm/MC/MCSectionELF.h"
43 #include "llvm/MC/MCStreamer.h"
44 #include "llvm/MC/TargetRegistry.h"
45 #include "llvm/Support/AMDHSAKernelDescriptor.h"
46 #include "llvm/Target/TargetLoweringObjectFile.h"
47 #include "llvm/Target/TargetMachine.h"
48 #include "llvm/TargetParser/TargetParser.h"
49 
50 using namespace llvm;
51 using namespace llvm::AMDGPU;
52 
53 // This should get the default rounding mode from the kernel. We just set the
54 // default here, but this could change if the OpenCL rounding mode pragmas are
55 // used.
56 //
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// it can also be overridden to flush with the -cl-denorms-are-zero compiler
// flag.
60 //
61 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
62 // precision, and leaves single precision to flush all and does not report
63 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
64 // CL_FP_DENORM for both.
65 //
66 // FIXME: It seems some instructions do not support single precision denormals
67 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
68 // and sin_f32, cos_f32 on most parts).
69 
70 // We want to use these instructions, and using fp32 denormals also causes
71 // instructions to run at the double precision rate for the device so it's
72 // probably best to just report no single precision denormals.
73 static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
74   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
75          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
76          FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
77          FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
78 }
79 
80 static AsmPrinter *
81 createAMDGPUAsmPrinterPass(TargetMachine &tm,
82                            std::unique_ptr<MCStreamer> &&Streamer) {
83   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
84 }
85 
86 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
87   TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
88                                      llvm::createR600AsmPrinterPass);
89   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
90                                      createAMDGPUAsmPrinterPass);
91 }
92 
93 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
94                                    std::unique_ptr<MCStreamer> Streamer)
95     : AsmPrinter(TM, std::move(Streamer)) {
96   assert(OutStreamer && "AsmPrinter constructed without streamer");
97 }
98 
99 StringRef AMDGPUAsmPrinter::getPassName() const {
100   return "AMDGPU Assembly Printer";
101 }
102 
103 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
104   return TM.getMCSubtargetInfo();
105 }
106 
107 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
108   if (!OutStreamer)
109     return nullptr;
110   return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
111 }
112 
// Only clears the "target streamer initialized" flag. The actual
// initialization is deferred to initTargetStreamer(), which is invoked from
// runOnMachineFunction() or emitEndOfAsmFile() when the flag is still clear.
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  IsTargetStreamerInitialized = false;
}
116 
117 void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
118   IsTargetStreamerInitialized = true;
119 
120   // TODO: Which one is called first, emitStartOfAsmFile or
121   // emitFunctionBodyStart?
122   if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
123     initializeTargetID(M);
124 
125   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
126       TM.getTargetTriple().getOS() != Triple::AMDPAL)
127     return;
128 
129   getTargetStreamer()->EmitDirectiveAMDGCNTarget();
130 
131   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
132     getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
133         CodeObjectVersion);
134     HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
135   }
136 
137   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
138     getTargetStreamer()->getPALMetadata()->readFromIR(M);
139 }
140 
141 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
142   // Init target streamer if it has not yet happened
143   if (!IsTargetStreamerInitialized)
144     initTargetStreamer(M);
145 
146   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
147     getTargetStreamer()->EmitISAVersion();
148 
149   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
150   // Emit HSA Metadata (NT_AMD_HSA_METADATA).
151   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
152     HSAMetadataStream->end();
153     bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
154     (void)Success;
155     assert(Success && "Malformed HSA Metadata");
156   }
157 }
158 
159 void AMDGPUAsmPrinter::emitFunctionBodyStart() {
160   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
161   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
162   const Function &F = MF->getFunction();
163 
164   // TODO: We're checking this late, would be nice to check it earlier.
165   if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
166     report_fatal_error(
167         STM.getCPU() + " is only available on code object version 6 or better",
168         /*gen_crash_diag*/ false);
169   }
170 
171   // TODO: Which one is called first, emitStartOfAsmFile or
172   // emitFunctionBodyStart?
173   if (!getTargetStreamer()->getTargetID())
174     initializeTargetID(*F.getParent());
175 
176   const auto &FunctionTargetID = STM.getTargetID();
177   // Make sure function's xnack settings are compatible with module's
178   // xnack settings.
179   if (FunctionTargetID.isXnackSupported() &&
180       FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
181       FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
182     OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
183                            "' function does not match module xnack setting");
184     return;
185   }
186   // Make sure function's sramecc settings are compatible with module's
187   // sramecc settings.
188   if (FunctionTargetID.isSramEccSupported() &&
189       FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
190       FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
191     OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
192                            "' function does not match module sramecc setting");
193     return;
194   }
195 
196   if (!MFI.isEntryFunction())
197     return;
198 
199   if (STM.isMesaKernel(F) &&
200       (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
201        F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
202     AMDGPUMCKernelCodeT KernelCode;
203     getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
204     KernelCode.validate(&STM, MF->getContext());
205     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
206   }
207 
208   if (STM.isAmdHsaOS())
209     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
210 }
211 
212 void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
213   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
214   if (!MFI.isEntryFunction())
215     return;
216 
217   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
218     return;
219 
220   auto &Streamer = getTargetStreamer()->getStreamer();
221   auto &Context = Streamer.getContext();
222   auto &ObjectFileInfo = *Context.getObjectFileInfo();
223   auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
224 
225   Streamer.pushSection();
226   Streamer.switchSection(&ReadOnlySection);
227 
228   // CP microcode requires the kernel descriptor to be allocated on 64 byte
229   // alignment.
230   Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
231   ReadOnlySection.ensureMinAlignment(Align(64));
232 
233   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
234 
235   SmallString<128> KernelName;
236   getNameWithPrefix(KernelName, &MF->getFunction());
237   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
238       STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
239       CurrentProgramInfo.NumVGPRsForWavesPerEU,
240       MCBinaryExpr::createSub(
241           CurrentProgramInfo.NumSGPRsForWavesPerEU,
242           AMDGPUMCExpr::createExtraSGPRs(
243               CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
244               getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
245           Context),
246       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
247 
248   Streamer.popSection();
249 }
250 
251 void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
252   Register RegNo = MI->getOperand(0).getReg();
253 
254   SmallString<128> Str;
255   raw_svector_ostream OS(Str);
256   OS << "implicit-def: "
257      << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
258 
259   if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
260     OS << " : SGPR spill to VGPR lane";
261 
262   OutStreamer->AddComment(OS.str());
263   OutStreamer->addBlankLine();
264 }
265 
266 void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
267   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
268     AsmPrinter::emitFunctionEntryLabel();
269     return;
270   }
271 
272   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
273   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
274   if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
275     SmallString<128> SymbolName;
276     getNameWithPrefix(SymbolName, &MF->getFunction()),
277     getTargetStreamer()->EmitAMDGPUSymbolType(
278         SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
279   }
280   if (DumpCodeInstEmitter) {
281     // Disassemble function name label to text.
282     DisasmLines.push_back(MF->getName().str() + ":");
283     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
284     HexLines.emplace_back("");
285   }
286 
287   AsmPrinter::emitFunctionEntryLabel();
288 }
289 
290 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
291   if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
292     // Write a line for the basic block label if it is not only fallthrough.
293     DisasmLines.push_back(
294         (Twine("BB") + Twine(getFunctionNumber())
295          + "_" + Twine(MBB.getNumber()) + ":").str());
296     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
297     HexLines.emplace_back("");
298   }
299   AsmPrinter::emitBasicBlockStart(MBB);
300 }
301 
302 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
303   if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
304     if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
305       OutContext.reportError({},
306                              Twine(GV->getName()) +
307                                  ": unsupported initializer for address space");
308       return;
309     }
310 
311     // LDS variables aren't emitted in HSA or PAL yet.
312     const Triple::OSType OS = TM.getTargetTriple().getOS();
313     if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
314       return;
315 
316     MCSymbol *GVSym = getSymbol(GV);
317 
318     GVSym->redefineIfPossible();
319     if (GVSym->isDefined() || GVSym->isVariable())
320       report_fatal_error("symbol '" + Twine(GVSym->getName()) +
321                          "' is already defined");
322 
323     const DataLayout &DL = GV->getDataLayout();
324     uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
325     Align Alignment = GV->getAlign().value_or(Align(4));
326 
327     emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
328     emitLinkage(GV, GVSym);
329     auto *TS = getTargetStreamer();
330     TS->emitAMDGPULDS(GVSym, Size, Alignment);
331     return;
332   }
333 
334   AsmPrinter::emitGlobalVariable(GV);
335 }
336 
337 bool AMDGPUAsmPrinter::doInitialization(Module &M) {
338   CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
339 
340   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
341     switch (CodeObjectVersion) {
342     case AMDGPU::AMDHSA_COV4:
343       HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
344       break;
345     case AMDGPU::AMDHSA_COV5:
346       HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
347       break;
348     case AMDGPU::AMDHSA_COV6:
349       HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
350       break;
351     default:
352       report_fatal_error("Unexpected code object version");
353     }
354   }
355 
356   return AsmPrinter::doInitialization(M);
357 }
358 
// Checks the per-function resource symbols of \p F against subtarget limits
// and reports diagnostics through F's LLVMContext. It is called from
// doFinalization() after RI.finalize(). Each limit is only checked if the
// corresponding symbol is a variable whose expression evaluates to an
// absolute value; anything that cannot be evaluated is silently skipped.
//
// Checks, in order:
//  1. private segment (scratch) size per work-item,
//  2. addressable SGPRs before implicit SGPRs are added (VOLCANIC_ISLANDS and
//     later without the SGPR init bug),
//  3. total SGPRs including implicit ones (SEA_ISLANDS and earlier, or with
//     the SGPR init bug),
//  4. achieved occupancy versus the minimum of "amdgpu-waves-per-eu".
void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
  // Only defined functions with a module-entry calling convention are checked.
  if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
    return;

  using RIK = MCResourceInfo::ResourceInfoKind;
  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
  MCSymbol *FnSym = TM.getSymbol(&F);
  bool IsLocal = F.hasLocalLinkage();

  // Evaluates Value to an absolute integer. On success stores it in Res and
  // returns true; on failure Res is left untouched.
  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  // 1. Scratch size. Note that this check does not return after diagnosing,
  // so the remaining checks still run.
  const uint64_t MaxScratchPerWorkitem =
      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  MCSymbol *ScratchSizeSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
  uint64_t ScratchSize;
  if (ScratchSizeSymbol->isVariable() &&
      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
      ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
                                          DS_Error);
    F.getContext().diagnose(DiagStackSize);
  }

  // Validate addressable scalar registers (i.e., prior to added implicit
  // SGPRs).
  MCSymbol *NumSGPRSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    // This NumSgpr is scoped to this check; a second variable of the same name
    // is declared below for the implicit-SGPR computation.
    uint64_t NumSgpr;
    if (NumSGPRSymbol->isVariable() &&
        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      F.getContext().diagnose(Diag);
      return;
    }
  }

  MCSymbol *VCCUsedSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
  MCSymbol *FlatUsedSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
  // Only read inside the branch below, after all three were evaluated.
  uint64_t VCCUsed, FlatUsed, NumSgpr;

  // The remaining checks need SGPR count, VCC use and flat-scratch use to all
  // be resolvable.
  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
      FlatUsedSymbol->isVariable() &&
      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {

    // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
    // resolvable.
    NumSgpr += IsaInfo::getNumExtraSGPRs(
        &STM, VCCUsed, FlatUsed,
        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
    // 3. On these subtargets the limit applies to the total including the
    // implicit SGPRs added above.
    if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
        STM.hasSGPRInitBug()) {
      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
      if (NumSgpr > MaxAddressableNumSGPRs) {
        DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
                                         MaxAddressableNumSGPRs, DS_Error,
                                         DK_ResourceLimit);
        F.getContext().diagnose(Diag);
        return;
      }
    }

    MCSymbol *NumVgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
    MCSymbol *NumAgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
    uint64_t NumVgpr, NumAgpr;

    // NOTE: this local MF is looked up for F and hides the MF that the
    // per-function hooks of this class use; it may be null, hence the check.
    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction *MF = MMI.getMachineFunction(F);
    if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
        TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
        TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
      // 4. Recompute the occupancy from the now-resolved register counts.
      const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
      unsigned MaxWaves = MFI.getMaxWavesPerEU();
      uint64_t TotalNumVgpr =
          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
      // Clamp both counts from below by 1 and by the subtarget minimum for
      // MaxWaves.
      uint64_t NumVGPRsForWavesPerEU = std::max(
          {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
      uint64_t NumSGPRsForWavesPerEU = std::max(
          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
      const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
          STM.getOccupancyWithWorkGroupSizes(*MF).second,
          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
          OutContext);
      uint64_t Occupancy;

      // Only the minimum of the attribute pair is compared below; the default
      // pair is {0, 0}.
      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
          F, "amdgpu-waves-per-eu", {0, 0}, true);

      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
        DiagnosticInfoOptimizationFailure Diag(
            F, F.getSubprogram(),
            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
            "'" +
                F.getName() + "': desired occupancy was " + Twine(MinWEU) +
                ", final occupancy is " + Twine(Occupancy));
        F.getContext().diagnose(Diag);
        return;
      }
    }
  }
}
481 
482 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
483   // Pad with s_code_end to help tools and guard against instruction prefetch
484   // causing stale data in caches. Arguably this should be done by the linker,
485   // which is why this isn't done for Mesa.
486   const MCSubtargetInfo &STI = *getGlobalSTI();
487   if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
488       (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
489        STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
490     OutStreamer->switchSection(getObjFileLowering().getTextSection());
491     getTargetStreamer()->EmitCodeEnd(STI);
492   }
493 
494   // Assign expressions which can only be resolved when all other functions are
495   // known.
496   RI.finalize(OutContext);
497 
498   // Switch section and emit all GPR maximums within the processed module.
499   OutStreamer->pushSection();
500   MCSectionELF *MaxGPRSection =
501       OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
502   OutStreamer->switchSection(MaxGPRSection);
503   getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
504                                               RI.getMaxAGPRSymbol(OutContext),
505                                               RI.getMaxSGPRSymbol(OutContext));
506   OutStreamer->popSection();
507 
508   for (Function &F : M.functions())
509     validateMCResourceInfo(F);
510 
511   RI.reset();
512 
513   return AsmPrinter::doFinalization(M);
514 }
515 
516 SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
517   SmallString<128> Str;
518   raw_svector_ostream OSS(Str);
519   auto &Streamer = getTargetStreamer()->getStreamer();
520   auto &Context = Streamer.getContext();
521   const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
522   printAMDGPUMCExpr(New, OSS, MAI);
523   return Str;
524 }
525 
526 // Print comments that apply to both callable functions and entry points.
527 void AMDGPUAsmPrinter::emitCommonFunctionComments(
528     const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
529     const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
530     const AMDGPUMachineFunction *MFI) {
531   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
532   OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
533                               false);
534   OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
535   if (NumAGPR && TotalNumVGPR) {
536     OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
537     OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
538                                 false);
539   }
540   OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
541                               false);
542   OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
543                               false);
544 }
545 
546 const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
547     const MachineFunction &MF) const {
548   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
549   MCContext &Ctx = MF.getContext();
550   uint16_t KernelCodeProperties = 0;
551   const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
552 
553   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
554     KernelCodeProperties |=
555         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
556   }
557   if (UserSGPRInfo.hasDispatchPtr()) {
558     KernelCodeProperties |=
559         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
560   }
561   if (UserSGPRInfo.hasQueuePtr()) {
562     KernelCodeProperties |=
563         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
564   }
565   if (UserSGPRInfo.hasKernargSegmentPtr()) {
566     KernelCodeProperties |=
567         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
568   }
569   if (UserSGPRInfo.hasDispatchID()) {
570     KernelCodeProperties |=
571         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
572   }
573   if (UserSGPRInfo.hasFlatScratchInit()) {
574     KernelCodeProperties |=
575         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
576   }
577   if (UserSGPRInfo.hasPrivateSegmentSize()) {
578     KernelCodeProperties |=
579         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
580   }
581   if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
582     KernelCodeProperties |=
583         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
584   }
585 
586   // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
587   // un-evaluatable at this point so it cannot be conditionally checked here.
588   // Instead, we'll directly shift the possibly unknown MCExpr into its place
589   // and bitwise-or it into KernelCodeProperties.
590   const MCExpr *KernelCodePropExpr =
591       MCConstantExpr::create(KernelCodeProperties, Ctx);
592   const MCExpr *OrValue = MCConstantExpr::create(
593       amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
594   OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
595                                     OrValue, Ctx);
596   KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
597 
598   return KernelCodePropExpr;
599 }
600 
601 MCKernelDescriptor
602 AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
603                                             const SIProgramInfo &PI) const {
604   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
605   const Function &F = MF.getFunction();
606   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
607   MCContext &Ctx = MF.getContext();
608 
609   MCKernelDescriptor KernelDescriptor;
610 
611   KernelDescriptor.group_segment_fixed_size =
612       MCConstantExpr::create(PI.LDSSize, Ctx);
613   KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
614 
615   Align MaxKernArgAlign;
616   KernelDescriptor.kernarg_size = MCConstantExpr::create(
617       STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
618 
619   KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
620   KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
621   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
622 
623   int64_t PGRM_Rsrc3 = 1;
624   bool EvaluatableRsrc3 =
625       CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
626   (void)PGRM_Rsrc3;
627   (void)EvaluatableRsrc3;
628   assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
629          static_cast<uint64_t>(PGRM_Rsrc3) == 0);
630   KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
631 
632   KernelDescriptor.kernarg_preload = MCConstantExpr::create(
633       AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
634       Ctx);
635 
636   return KernelDescriptor;
637 }
638 
639 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
640   // Init target streamer lazily on the first function so that previous passes
641   // can set metadata.
642   if (!IsTargetStreamerInitialized)
643     initTargetStreamer(*MF.getFunction().getParent());
644 
645   ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
646   CurrentProgramInfo.reset(MF);
647 
648   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
649   MCContext &Ctx = MF.getContext();
650 
651   // The starting address of all shader programs must be 256 bytes aligned.
652   // Regular functions just need the basic required instruction alignment.
653   MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
654 
655   SetupMachineFunction(MF);
656 
657   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
658   MCContext &Context = getObjFileLowering().getContext();
659   bool IsLocal = MF.getFunction().hasLocalLinkage();
660   // FIXME: This should be an explicit check for Mesa.
661   if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
662     MCSectionELF *ConfigSection =
663         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
664     OutStreamer->switchSection(ConfigSection);
665   }
666 
667   const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
668       ResourceUsage->getResourceInfo();
669   RI.gatherResourceInfo(MF, Info, OutContext);
670 
671   if (MFI->isModuleEntryFunction()) {
672     getSIProgramInfo(CurrentProgramInfo, MF);
673   }
674 
675   if (STM.isAmdPalOS()) {
676     if (MFI->isEntryFunction())
677       EmitPALMetadata(MF, CurrentProgramInfo);
678     else if (MFI->isModuleEntryFunction())
679       emitPALFunctionMetadata(MF);
680   } else if (!STM.isAmdHsaOS()) {
681     EmitProgramInfoSI(MF, CurrentProgramInfo);
682   }
683 
684   DumpCodeInstEmitter = nullptr;
685   if (STM.dumpCode()) {
686     // For -dumpcode, get the assembler out of the streamer. This only works
687     // with -filetype=obj.
688     MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
689     if (Assembler)
690       DumpCodeInstEmitter = Assembler->getEmitterPtr();
691   }
692 
693   DisasmLines.clear();
694   HexLines.clear();
695   DisasmLineMaxLen = 0;
696 
697   emitFunctionBody();
698 
699   emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
700                            STM.hasMAIInsts());
701 
702   {
703     using RIK = MCResourceInfo::ResourceInfoKind;
704     getTargetStreamer()->EmitMCResourceInfo(
705         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
706                      IsLocal),
707         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
708                      IsLocal),
709         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
710                      IsLocal),
711         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
712                      OutContext, IsLocal),
713         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
714                      IsLocal),
715         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
716                      OutContext, IsLocal),
717         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
718                      OutContext, IsLocal),
719         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
720                      IsLocal),
721         RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
722                      OutContext, IsLocal));
723   }
724 
725   if (isVerbose()) {
726     MCSectionELF *CommentSection =
727         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
728     OutStreamer->switchSection(CommentSection);
729 
730     if (!MFI->isEntryFunction()) {
731       using RIK = MCResourceInfo::ResourceInfoKind;
732       OutStreamer->emitRawComment(" Function info:", false);
733 
734       emitCommonFunctionComments(
735           RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
736                        IsLocal)
737               ->getVariableValue(),
738           STM.hasMAIInsts()
739               ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
740                              OutContext, IsLocal)
741                     ->getVariableValue()
742               : nullptr,
743           RI.createTotalNumVGPRs(MF, Ctx),
744           RI.createTotalNumSGPRs(
745               MF,
746               MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
747               Ctx),
748           RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
749                        OutContext, IsLocal)
750               ->getVariableValue(),
751           getFunctionCodeSize(MF), MFI);
752       return false;
753     }
754 
755     OutStreamer->emitRawComment(" Kernel info:", false);
756     emitCommonFunctionComments(
757         CurrentProgramInfo.NumArchVGPR,
758         STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
759         CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
760         CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
761 
762     OutStreamer->emitRawComment(
763       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
764     OutStreamer->emitRawComment(
765       " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
766     OutStreamer->emitRawComment(
767       " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
768       " bytes/workgroup (compile time only)", false);
769 
770     OutStreamer->emitRawComment(
771         " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
772 
773     OutStreamer->emitRawComment(
774         " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
775 
776     OutStreamer->emitRawComment(
777         " NumSGPRsForWavesPerEU: " +
778             getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
779         false);
780     OutStreamer->emitRawComment(
781         " NumVGPRsForWavesPerEU: " +
782             getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
783         false);
784 
785     if (STM.hasGFX90AInsts()) {
786       const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
787           CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
788       AdjustedAccum = MCBinaryExpr::createMul(
789           AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
790       OutStreamer->emitRawComment(
791           " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
792     }
793 
794     OutStreamer->emitRawComment(
795         " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
796 
797     OutStreamer->emitRawComment(
798       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
799 
800     OutStreamer->emitRawComment(
801         " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
802             getMCExprStr(CurrentProgramInfo.ScratchEnable),
803         false);
804     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
805                                     Twine(CurrentProgramInfo.UserSGPR),
806                                 false);
807     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
808                                     Twine(CurrentProgramInfo.TrapHandlerEnable),
809                                 false);
810     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
811                                     Twine(CurrentProgramInfo.TGIdXEnable),
812                                 false);
813     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
814                                     Twine(CurrentProgramInfo.TGIdYEnable),
815                                 false);
816     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
817                                     Twine(CurrentProgramInfo.TGIdZEnable),
818                                 false);
819     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
820                                     Twine(CurrentProgramInfo.TIdIGCompCount),
821                                 false);
822 
823     [[maybe_unused]] int64_t PGMRSrc3;
824     assert(STM.hasGFX90AInsts() ||
825            (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
826                 PGMRSrc3) &&
827             static_cast<uint64_t>(PGMRSrc3) == 0));
828     if (STM.hasGFX90AInsts()) {
829       OutStreamer->emitRawComment(
830           " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
831               getMCExprStr(MCKernelDescriptor::bits_get(
832                   CurrentProgramInfo.ComputePGMRSrc3GFX90A,
833                   amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
834                   amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
835           false);
836       OutStreamer->emitRawComment(
837           " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
838               getMCExprStr(MCKernelDescriptor::bits_get(
839                   CurrentProgramInfo.ComputePGMRSrc3GFX90A,
840                   amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
841                   amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
842           false);
843     }
844   }
845 
846   if (DumpCodeInstEmitter) {
847 
848     OutStreamer->switchSection(
849         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
850 
851     for (size_t i = 0; i < DisasmLines.size(); ++i) {
852       std::string Comment = "\n";
853       if (!HexLines[i].empty()) {
854         Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
855         Comment += " ; " + HexLines[i] + "\n";
856       }
857 
858       OutStreamer->emitBytes(StringRef(DisasmLines[i]));
859       OutStreamer->emitBytes(StringRef(Comment));
860     }
861   }
862 
863   return false;
864 }
865 
866 // TODO: Fold this into emitFunctionBodyStart.
867 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
868   // In the beginning all features are either 'Any' or 'NotSupported',
869   // depending on global target features. This will cover empty modules.
870   getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
871                                           getGlobalSTI()->getFeatureString());
872 
873   // If module is empty, we are done.
874   if (M.empty())
875     return;
876 
877   // If module is not empty, need to find first 'Off' or 'On' feature
878   // setting per feature from functions in module.
879   for (auto &F : M) {
880     auto &TSTargetID = getTargetStreamer()->getTargetID();
881     if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
882         (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
883       break;
884 
885     const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
886     const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
887     if (TSTargetID->isXnackSupported())
888       if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
889         TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
890     if (TSTargetID->isSramEccSupported())
891       if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
892         TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
893   }
894 }
895 
896 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
897   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
898   const SIInstrInfo *TII = STM.getInstrInfo();
899 
900   uint64_t CodeSize = 0;
901 
902   for (const MachineBasicBlock &MBB : MF) {
903     for (const MachineInstr &MI : MBB) {
904       // TODO: CodeSize should account for multiple functions.
905 
906       // TODO: Should we count size of debug info?
907       if (MI.isDebugInstr())
908         continue;
909 
910       CodeSize += TII->getInstSizeInBytes(MI);
911     }
912   }
913 
914   return CodeSize;
915 }
916 
917 // AccumOffset computed for the MCExpr equivalent of:
918 // alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
919 static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
920   const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
921   const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
922 
923   // Can't be lower than 1 for subsequent alignTo.
924   const MCExpr *MaximumTaken =
925       AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
926 
927   // Practically, it's computing divideCeil(MaximumTaken, 4).
928   const MCExpr *DivCeil = MCBinaryExpr::createDiv(
929       AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
930       Ctx);
931 
932   return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
933 }
934 
935 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
936                                         const MachineFunction &MF) {
937   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
938   bool IsLocal = MF.getFunction().hasLocalLinkage();
939   MCContext &Ctx = MF.getContext();
940 
941   auto CreateExpr = [&Ctx](int64_t Value) {
942     return MCConstantExpr::create(Value, Ctx);
943   };
944 
945   auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
946     int64_t Val;
947     if (Value->evaluateAsAbsolute(Val)) {
948       Res = Val;
949       return true;
950     }
951     return false;
952   };
953 
954   auto GetSymRefExpr =
955       [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
956     MCSymbol *Sym =
957         RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
958     return MCSymbolRefExpr::create(Sym, Ctx);
959   };
960 
961   using RIK = MCResourceInfo::ResourceInfoKind;
962   ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
963   ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
964   ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
965       ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
966 
967   ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
968   ProgInfo.TgSplit = STM.isTgSplitEnabled();
969   ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
970   ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
971   ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
972   ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
973   ProgInfo.DynamicCallStack =
974       MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
975                              GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
976 
977   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
978 
979   // The calculations related to SGPR/VGPR blocks are
980   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
981   // unified.
982   const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
983       ProgInfo.VCCUsed, ProgInfo.FlatUsed,
984       getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
985 
986   // Check the addressable register limit before we add ExtraSGPRs.
987   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
988       !STM.hasSGPRInitBug()) {
989     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
990     uint64_t NumSgpr;
991     if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
992         NumSgpr > MaxAddressableNumSGPRs) {
993       // This can happen due to a compiler bug or when using inline asm.
994       LLVMContext &Ctx = MF.getFunction().getContext();
995       DiagnosticInfoResourceLimit Diag(
996           MF.getFunction(), "addressable scalar registers", NumSgpr,
997           MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
998       Ctx.diagnose(Diag);
999       ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1000     }
1001   }
1002 
1003   // Account for extra SGPRs and VGPRs reserved for debugger use.
1004   ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1005 
1006   const Function &F = MF.getFunction();
1007 
1008   // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1009   // dispatch registers are function args.
1010   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1011 
1012   if (isShader(F.getCallingConv())) {
1013     bool IsPixelShader =
1014         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
1015 
1016     // Calculate the number of VGPR registers based on the SPI input registers
1017     uint32_t InputEna = 0;
1018     uint32_t InputAddr = 0;
1019     unsigned LastEna = 0;
1020 
1021     if (IsPixelShader) {
1022       // Note for IsPixelShader:
1023       // By this stage, all enabled inputs are tagged in InputAddr as well.
1024       // We will use InputAddr to determine whether the input counts against the
1025       // vgpr total and only use the InputEnable to determine the last input
1026       // that is relevant - if extra arguments are used, then we have to honour
1027       // the InputAddr for any intermediate non-enabled inputs.
1028       InputEna = MFI->getPSInputEnable();
1029       InputAddr = MFI->getPSInputAddr();
1030 
1031       // We only need to consider input args up to the last used arg.
1032       assert((InputEna || InputAddr) &&
1033              "PSInputAddr and PSInputEnable should "
1034              "never both be 0 for AMDGPU_PS shaders");
1035       // There are some rare circumstances where InputAddr is non-zero and
1036       // InputEna can be set to 0. In this case we default to setting LastEna
1037       // to 1.
1038       LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
1039     }
1040 
1041     // FIXME: We should be using the number of registers determined during
1042     // calling convention lowering to legalize the types.
1043     const DataLayout &DL = F.getDataLayout();
1044     unsigned PSArgCount = 0;
1045     unsigned IntermediateVGPR = 0;
1046     for (auto &Arg : F.args()) {
1047       unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1048       if (Arg.hasAttribute(Attribute::InReg)) {
1049         WaveDispatchNumSGPR += NumRegs;
1050       } else {
1051         // If this is a PS shader and we're processing the PS Input args (first
1052         // 16 VGPR), use the InputEna and InputAddr bits to define how many
1053         // VGPRs are actually used.
1054         // Any extra VGPR arguments are handled as normal arguments (and
1055         // contribute to the VGPR count whether they're used or not).
1056         if (IsPixelShader && PSArgCount < 16) {
1057           if ((1 << PSArgCount) & InputAddr) {
1058             if (PSArgCount < LastEna)
1059               WaveDispatchNumVGPR += NumRegs;
1060             else
1061               IntermediateVGPR += NumRegs;
1062           }
1063           PSArgCount++;
1064         } else {
1065           // If there are extra arguments we have to include the allocation for
1066           // the non-used (but enabled with InputAddr) input arguments
1067           if (IntermediateVGPR) {
1068             WaveDispatchNumVGPR += IntermediateVGPR;
1069             IntermediateVGPR = 0;
1070           }
1071           WaveDispatchNumVGPR += NumRegs;
1072         }
1073       }
1074     }
1075     ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
1076         {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
1077 
1078     ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
1079         {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1080 
1081     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
1082         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1083   } else if (isKernel(F.getCallingConv()) &&
1084              MFI->getNumKernargPreloadedSGPRs()) {
1085     // Consider cases where the total number of UserSGPRs with trailing
1086     // allocated preload SGPRs, is greater than the number of explicitly
1087     // referenced SGPRs.
1088     const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1089         CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1090     ProgInfo.NumSGPR =
1091         AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
1092   }
1093 
1094   // Adjust number of registers used to meet default/requested minimum/maximum
1095   // number of waves per execution unit request.
1096   unsigned MaxWaves = MFI->getMaxWavesPerEU();
1097   ProgInfo.NumSGPRsForWavesPerEU =
1098       AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1099                                CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1100                               Ctx);
1101   ProgInfo.NumVGPRsForWavesPerEU =
1102       AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1103                                CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
1104                               Ctx);
1105 
1106   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
1107       STM.hasSGPRInitBug()) {
1108     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1109     uint64_t NumSgpr;
1110     if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1111         NumSgpr > MaxAddressableNumSGPRs) {
1112       // This can happen due to a compiler bug or when using inline asm to use
1113       // the registers which are usually reserved for vcc etc.
1114       LLVMContext &Ctx = MF.getFunction().getContext();
1115       DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1116                                        NumSgpr, MaxAddressableNumSGPRs,
1117                                        DS_Error, DK_ResourceLimit);
1118       Ctx.diagnose(Diag);
1119       ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1120       ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1121     }
1122   }
1123 
1124   if (STM.hasSGPRInitBug()) {
1125     ProgInfo.NumSGPR =
1126         CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1127     ProgInfo.NumSGPRsForWavesPerEU =
1128         CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1129   }
1130 
1131   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1132     LLVMContext &Ctx = MF.getFunction().getContext();
1133     DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1134                                      MFI->getNumUserSGPRs(),
1135                                      STM.getMaxNumUserSGPRs(), DS_Error);
1136     Ctx.diagnose(Diag);
1137   }
1138 
1139   if (MFI->getLDSSize() >
1140       static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
1141     LLVMContext &Ctx = MF.getFunction().getContext();
1142     DiagnosticInfoResourceLimit Diag(
1143         MF.getFunction(), "local memory", MFI->getLDSSize(),
1144         STM.getAddressableLocalMemorySize(), DS_Error);
1145     Ctx.diagnose(Diag);
1146   }
1147   // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1148   // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1149   auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1150                                              unsigned Granule) {
1151     const MCExpr *OneConst = CreateExpr(1ul);
1152     const MCExpr *GranuleConst = CreateExpr(Granule);
1153     const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1154     const MCExpr *AlignToGPR =
1155         AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1156     const MCExpr *DivGPR =
1157         MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1158     const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1159     return SubGPR;
1160   };
1161 
1162   ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1163                                         IsaInfo::getSGPREncodingGranule(&STM));
1164   ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1165                                         IsaInfo::getVGPREncodingGranule(&STM));
1166 
1167   const SIModeRegisterDefaults Mode = MFI->getMode();
1168 
1169   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1170   // register.
1171   ProgInfo.FloatMode = getFPMode(Mode);
1172 
1173   ProgInfo.IEEEMode = Mode.IEEE;
1174 
1175   // Make clamp modifier on NaN input returns 0.
1176   ProgInfo.DX10Clamp = Mode.DX10Clamp;
1177 
1178   unsigned LDSAlignShift;
1179   if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
1180     // LDS is allocated in 320 dword blocks.
1181     LDSAlignShift = 11;
1182   } else if (STM.getFeatureBits().test(
1183                  FeatureAddressableLocalMemorySize65536)) {
1184     // LDS is allocated in 128 dword blocks.
1185     LDSAlignShift = 9;
1186   } else {
1187     // LDS is allocated in 64 dword blocks.
1188     LDSAlignShift = 8;
1189   }
1190 
1191   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1192   ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1193 
1194   ProgInfo.LDSSize = MFI->getLDSSize();
1195   ProgInfo.LDSBlocks =
1196       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1197 
1198   // The MCExpr equivalent of divideCeil.
1199   auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1200     const MCExpr *Ceil =
1201         AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1202     return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1203   };
1204 
1205   // Scratch is allocated in 64-dword or 256-dword blocks.
1206   unsigned ScratchAlignShift =
1207       STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1208   // We need to program the hardware with the amount of scratch memory that
1209   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
1210   // scratch memory used per thread.
1211   ProgInfo.ScratchBlocks = DivideCeil(
1212       MCBinaryExpr::createMul(ProgInfo.ScratchSize,
1213                               CreateExpr(STM.getWavefrontSize()), Ctx),
1214       CreateExpr(1ULL << ScratchAlignShift));
1215 
1216   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1217     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1218     ProgInfo.MemOrdered = 1;
1219   }
1220 
1221   // 0 = X, 1 = XY, 2 = XYZ
1222   unsigned TIDIGCompCnt = 0;
1223   if (MFI->hasWorkItemIDZ())
1224     TIDIGCompCnt = 2;
1225   else if (MFI->hasWorkItemIDY())
1226     TIDIGCompCnt = 1;
1227 
1228   // The private segment wave byte offset is the last of the system SGPRs. We
1229   // initially assumed it was allocated, and may have used it. It shouldn't harm
1230   // anything to disable it if we know the stack isn't used here. We may still
1231   // have emitted code reading it to initialize scratch, but if that's unused
1232   // reading garbage should be OK.
1233   ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
1234       MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
1235                              MCConstantExpr::create(0, Ctx), Ctx),
1236       ProgInfo.DynamicCallStack, Ctx);
1237 
1238   ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1239   // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1240   ProgInfo.TrapHandlerEnable =
1241       STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1242   ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1243   ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1244   ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1245   ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1246   ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1247   ProgInfo.EXCPEnMSB = 0;
1248   // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1249   ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1250   ProgInfo.EXCPEnable = 0;
1251 
1252   if (STM.hasGFX90AInsts()) {
1253     // return ((Dst & ~Mask) | (Value << Shift))
1254     auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1255                           uint32_t Shift) {
1256       const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1257       const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1258       Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1259       Dst = MCBinaryExpr::createOr(
1260           Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1261       return Dst;
1262     };
1263 
1264     ProgInfo.ComputePGMRSrc3GFX90A =
1265         SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1266                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1267                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1268     ProgInfo.ComputePGMRSrc3GFX90A =
1269         SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1270                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1271                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1272   }
1273 
1274   ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
1275       STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1276       ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1277 
1278   const auto [MinWEU, MaxWEU] =
1279       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1280   uint64_t Occupancy;
1281   if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1282     DiagnosticInfoOptimizationFailure Diag(
1283         F, F.getSubprogram(),
1284         "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1285         "'" +
1286             F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1287             ", final occupancy is " + Twine(Occupancy));
1288     F.getContext().diagnose(Diag);
1289   }
1290 }
1291 
1292 static unsigned getRsrcReg(CallingConv::ID CallConv) {
1293   switch (CallConv) {
1294   default: [[fallthrough]];
1295   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1296   case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1297   case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1298   case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1299   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1300   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1301   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1302   }
1303 }
1304 
1305 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1306                                          const SIProgramInfo &CurrentProgramInfo) {
1307   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1308   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1309   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1310   MCContext &Ctx = MF.getContext();
1311 
1312   // (((Value) & Mask) << Shift)
1313   auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1314     const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1315     const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1316     return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
1317                                    shft, Ctx);
1318   };
1319 
1320   auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1321     int64_t Val;
1322     if (Value->evaluateAsAbsolute(Val))
1323       OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1324     else
1325       OutStreamer->emitValue(Value, Size);
1326   };
1327 
1328   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1329     OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
1330 
1331     EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1332                        /*Size=*/4);
1333 
1334     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
1335     EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1336 
1337     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
1338 
1339     // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1340     // appropriate generation.
1341     if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1342       EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1343                                  /*Mask=*/0x3FFFF, /*Shift=*/12),
1344                          /*Size=*/4);
1345     } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1346       EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1347                                  /*Mask=*/0x7FFF, /*Shift=*/12),
1348                          /*Size=*/4);
1349     } else {
1350       EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1351                                  /*Mask=*/0x1FFF, /*Shift=*/12),
1352                          /*Size=*/4);
1353     }
1354 
1355     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1356     // 0" comment but I don't see a corresponding field in the register spec.
1357   } else {
1358     OutStreamer->emitInt32(RsrcReg);
1359 
1360     const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1361         SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1362         SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1363         MF.getContext());
1364     EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1365     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
1366 
1367     // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1368     // appropriate generation.
1369     if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1370       EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1371                                  /*Mask=*/0x3FFFF, /*Shift=*/12),
1372                          /*Size=*/4);
1373     } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1374       EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1375                                  /*Mask=*/0x7FFF, /*Shift=*/12),
1376                          /*Size=*/4);
1377     } else {
1378       EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1379                                  /*Mask=*/0x1FFF, /*Shift=*/12),
1380                          /*Size=*/4);
1381     }
1382   }
1383 
1384   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1385     OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1386     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1387                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1388                                 : CurrentProgramInfo.LDSBlocks;
1389     OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1390     OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1391     OutStreamer->emitInt32(MFI->getPSInputEnable());
1392     OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1393     OutStreamer->emitInt32(MFI->getPSInputAddr());
1394   }
1395 
1396   OutStreamer->emitInt32(R_SPILLED_SGPRS);
1397   OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1398   OutStreamer->emitInt32(R_SPILLED_VGPRS);
1399   OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1400 }
1401 
1402 // Helper function to add common PAL Metadata 3.0+
1403 static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1404                                   const SIProgramInfo &CurrentProgramInfo,
1405                                   CallingConv::ID CC, const GCNSubtarget &ST) {
1406   if (ST.hasIEEEMode())
1407     MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1408 
1409   MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1410   MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1411 
1412   if (AMDGPU::isCompute(CC)) {
1413     MD->setHwStage(CC, ".trap_present",
1414                    (bool)CurrentProgramInfo.TrapHandlerEnable);
1415     MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1416   }
1417 
1418   MD->setHwStage(CC, ".lds_size",
1419                  (unsigned)(CurrentProgramInfo.LdsSize *
1420                             getLdsDwGranularity(ST) * sizeof(uint32_t)));
1421 }
1422 
1423 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1424 // is AMDPAL.  It stores each compute/SPI register setting and other PAL
1425 // metadata items into the PALMD::Metadata, combining with any provided by the
1426 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1427 // is then written as a single block in the .note section.
1428 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1429        const SIProgramInfo &CurrentProgramInfo) {
1430   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431   auto CC = MF.getFunction().getCallingConv();
1432   auto *MD = getTargetStreamer()->getPALMetadata();
1433   auto &Ctx = MF.getContext();
1434 
1435   MD->setEntryPoint(CC, MF.getFunction().getName());
1436   MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1437 
1438   // Only set AGPRs for supported devices
1439   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1440   if (STM.hasMAIInsts()) {
1441     MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1442   }
1443 
1444   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1445   if (MD->getPALMajorVersion() < 3) {
1446     MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1447     if (AMDGPU::isCompute(CC)) {
1448       MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1449     } else {
1450       const MCExpr *HasScratchBlocks =
1451           MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1452                                  MCConstantExpr::create(0, Ctx), Ctx);
1453       auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1454       MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1455     }
1456   } else {
1457     MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1458     MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1459                    CurrentProgramInfo.ScratchEnable);
1460     EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1461   }
1462 
1463   // ScratchSize is in bytes, 16 aligned.
1464   MD->setScratchSize(
1465       CC,
1466       AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1467                                   MCConstantExpr::create(16, Ctx), Ctx),
1468       Ctx);
1469 
1470   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1471     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1472                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1473                                 : CurrentProgramInfo.LDSBlocks;
1474     if (MD->getPALMajorVersion() < 3) {
1475       MD->setRsrc2(
1476           CC,
1477           MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
1478           Ctx);
1479       MD->setSpiPsInputEna(MFI->getPSInputEnable());
1480       MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1481     } else {
1482       // Graphics registers
1483       const unsigned ExtraLdsDwGranularity =
1484           STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1485       MD->setGraphicsRegisters(
1486           ".ps_extra_lds_size",
1487           (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1488 
1489       // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1490       static StringLiteral const PsInputFields[] = {
1491           ".persp_sample_ena",    ".persp_center_ena",
1492           ".persp_centroid_ena",  ".persp_pull_model_ena",
1493           ".linear_sample_ena",   ".linear_center_ena",
1494           ".linear_centroid_ena", ".line_stipple_tex_ena",
1495           ".pos_x_float_ena",     ".pos_y_float_ena",
1496           ".pos_z_float_ena",     ".pos_w_float_ena",
1497           ".front_face_ena",      ".ancillary_ena",
1498           ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1499       unsigned PSInputEna = MFI->getPSInputEnable();
1500       unsigned PSInputAddr = MFI->getPSInputAddr();
1501       for (auto [Idx, Field] : enumerate(PsInputFields)) {
1502         MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1503                                  (bool)((PSInputEna >> Idx) & 1));
1504         MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1505                                  (bool)((PSInputAddr >> Idx) & 1));
1506       }
1507     }
1508   }
1509 
1510   // For version 3 and above the wave front size is already set in the metadata
1511   if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1512     MD->setWave32(MF.getFunction().getCallingConv());
1513 }
1514 
1515 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1516   auto *MD = getTargetStreamer()->getPALMetadata();
1517   const MachineFrameInfo &MFI = MF.getFrameInfo();
1518   StringRef FnName = MF.getFunction().getName();
1519   MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1520   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1521   MCContext &Ctx = MF.getContext();
1522 
1523   if (MD->getPALMajorVersion() < 3) {
1524     // Set compute registers
1525     MD->setRsrc1(
1526         CallingConv::AMDGPU_CS,
1527         CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1528     MD->setRsrc2(CallingConv::AMDGPU_CS,
1529                  CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1530   } else {
1531     EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1532   }
1533 
1534   // Set optional info
1535   MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1536   MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1537   MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1538 }
1539 
1540 // This is supposed to be log2(Size)
1541 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1542   switch (Size) {
1543   case 4:
1544     return AMD_ELEMENT_4_BYTES;
1545   case 8:
1546     return AMD_ELEMENT_8_BYTES;
1547   case 16:
1548     return AMD_ELEMENT_16_BYTES;
1549   default:
1550     llvm_unreachable("invalid private_element_size");
1551   }
1552 }
1553 
1554 void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1555                                         const SIProgramInfo &CurrentProgramInfo,
1556                                         const MachineFunction &MF) const {
1557   const Function &F = MF.getFunction();
1558   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1559          F.getCallingConv() == CallingConv::SPIR_KERNEL);
1560 
1561   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1562   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1563   MCContext &Ctx = MF.getContext();
1564 
1565   Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1566 
1567   Out.compute_pgm_resource1_registers =
1568       CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1569   Out.compute_pgm_resource2_registers =
1570       CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1571   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1572 
1573   Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1574 
1575   AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1576                    getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1577 
1578   const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1579   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1580     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1581   }
1582 
1583   if (UserSGPRInfo.hasDispatchPtr())
1584     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1585 
1586   if (UserSGPRInfo.hasQueuePtr())
1587     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1588 
1589   if (UserSGPRInfo.hasKernargSegmentPtr())
1590     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1591 
1592   if (UserSGPRInfo.hasDispatchID())
1593     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1594 
1595   if (UserSGPRInfo.hasFlatScratchInit())
1596     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1597 
1598   if (UserSGPRInfo.hasPrivateSegmentSize())
1599     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
1600 
1601   if (STM.isXNACKEnabled())
1602     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1603 
1604   Align MaxKernArgAlign;
1605   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1606   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1607   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1608   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1609   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1610 
1611   // kernarg_segment_alignment is specified as log of the alignment.
1612   // The minimum alignment is 16.
1613   // FIXME: The metadata treats the minimum as 4?
1614   Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1615 }
1616 
1617 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1618                                        const char *ExtraCode, raw_ostream &O) {
1619   // First try the generic code, which knows about modifiers like 'c' and 'n'.
1620   if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1621     return false;
1622 
1623   if (ExtraCode && ExtraCode[0]) {
1624     if (ExtraCode[1] != 0)
1625       return true; // Unknown modifier.
1626 
1627     switch (ExtraCode[0]) {
1628     case 'r':
1629       break;
1630     default:
1631       return true;
1632     }
1633   }
1634 
1635   // TODO: Should be able to support other operand types like globals.
1636   const MachineOperand &MO = MI->getOperand(OpNo);
1637   if (MO.isReg()) {
1638     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1639                                        *MF->getSubtarget().getRegisterInfo());
1640     return false;
1641   }
1642   if (MO.isImm()) {
1643     int64_t Val = MO.getImm();
1644     if (AMDGPU::isInlinableIntLiteral(Val)) {
1645       O << Val;
1646     } else if (isUInt<16>(Val)) {
1647       O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1648     } else if (isUInt<32>(Val)) {
1649       O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1650     } else {
1651       O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1652     }
1653     return false;
1654   }
1655   return true;
1656 }
1657 
1658 void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
1659   AU.addRequired<AMDGPUResourceUsageAnalysis>();
1660   AU.addPreserved<AMDGPUResourceUsageAnalysis>();
1661   AU.addRequired<MachineModuleInfoWrapperPass>();
1662   AU.addPreserved<MachineModuleInfoWrapperPass>();
1663   AsmPrinter::getAnalysisUsage(AU);
1664 }
1665 
1666 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1667     const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1668     bool isModuleEntryFunction, bool hasMAIInsts) {
1669   if (!ORE)
1670     return;
1671 
1672   const char *Name = "kernel-resource-usage";
1673   const char *Indent = "    ";
1674 
1675   // If the remark is not specifically enabled, do not output to yaml
1676   LLVMContext &Ctx = MF.getFunction().getContext();
1677   if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1678     return;
1679 
1680   // Currently non-kernel functions have no resources to emit.
1681   if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
1682     return;
1683 
1684   auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1685                                      StringRef RemarkLabel, auto Argument) {
1686     // Add an indent for every line besides the line with the kernel name. This
1687     // makes it easier to tell which resource usage go with which kernel since
1688     // the kernel name will always be displayed first.
1689     std::string LabelStr = RemarkLabel.str() + ": ";
1690     if (RemarkName != "FunctionName")
1691       LabelStr = Indent + LabelStr;
1692 
1693     ORE->emit([&]() {
1694       return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1695                                                MF.getFunction().getSubprogram(),
1696                                                &MF.front())
1697              << LabelStr << ore::NV(RemarkName, Argument);
1698     });
1699   };
1700 
1701   // FIXME: Formatting here is pretty nasty because clang does not accept
1702   // newlines from diagnostics. This forces us to emit multiple diagnostic
1703   // remarks to simulate newlines. If and when clang does accept newlines, this
1704   // formatting should be aggregated into one remark with newlines to avoid
1705   // printing multiple diagnostic location and diag opts.
1706   EmitResourceUsageRemark("FunctionName", "Function Name",
1707                           MF.getFunction().getName());
1708   EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1709                           getMCExprStr(CurrentProgramInfo.NumSGPR));
1710   EmitResourceUsageRemark("NumVGPR", "VGPRs",
1711                           getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1712   if (hasMAIInsts) {
1713     EmitResourceUsageRemark("NumAGPR", "AGPRs",
1714                             getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1715   }
1716   EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1717                           getMCExprStr(CurrentProgramInfo.ScratchSize));
1718   int64_t DynStack;
1719   bool DynStackEvaluatable =
1720       CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1721   StringRef DynamicStackStr =
1722       DynStackEvaluatable && DynStack ? "True" : "False";
1723   EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1724   EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1725                           getMCExprStr(CurrentProgramInfo.Occupancy));
1726   EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1727                           CurrentProgramInfo.SGPRSpill);
1728   EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1729                           CurrentProgramInfo.VGPRSpill);
1730   if (isModuleEntryFunction)
1731     EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1732                             CurrentProgramInfo.LDSSize);
1733 }
1734