xref: /llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (revision 9fb01fcd9fd5ccffa2421096e5e058156b86aa84)
1 //===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //===----------------------------------------------------------------------===//
10 //
11 /// \file
12 ///
13 /// This file contains definition for AMDGPU ISA disassembler
14 //
15 //===----------------------------------------------------------------------===//
16 
17 // ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
18 
19 #include "Disassembler/AMDGPUDisassembler.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIDefines.h"
22 #include "SIRegisterInfo.h"
23 #include "TargetInfo/AMDGPUTargetInfo.h"
24 #include "Utils/AMDGPUAsmUtils.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm-c/DisassemblerTypes.h"
27 #include "llvm/BinaryFormat/ELF.h"
28 #include "llvm/MC/MCAsmInfo.h"
29 #include "llvm/MC/MCContext.h"
30 #include "llvm/MC/MCDecoderOps.h"
31 #include "llvm/MC/MCExpr.h"
32 #include "llvm/MC/MCInstrDesc.h"
33 #include "llvm/MC/MCRegisterInfo.h"
34 #include "llvm/MC/MCSubtargetInfo.h"
35 #include "llvm/MC/TargetRegistry.h"
36 #include "llvm/Support/AMDHSAKernelDescriptor.h"
37 
38 using namespace llvm;
39 
40 #define DEBUG_TYPE "amdgpu-disassembler"
41 
42 #define SGPR_MAX                                                               \
43   (isGFX10Plus() ? AMDGPU::EncValues::SGPR_MAX_GFX10                           \
44                  : AMDGPU::EncValues::SGPR_MAX_SI)
45 
46 using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
47 
48 static const MCSubtargetInfo &addDefaultWaveSize(const MCSubtargetInfo &STI,
49                                                  MCContext &Ctx) {
50   if (!STI.hasFeature(AMDGPU::FeatureWavefrontSize64) &&
51       !STI.hasFeature(AMDGPU::FeatureWavefrontSize32)) {
52     MCSubtargetInfo &STICopy = Ctx.getSubtargetCopy(STI);
53     // If there is no default wave size it must be a generation before gfx10,
54     // these have FeatureWavefrontSize64 in their definition already. For gfx10+
55     // set wave32 as a default.
56     STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize32);
57     return STICopy;
58   }
59 
60   return STI;
61 }
62 
63 AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
64                                        MCContext &Ctx, MCInstrInfo const *MCII)
65     : MCDisassembler(addDefaultWaveSize(STI, Ctx), Ctx), MCII(MCII),
66       MRI(*Ctx.getRegisterInfo()), MAI(*Ctx.getAsmInfo()),
67       TargetMaxInstBytes(MAI.getMaxInstLength(&STI)),
68       CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {
69   // ToDo: AMDGPUDisassembler supports only VI ISA.
70   if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
71     report_fatal_error("Disassembly not yet supported for subtarget");
72 
73   for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
74     createConstantSymbolExpr(Symbol, Code);
75 
76   UCVersionW64Expr = createConstantSymbolExpr("UC_VERSION_W64_BIT", 0x2000);
77   UCVersionW32Expr = createConstantSymbolExpr("UC_VERSION_W32_BIT", 0x4000);
78   UCVersionMDPExpr = createConstantSymbolExpr("UC_VERSION_MDP_BIT", 0x8000);
79 }
80 
81 void AMDGPUDisassembler::setABIVersion(unsigned Version) {
82   CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(Version);
83 }
84 
85 inline static MCDisassembler::DecodeStatus
86 addOperand(MCInst &Inst, const MCOperand& Opnd) {
87   Inst.addOperand(Opnd);
88   return Opnd.isValid() ?
89     MCDisassembler::Success :
90     MCDisassembler::Fail;
91 }
92 
93 static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
94                                 uint16_t NameIdx) {
95   int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
96   if (OpIdx != -1) {
97     auto *I = MI.begin();
98     std::advance(I, OpIdx);
99     MI.insert(I, Op);
100   }
101   return OpIdx;
102 }
103 
104 static DecodeStatus decodeSOPPBrTarget(MCInst &Inst, unsigned Imm,
105                                        uint64_t Addr,
106                                        const MCDisassembler *Decoder) {
107   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
108 
109   // Our branches take a simm16.
110   int64_t Offset = SignExtend64<16>(Imm) * 4 + 4 + Addr;
111 
112   if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0))
113     return MCDisassembler::Success;
114   return addOperand(Inst, MCOperand::createImm(Imm));
115 }
116 
117 static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
118                                      const MCDisassembler *Decoder) {
119   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
120   int64_t Offset;
121   if (DAsm->isGFX12Plus()) { // GFX12 supports 24-bit signed offsets.
122     Offset = SignExtend64<24>(Imm);
123   } else if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
124     Offset = Imm & 0xFFFFF;
125   } else { // GFX9+ supports 21-bit signed offsets.
126     Offset = SignExtend64<21>(Imm);
127   }
128   return addOperand(Inst, MCOperand::createImm(Offset));
129 }
130 
131 static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
132                                   const MCDisassembler *Decoder) {
133   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
134   return addOperand(Inst, DAsm->decodeBoolReg(Val));
135 }
136 
137 static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
138                                        uint64_t Addr,
139                                        const MCDisassembler *Decoder) {
140   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
141   return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
142 }
143 
144 static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr,
145                                  const MCDisassembler *Decoder) {
146   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
147   return addOperand(Inst, DAsm->decodeDpp8FI(Val));
148 }
149 
150 #define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
151   static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
152                                         uint64_t /*Addr*/,                     \
153                                         const MCDisassembler *Decoder) {       \
154     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
155     return addOperand(Inst, DAsm->DecoderName(Imm));                           \
156   }
157 
158 // Decoder for registers, decode directly using RegClassID. Imm(8-bit) is
159 // number of register. Used by VGPR only and AGPR only operands.
160 #define DECODE_OPERAND_REG_8(RegClass)                                         \
161   static DecodeStatus Decode##RegClass##RegisterClass(                         \
162       MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,                           \
163       const MCDisassembler *Decoder) {                                         \
164     assert(Imm < (1 << 8) && "8-bit encoding");                                \
165     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
166     return addOperand(                                                         \
167         Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm));      \
168   }
169 
170 #define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral,         \
171                      ImmWidth)                                                 \
172   static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,      \
173                            const MCDisassembler *Decoder) {                    \
174     assert(Imm < (1 << EncSize) && #EncSize "-bit encoding");                  \
175     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
176     return addOperand(Inst,                                                    \
177                       DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm,   \
178                                         MandatoryLiteral, ImmWidth));          \
179   }
180 
181 static DecodeStatus decodeSrcOp(MCInst &Inst, unsigned EncSize,
182                                 AMDGPUDisassembler::OpWidthTy OpWidth,
183                                 unsigned Imm, unsigned EncImm,
184                                 bool MandatoryLiteral, unsigned ImmWidth,
185                                 AMDGPU::OperandSemantics Sema,
186                                 const MCDisassembler *Decoder) {
187   assert(Imm < (1U << EncSize) && "Operand doesn't fit encoding!");
188   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
189   return addOperand(Inst, DAsm->decodeSrcOp(OpWidth, EncImm, MandatoryLiteral,
190                                             ImmWidth, Sema));
191 }
192 
193 // Decoder for registers. Imm(7-bit) is number of register, uses decodeSrcOp to
194 // get register class. Used by SGPR only operands.
195 #define DECODE_OPERAND_REG_7(RegClass, OpWidth)                                \
196   DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0)
197 
198 // Decoder for registers. Imm(10-bit): Imm{7-0} is number of register,
199 // Imm{9} is acc(agpr or vgpr) Imm{8} should be 0 (see VOP3Pe_SMFMAC).
200 // Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp.
201 // Used by AV_ register classes (AGPR or VGPR only register operands).
202 template <AMDGPUDisassembler::OpWidthTy OpWidth>
203 static DecodeStatus decodeAV10(MCInst &Inst, unsigned Imm, uint64_t /* Addr */,
204                                const MCDisassembler *Decoder) {
205   return decodeSrcOp(Inst, 10, OpWidth, Imm, Imm | AMDGPU::EncValues::IS_VGPR,
206                      false, 0, AMDGPU::OperandSemantics::INT, Decoder);
207 }
208 
209 // Decoder for Src(9-bit encoding) registers only.
210 template <AMDGPUDisassembler::OpWidthTy OpWidth>
211 static DecodeStatus decodeSrcReg9(MCInst &Inst, unsigned Imm,
212                                   uint64_t /* Addr */,
213                                   const MCDisassembler *Decoder) {
214   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, false, 0,
215                      AMDGPU::OperandSemantics::INT, Decoder);
216 }
217 
218 // Decoder for Src(9-bit encoding) AGPR, register number encoded in 9bits, set
219 // Imm{9} to 1 (set acc) and decode using 'enum10' from decodeSrcOp, registers
220 // only.
221 template <AMDGPUDisassembler::OpWidthTy OpWidth>
222 static DecodeStatus decodeSrcA9(MCInst &Inst, unsigned Imm, uint64_t /* Addr */,
223                                 const MCDisassembler *Decoder) {
224   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm | 512, false, 0,
225                      AMDGPU::OperandSemantics::INT, Decoder);
226 }
227 
228 // Decoder for 'enum10' from decodeSrcOp, Imm{0-8} is 9-bit Src encoding
229 // Imm{9} is acc, registers only.
230 template <AMDGPUDisassembler::OpWidthTy OpWidth>
231 static DecodeStatus decodeSrcAV10(MCInst &Inst, unsigned Imm,
232                                   uint64_t /* Addr */,
233                                   const MCDisassembler *Decoder) {
234   return decodeSrcOp(Inst, 10, OpWidth, Imm, Imm, false, 0,
235                      AMDGPU::OperandSemantics::INT, Decoder);
236 }
237 
238 // Decoder for RegisterOperands using 9-bit Src encoding. Operand can be
239 // register from RegClass or immediate. Registers that don't belong to RegClass
240 // will be decoded and InstPrinter will report warning. Immediate will be
241 // decoded into constant of size ImmWidth, should match width of immediate used
242 // by OperandType (important for floating point types).
243 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
244           unsigned OperandSemantics>
245 static DecodeStatus decodeSrcRegOrImm9(MCInst &Inst, unsigned Imm,
246                                        uint64_t /* Addr */,
247                                        const MCDisassembler *Decoder) {
248   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, false, ImmWidth,
249                      (AMDGPU::OperandSemantics)OperandSemantics, Decoder);
250 }
251 
252 // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
253 // and decode using 'enum10' from decodeSrcOp.
254 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
255           unsigned OperandSemantics>
256 static DecodeStatus decodeSrcRegOrImmA9(MCInst &Inst, unsigned Imm,
257                                         uint64_t /* Addr */,
258                                         const MCDisassembler *Decoder) {
259   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm | 512, false, ImmWidth,
260                      (AMDGPU::OperandSemantics)OperandSemantics, Decoder);
261 }
262 
263 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
264           unsigned OperandSemantics>
265 static DecodeStatus decodeSrcRegOrImmDeferred9(MCInst &Inst, unsigned Imm,
266                                                uint64_t /* Addr */,
267                                                const MCDisassembler *Decoder) {
268   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, true, ImmWidth,
269                      (AMDGPU::OperandSemantics)OperandSemantics, Decoder);
270 }
271 
272 // Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
273 // when RegisterClass is used as an operand. Most often used for destination
274 // operands.
275 
276 DECODE_OPERAND_REG_8(VGPR_32)
277 DECODE_OPERAND_REG_8(VGPR_32_Lo128)
278 DECODE_OPERAND_REG_8(VReg_64)
279 DECODE_OPERAND_REG_8(VReg_96)
280 DECODE_OPERAND_REG_8(VReg_128)
281 DECODE_OPERAND_REG_8(VReg_256)
282 DECODE_OPERAND_REG_8(VReg_288)
283 DECODE_OPERAND_REG_8(VReg_352)
284 DECODE_OPERAND_REG_8(VReg_384)
285 DECODE_OPERAND_REG_8(VReg_512)
286 DECODE_OPERAND_REG_8(VReg_1024)
287 
288 DECODE_OPERAND_REG_7(SReg_32, OPW32)
289 DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
290 DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
291 DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
292 DECODE_OPERAND_REG_7(SReg_64, OPW64)
293 DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
294 DECODE_OPERAND_REG_7(SReg_64_XEXEC_XNULL, OPW64)
295 DECODE_OPERAND_REG_7(SReg_96, OPW96)
296 DECODE_OPERAND_REG_7(SReg_128, OPW128)
297 DECODE_OPERAND_REG_7(SReg_256, OPW256)
298 DECODE_OPERAND_REG_7(SReg_512, OPW512)
299 
300 DECODE_OPERAND_REG_8(AGPR_32)
301 DECODE_OPERAND_REG_8(AReg_64)
302 DECODE_OPERAND_REG_8(AReg_128)
303 DECODE_OPERAND_REG_8(AReg_256)
304 DECODE_OPERAND_REG_8(AReg_512)
305 DECODE_OPERAND_REG_8(AReg_1024)
306 
307 static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm,
308                                                uint64_t /*Addr*/,
309                                                const MCDisassembler *Decoder) {
310   assert(isUInt<10>(Imm) && "10-bit encoding expected");
311   assert((Imm & (1 << 8)) == 0 && "Imm{8} should not be used");
312 
313   bool IsHi = Imm & (1 << 9);
314   unsigned RegIdx = Imm & 0xff;
315   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
316   return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
317 }
318 
319 static DecodeStatus
320 DecodeVGPR_16_Lo128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
321                                  const MCDisassembler *Decoder) {
322   assert(isUInt<8>(Imm) && "8-bit encoding expected");
323 
324   bool IsHi = Imm & (1 << 7);
325   unsigned RegIdx = Imm & 0x7f;
326   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
327   return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
328 }
329 
330 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
331           unsigned OperandSemantics>
332 static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
333                                                 uint64_t /*Addr*/,
334                                                 const MCDisassembler *Decoder) {
335   assert(isUInt<9>(Imm) && "9-bit encoding expected");
336 
337   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
338   if (Imm & AMDGPU::EncValues::IS_VGPR) {
339     bool IsHi = Imm & (1 << 7);
340     unsigned RegIdx = Imm & 0x7f;
341     return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
342   }
343   return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
344                               OpWidth, Imm & 0xFF, false, ImmWidth,
345                               (AMDGPU::OperandSemantics)OperandSemantics));
346 }
347 
348 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
349           unsigned OperandSemantics>
350 static DecodeStatus
351 decodeOperand_VSrcT16_Lo128_Deferred(MCInst &Inst, unsigned Imm,
352                                      uint64_t /*Addr*/,
353                                      const MCDisassembler *Decoder) {
354   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
355   assert(isUInt<9>(Imm) && "9-bit encoding expected");
356 
357   if (Imm & AMDGPU::EncValues::IS_VGPR) {
358     bool IsHi = Imm & (1 << 7);
359     unsigned RegIdx = Imm & 0x7f;
360     return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
361   }
362   return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
363                               OpWidth, Imm & 0xFF, true, ImmWidth,
364                               (AMDGPU::OperandSemantics)OperandSemantics));
365 }
366 
367 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
368           unsigned OperandSemantics>
369 static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
370                                           uint64_t /*Addr*/,
371                                           const MCDisassembler *Decoder) {
372   assert(isUInt<10>(Imm) && "10-bit encoding expected");
373 
374   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
375   if (Imm & AMDGPU::EncValues::IS_VGPR) {
376     bool IsHi = Imm & (1 << 9);
377     unsigned RegIdx = Imm & 0xff;
378     return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
379   }
380   return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
381                               OpWidth, Imm & 0xFF, false, ImmWidth,
382                               (AMDGPU::OperandSemantics)OperandSemantics));
383 }
384 
385 static DecodeStatus decodeOperand_VGPR_16(MCInst &Inst, unsigned Imm,
386                                           uint64_t /*Addr*/,
387                                           const MCDisassembler *Decoder) {
388   assert(isUInt<10>(Imm) && "10-bit encoding expected");
389   assert(Imm & AMDGPU::EncValues::IS_VGPR && "VGPR expected");
390 
391   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
392 
393   bool IsHi = Imm & (1 << 9);
394   unsigned RegIdx = Imm & 0xff;
395   return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
396 }
397 
398 static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
399                                          uint64_t Addr,
400                                          const MCDisassembler *Decoder) {
401   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
402   return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
403 }
404 
405 static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
406                                           uint64_t Addr, const void *Decoder) {
407   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
408   return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val));
409 }
410 
411 static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
412                           const MCRegisterInfo *MRI) {
413   if (OpIdx < 0)
414     return false;
415 
416   const MCOperand &Op = Inst.getOperand(OpIdx);
417   if (!Op.isReg())
418     return false;
419 
420   MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
421   auto Reg = Sub ? Sub : Op.getReg();
422   return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
423 }
424 
425 static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm,
426                                  AMDGPUDisassembler::OpWidthTy Opw,
427                                  const MCDisassembler *Decoder) {
428   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
429   if (!DAsm->isGFX90A()) {
430     Imm &= 511;
431   } else {
432     // If atomic has both vdata and vdst their register classes are tied.
433     // The bit is decoded along with the vdst, first operand. We need to
434     // change register class to AGPR if vdst was AGPR.
435     // If a DS instruction has both data0 and data1 their register classes
436     // are also tied.
437     unsigned Opc = Inst.getOpcode();
438     uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
439     uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
440                                                         : AMDGPU::OpName::vdata;
441     const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
442     int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
443     if ((int)Inst.getNumOperands() == DataIdx) {
444       int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
445       if (IsAGPROperand(Inst, DstIdx, MRI))
446         Imm |= 512;
447     }
448 
449     if (TSFlags & SIInstrFlags::DS) {
450       int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
451       if ((int)Inst.getNumOperands() == Data2Idx &&
452           IsAGPROperand(Inst, DataIdx, MRI))
453         Imm |= 512;
454     }
455   }
456   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
457 }
458 
459 template <AMDGPUDisassembler::OpWidthTy Opw>
460 static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm,
461                                  uint64_t /* Addr */,
462                                  const MCDisassembler *Decoder) {
463   return decodeAVLdSt(Inst, Imm, Opw, Decoder);
464 }
465 
466 static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm,
467                                            uint64_t Addr,
468                                            const MCDisassembler *Decoder) {
469   assert(Imm < (1 << 9) && "9-bit encoding");
470   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
471   return addOperand(Inst,
472                     DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64,
473                                       AMDGPU::OperandSemantics::FP64));
474 }
475 
476 #define DECODE_SDWA(DecName) \
477 DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
478 
479 DECODE_SDWA(Src32)
480 DECODE_SDWA(Src16)
481 DECODE_SDWA(VopcDst)
482 
483 static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
484                                      uint64_t /* Addr */,
485                                      const MCDisassembler *Decoder) {
486   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
487   return addOperand(Inst, DAsm->decodeVersionImm(Imm));
488 }
489 
490 #include "AMDGPUGenDisassemblerTables.inc"
491 
492 //===----------------------------------------------------------------------===//
493 //
494 //===----------------------------------------------------------------------===//
495 
496 template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
497   assert(Bytes.size() >= sizeof(T));
498   const auto Res =
499       support::endian::read<T, llvm::endianness::little>(Bytes.data());
500   Bytes = Bytes.slice(sizeof(T));
501   return Res;
502 }
503 
504 static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
505   assert(Bytes.size() >= 12);
506   uint64_t Lo =
507       support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
508   Bytes = Bytes.slice(8);
509   uint64_t Hi =
510       support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
511   Bytes = Bytes.slice(4);
512   return DecoderUInt128(Lo, Hi);
513 }
514 
515 DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
516                                                 ArrayRef<uint8_t> Bytes_,
517                                                 uint64_t Address,
518                                                 raw_ostream &CS) const {
519   unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
520   Bytes = Bytes_.slice(0, MaxInstBytesNum);
521 
522   // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless
523   // there are fewer bytes left). This will be overridden on success.
524   Size = std::min((size_t)4, Bytes_.size());
525 
526   do {
527     // ToDo: better to switch encoding length using some bit predicate
528     // but it is unknown yet, so try all we can
529 
530     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
531     // encodings
532     if (isGFX11Plus() && Bytes.size() >= 12 ) {
533       DecoderUInt128 DecW = eat12Bytes(Bytes);
534 
535       if (isGFX11() &&
536           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
537                         DecW, Address, CS))
538         break;
539 
540       if (isGFX12() &&
541           tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
542                         DecW, Address, CS))
543         break;
544 
545       if (isGFX12() &&
546           tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
547         break;
548 
549       // Reinitialize Bytes
550       Bytes = Bytes_.slice(0, MaxInstBytesNum);
551     }
552 
553     if (Bytes.size() >= 8) {
554       const uint64_t QW = eatBytes<uint64_t>(Bytes);
555 
556       if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
557           tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS))
558         break;
559 
560       if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) &&
561           tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
562         break;
563 
564       // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
565       // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
566       // table first so we print the correct name.
567       if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) &&
568           tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS))
569         break;
570 
571       if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) &&
572           tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS))
573         break;
574 
575       if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
576           tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
577         break;
578 
579       if ((isVI() || isGFX9()) &&
580           tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
581         break;
582 
583       if (isGFX9() && tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
584         break;
585 
586       if (isGFX10() && tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
587         break;
588 
589       if (isGFX12() &&
590           tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
591                         Address, CS))
592         break;
593 
594       if (isGFX11() &&
595           tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
596                         Address, CS))
597         break;
598 
599       if (isGFX11() &&
600           tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
601         break;
602 
603       if (isGFX12() &&
604           tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
605         break;
606 
607       // Reinitialize Bytes
608       Bytes = Bytes_.slice(0, MaxInstBytesNum);
609     }
610 
611     // Try decode 32-bit instruction
612     if (Bytes.size() >= 4) {
613       const uint32_t DW = eatBytes<uint32_t>(Bytes);
614 
615       if ((isVI() || isGFX9()) &&
616           tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
617         break;
618 
619       if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
620         break;
621 
622       if (isGFX9() && tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
623         break;
624 
625       if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
626           tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
627         break;
628 
629       if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
630           tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
631         break;
632 
633       if (isGFX10() && tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
634         break;
635 
636       if (isGFX11() &&
637           tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
638                         Address, CS))
639         break;
640 
641       if (isGFX12() &&
642           tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
643                         Address, CS))
644         break;
645     }
646 
647     return MCDisassembler::Fail;
648   } while (false);
649 
650   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) {
651     if (isMacDPP(MI))
652       convertMacDPPInst(MI);
653 
654     if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
655       convertVOP3PDPPInst(MI);
656     else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) ||
657              AMDGPU::isVOPC64DPP(MI.getOpcode()))
658       convertVOPCDPPInst(MI); // Special VOP3 case
659     else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) !=
660              -1)
661       convertDPP8Inst(MI);
662     else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3)
663       convertVOP3DPPInst(MI); // Regular VOP3 case
664   }
665 
666   convertTrue16OpSel(MI);
667 
668   if (AMDGPU::isMAC(MI.getOpcode())) {
669     // Insert dummy unused src2_modifiers.
670     insertNamedMCOperand(MI, MCOperand::createImm(0),
671                          AMDGPU::OpName::src2_modifiers);
672   }
673 
674   if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
675       MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) {
676     // Insert dummy unused src2_modifiers.
677     insertNamedMCOperand(MI, MCOperand::createImm(0),
678                          AMDGPU::OpName::src2_modifiers);
679   }
680 
681   if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
682       !AMDGPU::hasGDS(STI)) {
683     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
684   }
685 
686   if (MCII->get(MI.getOpcode()).TSFlags &
687       (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) {
688     int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
689                                              AMDGPU::OpName::cpol);
690     if (CPolPos != -1) {
691       unsigned CPol =
692           (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
693               AMDGPU::CPol::GLC : 0;
694       if (MI.getNumOperands() <= (unsigned)CPolPos) {
695         insertNamedMCOperand(MI, MCOperand::createImm(CPol),
696                              AMDGPU::OpName::cpol);
697       } else if (CPol) {
698         MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
699       }
700     }
701   }
702 
703   if ((MCII->get(MI.getOpcode()).TSFlags &
704        (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
705       (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
706     // GFX90A lost TFE, its place is occupied by ACC.
707     int TFEOpIdx =
708         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
709     if (TFEOpIdx != -1) {
710       auto *TFEIter = MI.begin();
711       std::advance(TFEIter, TFEOpIdx);
712       MI.insert(TFEIter, MCOperand::createImm(0));
713     }
714   }
715 
716   if (MCII->get(MI.getOpcode()).TSFlags &
717       (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
718     int SWZOpIdx =
719         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
720     if (SWZOpIdx != -1) {
721       auto *SWZIter = MI.begin();
722       std::advance(SWZIter, SWZOpIdx);
723       MI.insert(SWZIter, MCOperand::createImm(0));
724     }
725   }
726 
727   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
728     int VAddr0Idx =
729         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
730     int RsrcIdx =
731         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
732     unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
733     if (VAddr0Idx >= 0 && NSAArgs > 0) {
734       unsigned NSAWords = (NSAArgs + 3) / 4;
735       if (Bytes.size() < 4 * NSAWords)
736         return MCDisassembler::Fail;
737       for (unsigned i = 0; i < NSAArgs; ++i) {
738         const unsigned VAddrIdx = VAddr0Idx + 1 + i;
739         auto VAddrRCID =
740             MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
741         MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
742       }
743       Bytes = Bytes.slice(4 * NSAWords);
744     }
745 
746     convertMIMGInst(MI);
747   }
748 
749   if (MCII->get(MI.getOpcode()).TSFlags &
750       (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))
751     convertMIMGInst(MI);
752 
753   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)
754     convertEXPInst(MI);
755 
756   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)
757     convertVINTERPInst(MI);
758 
759   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
760     convertSDWAInst(MI);
761 
762   int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
763                                               AMDGPU::OpName::vdst_in);
764   if (VDstIn_Idx != -1) {
765     int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
766                            MCOI::OperandConstraint::TIED_TO);
767     if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
768          !MI.getOperand(VDstIn_Idx).isReg() ||
769          MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
770       if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
771         MI.erase(&MI.getOperand(VDstIn_Idx));
772       insertNamedMCOperand(MI,
773         MCOperand::createReg(MI.getOperand(Tied).getReg()),
774         AMDGPU::OpName::vdst_in);
775     }
776   }
777 
778   int ImmLitIdx =
779       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
780   bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
781   if (ImmLitIdx != -1 && !IsSOPK)
782     convertFMAanyK(MI, ImmLitIdx);
783 
784   Size = MaxInstBytesNum - Bytes.size();
785   return MCDisassembler::Success;
786 }
787 
788 void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
789   if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
790     // The MCInst still has these fields even though they are no longer encoded
791     // in the GFX11 instruction.
792     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
793     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
794   }
795 }
796 
797 void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
798   convertTrue16OpSel(MI);
799   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_t16_gfx11 ||
800       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_fake16_gfx11 ||
801       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_t16_gfx12 ||
802       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_fake16_gfx12 ||
803       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_t16_gfx11 ||
804       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_fake16_gfx11 ||
805       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_t16_gfx12 ||
806       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_fake16_gfx12 ||
807       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_t16_gfx11 ||
808       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_fake16_gfx11 ||
809       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_t16_gfx12 ||
810       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_fake16_gfx12 ||
811       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_t16_gfx11 ||
812       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_fake16_gfx11 ||
813       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_t16_gfx12 ||
814       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_fake16_gfx12) {
815     // The MCInst has this field that is not directly encoded in the
816     // instruction.
817     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
818   }
819 }
820 
821 void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
822   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
823       STI.hasFeature(AMDGPU::FeatureGFX10)) {
824     if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
825       // VOPC - insert clamp
826       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
827   } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
828     int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
829     if (SDst != -1) {
830       // VOPC - insert VCC register as sdst
831       insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC),
832                            AMDGPU::OpName::sdst);
833     } else {
834       // VOP1/2 - insert omod if present in instruction
835       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
836     }
837   }
838 }
839 
840 struct VOPModifiers {
841   unsigned OpSel = 0;
842   unsigned OpSelHi = 0;
843   unsigned NegLo = 0;
844   unsigned NegHi = 0;
845 };
846 
847 // Reconstruct values of VOP3/VOP3P operands such as op_sel.
848 // Note that these values do not affect disassembler output,
849 // so this is only necessary for consistency with src_modifiers.
850 static VOPModifiers collectVOPModifiers(const MCInst &MI,
851                                         bool IsVOP3P = false) {
852   VOPModifiers Modifiers;
853   unsigned Opc = MI.getOpcode();
854   const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
855                         AMDGPU::OpName::src1_modifiers,
856                         AMDGPU::OpName::src2_modifiers};
857   for (int J = 0; J < 3; ++J) {
858     int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
859     if (OpIdx == -1)
860       continue;
861 
862     unsigned Val = MI.getOperand(OpIdx).getImm();
863 
864     Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
865     if (IsVOP3P) {
866       Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
867       Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
868       Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
869     } else if (J == 0) {
870       Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
871     }
872   }
873 
874   return Modifiers;
875 }
876 
877 // Instructions decode the op_sel/suffix bits into the src_modifier
878 // operands. Copy those bits into the src operands for true16 VGPRs.
879 void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const {
880   const unsigned Opc = MI.getOpcode();
881   const MCRegisterClass &ConversionRC =
882       MRI.getRegClass(AMDGPU::VGPR_16RegClassID);
883   constexpr std::array<std::tuple<int, int, unsigned>, 4> OpAndOpMods = {
884       {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers,
885         SISrcMods::OP_SEL_0},
886        {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers,
887         SISrcMods::OP_SEL_0},
888        {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers,
889         SISrcMods::OP_SEL_0},
890        {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
891         SISrcMods::DST_OP_SEL}}};
892   for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {
893     int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
894     int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName);
895     if (OpIdx == -1 || OpModsIdx == -1)
896       continue;
897     MCOperand &Op = MI.getOperand(OpIdx);
898     if (!Op.isReg())
899       continue;
900     if (!ConversionRC.contains(Op.getReg()))
901       continue;
902     unsigned OpEnc = MRI.getEncodingValue(Op.getReg());
903     const MCOperand &OpMods = MI.getOperand(OpModsIdx);
904     unsigned ModVal = OpMods.getImm();
905     if (ModVal & OpSelMask) { // isHi
906       unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK;
907       Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1));
908     }
909   }
910 }
911 
912 // MAC opcodes have special old and src2 operands.
913 // src2 is tied to dst, while old is not tied (but assumed to be).
914 bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
915   constexpr int DST_IDX = 0;
916   auto Opcode = MI.getOpcode();
917   const auto &Desc = MCII->get(Opcode);
918   auto OldIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::old);
919 
920   if (OldIdx != -1 && Desc.getOperandConstraint(
921                           OldIdx, MCOI::OperandConstraint::TIED_TO) == -1) {
922     assert(AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2));
923     assert(Desc.getOperandConstraint(
924                AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2),
925                MCOI::OperandConstraint::TIED_TO) == DST_IDX);
926     (void)DST_IDX;
927     return true;
928   }
929 
930   return false;
931 }
932 
933 // Create dummy old operand and insert dummy unused src2_modifiers
934 void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
935   assert(MI.getNumOperands() + 1 < MCII->get(MI.getOpcode()).getNumOperands());
936   insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
937   insertNamedMCOperand(MI, MCOperand::createImm(0),
938                        AMDGPU::OpName::src2_modifiers);
939 }
940 
941 void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
942   unsigned Opc = MI.getOpcode();
943 
944   int VDstInIdx =
945       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
946   if (VDstInIdx != -1)
947     insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
948 
949   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
950   if (MI.getNumOperands() < DescNumOps &&
951       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
952     convertTrue16OpSel(MI);
953     auto Mods = collectVOPModifiers(MI);
954     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
955                          AMDGPU::OpName::op_sel);
956   } else {
957     // Insert dummy unused src modifiers.
958     if (MI.getNumOperands() < DescNumOps &&
959         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
960       insertNamedMCOperand(MI, MCOperand::createImm(0),
961                            AMDGPU::OpName::src0_modifiers);
962 
963     if (MI.getNumOperands() < DescNumOps &&
964         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
965       insertNamedMCOperand(MI, MCOperand::createImm(0),
966                            AMDGPU::OpName::src1_modifiers);
967   }
968 }
969 
970 void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
971   convertTrue16OpSel(MI);
972 
973   int VDstInIdx =
974       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
975   if (VDstInIdx != -1)
976     insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
977 
978   unsigned Opc = MI.getOpcode();
979   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
980   if (MI.getNumOperands() < DescNumOps &&
981       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
982     auto Mods = collectVOPModifiers(MI);
983     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
984                          AMDGPU::OpName::op_sel);
985   }
986 }
987 
988 // Note that before gfx10, the MIMG encoding provided no information about
989 // VADDR size. Consequently, decoded instructions always show address as if it
990 // has 1 dword, which could be not really so.
991 void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
992   auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
993 
994   int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
995                                            AMDGPU::OpName::vdst);
996 
997   int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
998                                             AMDGPU::OpName::vdata);
999   int VAddr0Idx =
1000       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
1001   int RsrcOpName = (TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc
1002                                                   : AMDGPU::OpName::rsrc;
1003   int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName);
1004   int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1005                                             AMDGPU::OpName::dmask);
1006 
1007   int TFEIdx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1008                                             AMDGPU::OpName::tfe);
1009   int D16Idx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1010                                             AMDGPU::OpName::d16);
1011 
1012   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1013   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1014       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
1015 
1016   assert(VDataIdx != -1);
1017   if (BaseOpcode->BVH) {
1018     // Add A16 operand for intersect_ray instructions
1019     addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
1020     return;
1021   }
1022 
1023   bool IsAtomic = (VDstIdx != -1);
1024   bool IsGather4 = TSFlags & SIInstrFlags::Gather4;
1025   bool IsVSample = TSFlags & SIInstrFlags::VSAMPLE;
1026   bool IsNSA = false;
1027   bool IsPartialNSA = false;
1028   unsigned AddrSize = Info->VAddrDwords;
1029 
1030   if (isGFX10Plus()) {
1031     unsigned DimIdx =
1032         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
1033     int A16Idx =
1034         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
1035     const AMDGPU::MIMGDimInfo *Dim =
1036         AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
1037     const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());
1038 
1039     AddrSize =
1040         AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));
1041 
1042     // VSAMPLE insts that do not use vaddr3 behave the same as NSA forms.
1043     // VIMAGE insts other than BVH never use vaddr4.
1044     IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
1045             Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA ||
1046             Info->MIMGEncoding == AMDGPU::MIMGEncGfx12;
1047     if (!IsNSA) {
1048       if (!IsVSample && AddrSize > 12)
1049         AddrSize = 16;
1050     } else {
1051       if (AddrSize > Info->VAddrDwords) {
1052         if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
1053           // The NSA encoding does not contain enough operands for the
1054           // combination of base opcode / dimension. Should this be an error?
1055           return;
1056         }
1057         IsPartialNSA = true;
1058       }
1059     }
1060   }
1061 
1062   unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
1063   unsigned DstSize = IsGather4 ? 4 : std::max(llvm::popcount(DMask), 1);
1064 
1065   bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
1066   if (D16 && AMDGPU::hasPackedD16(STI)) {
1067     DstSize = (DstSize + 1) / 2;
1068   }
1069 
1070   if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
1071     DstSize += 1;
1072 
1073   if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
1074     return;
1075 
1076   int NewOpcode =
1077       AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
1078   if (NewOpcode == -1)
1079     return;
1080 
1081   // Widen the register to the correct number of enabled channels.
1082   MCRegister NewVdata;
1083   if (DstSize != Info->VDataDwords) {
1084     auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass;
1085 
1086     // Get first subregister of VData
1087     MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg();
1088     MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
1089     Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
1090 
1091     NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
1092                                        &MRI.getRegClass(DataRCID));
1093     if (!NewVdata) {
1094       // It's possible to encode this such that the low register + enabled
1095       // components exceeds the register count.
1096       return;
1097     }
1098   }
1099 
1100   // If not using NSA on GFX10+, widen vaddr0 address register to correct size.
1101   // If using partial NSA on GFX11+ widen last address register.
1102   int VAddrSAIdx = IsPartialNSA ? (RsrcIdx - 1) : VAddr0Idx;
1103   MCRegister NewVAddrSA;
1104   if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) &&
1105       AddrSize != Info->VAddrDwords) {
1106     MCRegister VAddrSA = MI.getOperand(VAddrSAIdx).getReg();
1107     MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0);
1108     VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
1109 
1110     auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
1111     NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
1112                                         &MRI.getRegClass(AddrRCID));
1113     if (!NewVAddrSA)
1114       return;
1115   }
1116 
1117   MI.setOpcode(NewOpcode);
1118 
1119   if (NewVdata != AMDGPU::NoRegister) {
1120     MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
1121 
1122     if (IsAtomic) {
1123       // Atomic operations have an additional operand (a copy of data)
1124       MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
1125     }
1126   }
1127 
1128   if (NewVAddrSA) {
1129     MI.getOperand(VAddrSAIdx) = MCOperand::createReg(NewVAddrSA);
1130   } else if (IsNSA) {
1131     assert(AddrSize <= Info->VAddrDwords);
1132     MI.erase(MI.begin() + VAddr0Idx + AddrSize,
1133              MI.begin() + VAddr0Idx + Info->VAddrDwords);
1134   }
1135 }
1136 
1137 // Opsel and neg bits are used in src_modifiers and standalone operands. Autogen
1138 // decoder only adds to src_modifiers, so manually add the bits to the other
1139 // operands.
1140 void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
1141   unsigned Opc = MI.getOpcode();
1142   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
1143   auto Mods = collectVOPModifiers(MI, true);
1144 
1145   if (MI.getNumOperands() < DescNumOps &&
1146       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
1147     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
1148 
1149   if (MI.getNumOperands() < DescNumOps &&
1150       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel))
1151     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
1152                          AMDGPU::OpName::op_sel);
1153   if (MI.getNumOperands() < DescNumOps &&
1154       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel_hi))
1155     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
1156                          AMDGPU::OpName::op_sel_hi);
1157   if (MI.getNumOperands() < DescNumOps &&
1158       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_lo))
1159     insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
1160                          AMDGPU::OpName::neg_lo);
1161   if (MI.getNumOperands() < DescNumOps &&
1162       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
1163     insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
1164                          AMDGPU::OpName::neg_hi);
1165 }
1166 
1167 // Create dummy old operand and insert optional operands
1168 void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
1169   unsigned Opc = MI.getOpcode();
1170   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
1171 
1172   if (MI.getNumOperands() < DescNumOps &&
1173       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::old))
1174     insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
1175 
1176   if (MI.getNumOperands() < DescNumOps &&
1177       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
1178     insertNamedMCOperand(MI, MCOperand::createImm(0),
1179                          AMDGPU::OpName::src0_modifiers);
1180 
1181   if (MI.getNumOperands() < DescNumOps &&
1182       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
1183     insertNamedMCOperand(MI, MCOperand::createImm(0),
1184                          AMDGPU::OpName::src1_modifiers);
1185 }
1186 
1187 void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const {
1188   assert(HasLiteral && "Should have decoded a literal");
1189   const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
1190   unsigned DescNumOps = Desc.getNumOperands();
1191   insertNamedMCOperand(MI, MCOperand::createImm(Literal),
1192                        AMDGPU::OpName::immDeferred);
1193   assert(DescNumOps == MI.getNumOperands());
1194   for (unsigned I = 0; I < DescNumOps; ++I) {
1195     auto &Op = MI.getOperand(I);
1196     auto OpType = Desc.operands()[I].OperandType;
1197     bool IsDeferredOp = (OpType == AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED ||
1198                          OpType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED);
1199     if (Op.isImm() && Op.getImm() == AMDGPU::EncValues::LITERAL_CONST &&
1200         IsDeferredOp)
1201       Op.setImm(Literal);
1202   }
1203 }
1204 
1205 const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
1206   return getContext().getRegisterInfo()->
1207     getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
1208 }
1209 
1210 inline
1211 MCOperand AMDGPUDisassembler::errOperand(unsigned V,
1212                                          const Twine& ErrMsg) const {
1213   *CommentStream << "Error: " + ErrMsg;
1214 
1215   // ToDo: add support for error operands to MCInst.h
1216   // return MCOperand::createError(V);
1217   return MCOperand();
1218 }
1219 
1220 inline
1221 MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
1222   return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
1223 }
1224 
1225 inline
1226 MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
1227                                                unsigned Val) const {
1228   const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
1229   if (Val >= RegCl.getNumRegs())
1230     return errOperand(Val, Twine(getRegClassName(RegClassID)) +
1231                            ": unknown register " + Twine(Val));
1232   return createRegOperand(RegCl.getRegister(Val));
1233 }
1234 
1235 inline
1236 MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
1237                                                 unsigned Val) const {
1238   // ToDo: SI/CI have 104 SGPRs, VI - 102
1239   // Valery: here we accepting as much as we can, let assembler sort it out
1240   int shift = 0;
1241   switch (SRegClassID) {
1242   case AMDGPU::SGPR_32RegClassID:
1243   case AMDGPU::TTMP_32RegClassID:
1244     break;
1245   case AMDGPU::SGPR_64RegClassID:
1246   case AMDGPU::TTMP_64RegClassID:
1247     shift = 1;
1248     break;
1249   case AMDGPU::SGPR_96RegClassID:
1250   case AMDGPU::TTMP_96RegClassID:
1251   case AMDGPU::SGPR_128RegClassID:
1252   case AMDGPU::TTMP_128RegClassID:
1253   // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
1254   // this bundle?
1255   case AMDGPU::SGPR_256RegClassID:
1256   case AMDGPU::TTMP_256RegClassID:
1257     // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
1258   // this bundle?
1259   case AMDGPU::SGPR_288RegClassID:
1260   case AMDGPU::TTMP_288RegClassID:
1261   case AMDGPU::SGPR_320RegClassID:
1262   case AMDGPU::TTMP_320RegClassID:
1263   case AMDGPU::SGPR_352RegClassID:
1264   case AMDGPU::TTMP_352RegClassID:
1265   case AMDGPU::SGPR_384RegClassID:
1266   case AMDGPU::TTMP_384RegClassID:
1267   case AMDGPU::SGPR_512RegClassID:
1268   case AMDGPU::TTMP_512RegClassID:
1269     shift = 2;
1270     break;
1271   // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
1272   // this bundle?
1273   default:
1274     llvm_unreachable("unhandled register class");
1275   }
1276 
1277   if (Val % (1 << shift)) {
1278     *CommentStream << "Warning: " << getRegClassName(SRegClassID)
1279                    << ": scalar reg isn't aligned " << Val;
1280   }
1281 
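  // For illustration of the shift logic above: with SGPR_64RegClassID an
  // encoded Val of 4 is halved to register index 2, i.e. s[4:5]; an odd Val
  // such as 5 triggers the alignment warning and still decodes to s[4:5]
  // after the shift rounds it down.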
1282   return createRegOperand(SRegClassID, Val >> shift);
1283 }
1284 
1285 MCOperand AMDGPUDisassembler::createVGPR16Operand(unsigned RegIdx,
1286                                                   bool IsHi) const {
1287   unsigned RegIdxInVGPR16 = RegIdx * 2 + (IsHi ? 1 : 0);
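  // E.g., RegIdx = 3 with IsHi = true selects index 7 in the VGPR_16 class,
  // i.e. the high 16-bit half of v3, assuming the class lists the low and high
  // halves of each VGPR consecutively as this indexing implies.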
1288   return createRegOperand(AMDGPU::VGPR_16RegClassID, RegIdxInVGPR16);
1289 }
1290 
1291 // Decode literals for instructions that always have a literal in the encoding.
1292 MCOperand
1293 AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
1294   if (HasLiteral) {
1295     assert(
1296         AMDGPU::hasVOPD(STI) &&
1297         "Should only decode multiple kimm with VOPD, check VSrc operand types");
1298     if (Literal != Val)
1299       return errOperand(Val, "More than one unique literal is illegal");
1300   }
1301   HasLiteral = true;
1302   Literal = Val;
1303   return MCOperand::createImm(Literal);
1304 }
1305 
1306 MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
1307   // For now all literal constants are supposed to be unsigned integers.
1308   // ToDo: deal with signed/unsigned 64-bit integer constants
1309   // ToDo: deal with float/double constants
1310   if (!HasLiteral) {
1311     if (Bytes.size() < 4) {
1312       return errOperand(0, "cannot read literal, inst bytes left " +
1313                         Twine(Bytes.size()));
1314     }
1315     HasLiteral = true;
1316     Literal = Literal64 = eatBytes<uint32_t>(Bytes);
1317     if (ExtendFP64)
1318       Literal64 <<= 32;
1319   }
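  // For illustration, assuming the FP64 literal convention where the 32-bit
  // literal supplies the high dword: reading 0x3FF00000 with ExtendFP64 set
  // yields 0x3FF0000000000000, i.e. 1.0 as a double.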
1320   return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
1321 }
1322 
1323 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
1324   using namespace AMDGPU::EncValues;
1325 
1326   assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
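  // Worked example, assuming the standard inline-constant encoding values
  // (128 encodes 0, 129..192 encode 1..64, 193..208 encode -1..-16):
  // Imm = 129 decodes to 1, Imm = 193 to -1, and Imm = 208 to -16.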
1327   return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
1328     (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
1329     (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
1330       // Cast prevents negative overflow.
1331 }
1332 
1333 static int64_t getInlineImmVal32(unsigned Imm) {
1334   switch (Imm) {
1335   case 240:
1336     return llvm::bit_cast<uint32_t>(0.5f);
1337   case 241:
1338     return llvm::bit_cast<uint32_t>(-0.5f);
1339   case 242:
1340     return llvm::bit_cast<uint32_t>(1.0f);
1341   case 243:
1342     return llvm::bit_cast<uint32_t>(-1.0f);
1343   case 244:
1344     return llvm::bit_cast<uint32_t>(2.0f);
1345   case 245:
1346     return llvm::bit_cast<uint32_t>(-2.0f);
1347   case 246:
1348     return llvm::bit_cast<uint32_t>(4.0f);
1349   case 247:
1350     return llvm::bit_cast<uint32_t>(-4.0f);
1351   case 248: // 1 / (2 * PI)
1352     return 0x3e22f983;
1353   default:
1354     llvm_unreachable("invalid fp inline imm");
1355   }
1356 }
1357 
1358 static int64_t getInlineImmVal64(unsigned Imm) {
1359   switch (Imm) {
1360   case 240:
1361     return llvm::bit_cast<uint64_t>(0.5);
1362   case 241:
1363     return llvm::bit_cast<uint64_t>(-0.5);
1364   case 242:
1365     return llvm::bit_cast<uint64_t>(1.0);
1366   case 243:
1367     return llvm::bit_cast<uint64_t>(-1.0);
1368   case 244:
1369     return llvm::bit_cast<uint64_t>(2.0);
1370   case 245:
1371     return llvm::bit_cast<uint64_t>(-2.0);
1372   case 246:
1373     return llvm::bit_cast<uint64_t>(4.0);
1374   case 247:
1375     return llvm::bit_cast<uint64_t>(-4.0);
1376   case 248: // 1 / (2 * PI)
1377     return 0x3fc45f306dc9c882;
1378   default:
1379     llvm_unreachable("invalid fp inline imm");
1380   }
1381 }
1382 
1383 static int64_t getInlineImmValF16(unsigned Imm) {
1384   switch (Imm) {
1385   case 240:
1386     return 0x3800;
1387   case 241:
1388     return 0xB800;
1389   case 242:
1390     return 0x3C00;
1391   case 243:
1392     return 0xBC00;
1393   case 244:
1394     return 0x4000;
1395   case 245:
1396     return 0xC000;
1397   case 246:
1398     return 0x4400;
1399   case 247:
1400     return 0xC400;
1401   case 248: // 1 / (2 * PI)
1402     return 0x3118;
1403   default:
1404     llvm_unreachable("invalid fp inline imm");
1405   }
1406 }
1407 
1408 static int64_t getInlineImmValBF16(unsigned Imm) {
1409   switch (Imm) {
1410   case 240:
1411     return 0x3F00;
1412   case 241:
1413     return 0xBF00;
1414   case 242:
1415     return 0x3F80;
1416   case 243:
1417     return 0xBF80;
1418   case 244:
1419     return 0x4000;
1420   case 245:
1421     return 0xC000;
1422   case 246:
1423     return 0x4080;
1424   case 247:
1425     return 0xC080;
1426   case 248: // 1 / (2 * PI)
1427     return 0x3E22;
1428   default:
1429     llvm_unreachable("invalid fp inline imm");
1430   }
1431 }
1432 
1433 static int64_t getInlineImmVal16(unsigned Imm, AMDGPU::OperandSemantics Sema) {
1434   return (Sema == AMDGPU::OperandSemantics::BF16) ? getInlineImmValBF16(Imm)
1435                                                   : getInlineImmValF16(Imm);
1436 }
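
// Illustrative cross-check of the tables above: encoding 242 always denotes
// 1.0 in the operand's format: 0x3F800000 as f32, 0x3FF0000000000000 as f64,
// 0x3C00 as f16, and 0x3F80 as bf16.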
1437 
1438 MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm,
1439                                             AMDGPU::OperandSemantics Sema) {
1440   assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN &&
1441          Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
1442 
1443   // ToDo: case 248: 1/(2*PI) is allowed only on VI.
1444   // ImmWidth 0 is the default case where the operand should not allow
1445   // immediates. The Imm value is still decoded into a 32-bit immediate
1446   // operand; the instruction printer uses it to print a verbose error message.
1447   switch (ImmWidth) {
1448   case 0:
1449   case 32:
1450     return MCOperand::createImm(getInlineImmVal32(Imm));
1451   case 64:
1452     return MCOperand::createImm(getInlineImmVal64(Imm));
1453   case 16:
1454     return MCOperand::createImm(getInlineImmVal16(Imm, Sema));
1455   default:
1456     llvm_unreachable("implement me");
1457   }
1458 }
1459 
1460 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
1461   using namespace AMDGPU;
1462 
1463   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1464   switch (Width) {
1465   default: // fall
1466   case OPW32:
1467   case OPW16:
1468   case OPWV216:
1469     return VGPR_32RegClassID;
1470   case OPW64:
1471   case OPWV232: return VReg_64RegClassID;
1472   case OPW96: return VReg_96RegClassID;
1473   case OPW128: return VReg_128RegClassID;
1474   case OPW160: return VReg_160RegClassID;
1475   case OPW256: return VReg_256RegClassID;
1476   case OPW288: return VReg_288RegClassID;
1477   case OPW320: return VReg_320RegClassID;
1478   case OPW352: return VReg_352RegClassID;
1479   case OPW384: return VReg_384RegClassID;
1480   case OPW512: return VReg_512RegClassID;
1481   case OPW1024: return VReg_1024RegClassID;
1482   }
1483 }
1484 
1485 unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
1486   using namespace AMDGPU;
1487 
1488   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1489   switch (Width) {
1490   default: // fall
1491   case OPW32:
1492   case OPW16:
1493   case OPWV216:
1494     return AGPR_32RegClassID;
1495   case OPW64:
1496   case OPWV232: return AReg_64RegClassID;
1497   case OPW96: return AReg_96RegClassID;
1498   case OPW128: return AReg_128RegClassID;
1499   case OPW160: return AReg_160RegClassID;
1500   case OPW256: return AReg_256RegClassID;
1501   case OPW288: return AReg_288RegClassID;
1502   case OPW320: return AReg_320RegClassID;
1503   case OPW352: return AReg_352RegClassID;
1504   case OPW384: return AReg_384RegClassID;
1505   case OPW512: return AReg_512RegClassID;
1506   case OPW1024: return AReg_1024RegClassID;
1507   }
1508 }
1509 
1510 
1511 unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
1512   using namespace AMDGPU;
1513 
1514   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1515   switch (Width) {
1516   default: // fall
1517   case OPW32:
1518   case OPW16:
1519   case OPWV216:
1520     return SGPR_32RegClassID;
1521   case OPW64:
1522   case OPWV232: return SGPR_64RegClassID;
1523   case OPW96: return SGPR_96RegClassID;
1524   case OPW128: return SGPR_128RegClassID;
1525   case OPW160: return SGPR_160RegClassID;
1526   case OPW256: return SGPR_256RegClassID;
1527   case OPW288: return SGPR_288RegClassID;
1528   case OPW320: return SGPR_320RegClassID;
1529   case OPW352: return SGPR_352RegClassID;
1530   case OPW384: return SGPR_384RegClassID;
1531   case OPW512: return SGPR_512RegClassID;
1532   }
1533 }
1534 
1535 unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
1536   using namespace AMDGPU;
1537 
1538   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1539   switch (Width) {
1540   default: // fall
1541   case OPW32:
1542   case OPW16:
1543   case OPWV216:
1544     return TTMP_32RegClassID;
1545   case OPW64:
1546   case OPWV232: return TTMP_64RegClassID;
1547   case OPW128: return TTMP_128RegClassID;
1548   case OPW256: return TTMP_256RegClassID;
1549   case OPW288: return TTMP_288RegClassID;
1550   case OPW320: return TTMP_320RegClassID;
1551   case OPW352: return TTMP_352RegClassID;
1552   case OPW384: return TTMP_384RegClassID;
1553   case OPW512: return TTMP_512RegClassID;
1554   }
1555 }
1556 
1557 int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
1558   using namespace AMDGPU::EncValues;
1559 
1560   unsigned TTmpMin = isGFX9Plus() ? TTMP_GFX9PLUS_MIN : TTMP_VI_MIN;
1561   unsigned TTmpMax = isGFX9Plus() ? TTMP_GFX9PLUS_MAX : TTMP_VI_MAX;
1562 
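  // E.g., assuming the gfx9+ trap-temporary window starts at encoding 108,
  // a Val of 110 maps to ttmp2; anything outside the window returns -1.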
1563   return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
1564 }
1565 
1566 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
1567                                           bool MandatoryLiteral,
1568                                           unsigned ImmWidth,
1569                                           AMDGPU::OperandSemantics Sema) const {
1570   using namespace AMDGPU::EncValues;
1571 
1572   assert(Val < 1024); // enum10
1573 
1574   bool IsAGPR = Val & 512;
1575   Val &= 511;
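  // For illustration, assuming the usual src encoding where VGPRs occupy
  // 256..511: Val = 261 decodes to v5, Val = 773 (512 + 261) decodes to a5,
  // and anything below 256 falls through to the scalar/constant path below.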
1576 
1577   if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
1578     return createRegOperand(IsAGPR ? getAgprClassId(Width)
1579                                    : getVgprClassId(Width), Val - VGPR_MIN);
1580   }
1581   return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
1582                             Sema);
1583 }
1584 
1585 MCOperand
1586 AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
1587                                        bool MandatoryLiteral, unsigned ImmWidth,
1588                                        AMDGPU::OperandSemantics Sema) const {
1589   // Cases where Val{8} is 1 (VGPR, AGPR, or true-16 VGPR) should have been
1590   // decoded earlier.
1591   assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
1592   using namespace AMDGPU::EncValues;
1593 
1594   if (Val <= SGPR_MAX) {
1595     // "SGPR_MIN <= Val" is always true and causes a compilation warning.
1596     static_assert(SGPR_MIN == 0);
1597     return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
1598   }
1599 
1600   int TTmpIdx = getTTmpIdx(Val);
1601   if (TTmpIdx >= 0) {
1602     return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
1603   }
1604 
1605   if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
1606     return decodeIntImmed(Val);
1607 
1608   if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
1609     return decodeFPImmed(ImmWidth, Val, Sema);
1610 
1611   if (Val == LITERAL_CONST) {
1612     if (MandatoryLiteral)
1613       // Keep a sentinel value for deferred setting
1614       return MCOperand::createImm(LITERAL_CONST);
1615     return decodeLiteralConstant(Sema == AMDGPU::OperandSemantics::FP64);
1616   }
1617 
1618   switch (Width) {
1619   case OPW32:
1620   case OPW16:
1621   case OPWV216:
1622     return decodeSpecialReg32(Val);
1623   case OPW64:
1624   case OPWV232:
1625     return decodeSpecialReg64(Val);
1626   default:
1627     llvm_unreachable("unexpected immediate type");
1628   }
1629 }
1630 
1631 // Bit 0 of DstY isn't stored in the instruction, because it's always the
1632 // opposite of bit 0 of DstX.
1633 MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
1634                                                unsigned Val) const {
1635   int VDstXInd =
1636       AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
1637   assert(VDstXInd != -1);
1638   assert(Inst.getOperand(VDstXInd).isReg());
1639   unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
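  // E.g., if vdstX decoded to v2 (an even encoding), the OR below forces bit 0
  // of DstY to 1, giving an odd Y destination; an odd vdstX leaves the stored
  // bits unchanged.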
1640   Val |= ~XDstReg & 1;
1641   auto Width = llvm::AMDGPUDisassembler::OPW32;
1642   return createRegOperand(getVgprClassId(Width), Val);
1643 }
1644 
1645 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
1646   using namespace AMDGPU;
1647 
1648   switch (Val) {
1649   // clang-format off
1650   case 102: return createRegOperand(FLAT_SCR_LO);
1651   case 103: return createRegOperand(FLAT_SCR_HI);
1652   case 104: return createRegOperand(XNACK_MASK_LO);
1653   case 105: return createRegOperand(XNACK_MASK_HI);
1654   case 106: return createRegOperand(VCC_LO);
1655   case 107: return createRegOperand(VCC_HI);
1656   case 108: return createRegOperand(TBA_LO);
1657   case 109: return createRegOperand(TBA_HI);
1658   case 110: return createRegOperand(TMA_LO);
1659   case 111: return createRegOperand(TMA_HI);
1660   case 124:
1661     return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
1662   case 125:
1663     return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
1664   case 126: return createRegOperand(EXEC_LO);
1665   case 127: return createRegOperand(EXEC_HI);
1666   case 235: return createRegOperand(SRC_SHARED_BASE_LO);
1667   case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
1668   case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
1669   case 238: return createRegOperand(SRC_PRIVATE_LIMIT_LO);
1670   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1671   case 251: return createRegOperand(SRC_VCCZ);
1672   case 252: return createRegOperand(SRC_EXECZ);
1673   case 253: return createRegOperand(SRC_SCC);
1674   case 254: return createRegOperand(LDS_DIRECT);
1675   default: break;
1676     // clang-format on
1677   }
1678   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1679 }
1680 
1681 MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
1682   using namespace AMDGPU;
1683 
1684   switch (Val) {
1685   case 102: return createRegOperand(FLAT_SCR);
1686   case 104: return createRegOperand(XNACK_MASK);
1687   case 106: return createRegOperand(VCC);
1688   case 108: return createRegOperand(TBA);
1689   case 110: return createRegOperand(TMA);
1690   case 124:
1691     if (isGFX11Plus())
1692       return createRegOperand(SGPR_NULL);
1693     break;
1694   case 125:
1695     if (!isGFX11Plus())
1696       return createRegOperand(SGPR_NULL);
1697     break;
1698   case 126: return createRegOperand(EXEC);
1699   case 235: return createRegOperand(SRC_SHARED_BASE);
1700   case 236: return createRegOperand(SRC_SHARED_LIMIT);
1701   case 237: return createRegOperand(SRC_PRIVATE_BASE);
1702   case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
1703   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1704   case 251: return createRegOperand(SRC_VCCZ);
1705   case 252: return createRegOperand(SRC_EXECZ);
1706   case 253: return createRegOperand(SRC_SCC);
1707   default: break;
1708   }
1709   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1710 }
1711 
1712 MCOperand
1713 AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, const unsigned Val,
1714                                   unsigned ImmWidth,
1715                                   AMDGPU::OperandSemantics Sema) const {
1716   using namespace AMDGPU::SDWA;
1717   using namespace AMDGPU::EncValues;
1718 
1719   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
1720       STI.hasFeature(AMDGPU::FeatureGFX10)) {
1721     // XXX: the cast to int is needed to avoid a warning:
1722     // "comparison with unsigned is always true".
1723     if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
1724         Val <= SDWA9EncValues::SRC_VGPR_MAX) {
1725       return createRegOperand(getVgprClassId(Width),
1726                               Val - SDWA9EncValues::SRC_VGPR_MIN);
1727     }
1728     if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
1729         Val <= (isGFX10Plus() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
1730                               : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
1731       return createSRegOperand(getSgprClassId(Width),
1732                                Val - SDWA9EncValues::SRC_SGPR_MIN);
1733     }
1734     if (SDWA9EncValues::SRC_TTMP_MIN <= Val &&
1735         Val <= SDWA9EncValues::SRC_TTMP_MAX) {
1736       return createSRegOperand(getTtmpClassId(Width),
1737                                Val - SDWA9EncValues::SRC_TTMP_MIN);
1738     }
1739 
1740     const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
1741 
1742     if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
1743       return decodeIntImmed(SVal);
1744 
1745     if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
1746       return decodeFPImmed(ImmWidth, SVal, Sema);
1747 
1748     return decodeSpecialReg32(SVal);
1749   }
1750   if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands))
1751     return createRegOperand(getVgprClassId(Width), Val);
1752   llvm_unreachable("unsupported target");
1753 }
1754 
1755 MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
1756   return decodeSDWASrc(OPW16, Val, 16, AMDGPU::OperandSemantics::FP16);
1757 }
1758 
1759 MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
1760   return decodeSDWASrc(OPW32, Val, 32, AMDGPU::OperandSemantics::FP32);
1761 }
1762 
1763 MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
1764   using namespace AMDGPU::SDWA;
1765 
1766   assert((STI.hasFeature(AMDGPU::FeatureGFX9) ||
1767           STI.hasFeature(AMDGPU::FeatureGFX10)) &&
1768          "SDWAVopcDst should be present only on GFX9+");
1769 
1770   bool IsWave64 = STI.hasFeature(AMDGPU::FeatureWavefrontSize64);
1771 
1772   if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
1773     Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
1774 
1775     int TTmpIdx = getTTmpIdx(Val);
1776     if (TTmpIdx >= 0) {
1777       auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
1778       return createSRegOperand(TTmpClsId, TTmpIdx);
1779     }
1780     if (Val > SGPR_MAX) {
1781       return IsWave64 ? decodeSpecialReg64(Val) : decodeSpecialReg32(Val);
1782     }
1783     return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
1784   }
1785   return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
1786 }
1787 
1788 MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
1789   return STI.hasFeature(AMDGPU::FeatureWavefrontSize64)
1790              ? decodeSrcOp(OPW64, Val)
1791              : decodeSrcOp(OPW32, Val);
1792 }
1793 
1794 MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
1795   return decodeSrcOp(OPW32, Val);
1796 }
1797 
1798 MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
1799   if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1)
1800     return MCOperand();
1801   return MCOperand::createImm(Val);
1802 }
1803 
1804 MCOperand AMDGPUDisassembler::decodeVersionImm(unsigned Imm) const {
1805   using VersionField = AMDGPU::EncodingField<7, 0>;
1806   using W64Bit = AMDGPU::EncodingBit<13>;
1807   using W32Bit = AMDGPU::EncodingBit<14>;
1808   using MDPBit = AMDGPU::EncodingBit<15>;
1809   using Encoding = AMDGPU::EncodingFields<VersionField, W64Bit, W32Bit, MDPBit>;
1810 
1811   auto [Version, W64, W32, MDP] = Encoding::decode(Imm);
1812 
1813   // Decode into a plain immediate if any unused bits are set.
1814   if (Encoding::encode(Version, W64, W32, MDP) != Imm)
1815     return MCOperand::createImm(Imm);
1816 
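  // For illustration (the symbol spelling below is only an assumption; the
  // real names come from getGFXVersions()): an Imm whose low byte matches a
  // known version code and whose bit 14 is set decodes to an expression of the
  // form <gfx version symbol> | UC_VERSION_W32_BIT.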
1817   const auto &Versions = AMDGPU::UCVersion::getGFXVersions();
1818   const auto *I = find_if(
1819       Versions, [Version = Version](const AMDGPU::UCVersion::GFXVersion &V) {
1820         return V.Code == Version;
1821       });
1822   MCContext &Ctx = getContext();
1823   const MCExpr *E;
1824   if (I == Versions.end())
1825     E = MCConstantExpr::create(Version, Ctx);
1826   else
1827     E = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(I->Symbol), Ctx);
1828 
1829   if (W64)
1830     E = MCBinaryExpr::createOr(E, UCVersionW64Expr, Ctx);
1831   if (W32)
1832     E = MCBinaryExpr::createOr(E, UCVersionW32Expr, Ctx);
1833   if (MDP)
1834     E = MCBinaryExpr::createOr(E, UCVersionMDPExpr, Ctx);
1835 
1836   return MCOperand::createExpr(E);
1837 }
1838 
1839 bool AMDGPUDisassembler::isVI() const {
1840   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
1841 }
1842 
1843 bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
1844 
1845 bool AMDGPUDisassembler::isGFX90A() const {
1846   return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
1847 }
1848 
1849 bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
1850 
1851 bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
1852 
1853 bool AMDGPUDisassembler::isGFX10Plus() const {
1854   return AMDGPU::isGFX10Plus(STI);
1855 }
1856 
1857 bool AMDGPUDisassembler::isGFX11() const {
1858   return STI.hasFeature(AMDGPU::FeatureGFX11);
1859 }
1860 
1861 bool AMDGPUDisassembler::isGFX11Plus() const {
1862   return AMDGPU::isGFX11Plus(STI);
1863 }
1864 
1865 bool AMDGPUDisassembler::isGFX12() const {
1866   return STI.hasFeature(AMDGPU::FeatureGFX12);
1867 }
1868 
1869 bool AMDGPUDisassembler::isGFX12Plus() const {
1870   return AMDGPU::isGFX12Plus(STI);
1871 }
1872 
1873 bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
1874   return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
1875 }
1876 
1877 bool AMDGPUDisassembler::hasKernargPreload() const {
1878   return AMDGPU::hasKernargPreload(STI);
1879 }
1880 
1881 //===----------------------------------------------------------------------===//
1882 // AMDGPU specific symbol handling
1883 //===----------------------------------------------------------------------===//
1884 
1885 /// Produce a string describing the reserved bit range specified by Mask with
1886 /// offset BaseBytes, for use in error comments. Mask is a single contiguous
1887 /// range of 1s surrounded by zeros. The format here is meant to align with the
1888 /// tables that describe these bits in llvm.org/docs/AMDGPUUsage.html.
1889 static SmallString<32> getBitRangeFromMask(uint32_t Mask, unsigned BaseBytes) {
1890   SmallString<32> Result;
1891   raw_svector_ostream S(Result);
1892 
1893   int TrailingZeros = llvm::countr_zero(Mask);
1894   int PopCount = llvm::popcount(Mask);
1895 
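  // Worked example: Mask = 0x3F0 with BaseBytes = 0 has four trailing zeros
  // and a popcount of six, so this produces "bits in range (9:4)".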
1896   if (PopCount == 1) {
1897     S << "bit (" << (TrailingZeros + BaseBytes * CHAR_BIT) << ')';
1898   } else {
1899     S << "bits in range ("
1900       << (TrailingZeros + PopCount - 1 + BaseBytes * CHAR_BIT) << ':'
1901       << (TrailingZeros + BaseBytes * CHAR_BIT) << ')';
1902   }
1903 
1904   return Result;
1905 }
1906 
1907 #define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK))
1908 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
1909   do {                                                                         \
1910     KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n';            \
1911   } while (0)
1912 #define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK)                        \
1913   do {                                                                         \
1914     KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " "       \
1915              << GET_FIELD(MASK) << '\n';                                       \
1916   } while (0)
1917 
1918 #define CHECK_RESERVED_BITS_IMPL(MASK, DESC, MSG)                              \
1919   do {                                                                         \
1920     if (FourByteBuffer & (MASK)) {                                             \
1921       return createStringError(std::errc::invalid_argument,                    \
1922                                "kernel descriptor " DESC                       \
1923                                " reserved %s set" MSG,                         \
1924                                getBitRangeFromMask((MASK), 0).c_str());        \
1925     }                                                                          \
1926   } while (0)
1927 
1928 #define CHECK_RESERVED_BITS(MASK) CHECK_RESERVED_BITS_IMPL(MASK, #MASK, "")
1929 #define CHECK_RESERVED_BITS_MSG(MASK, MSG)                                     \
1930   CHECK_RESERVED_BITS_IMPL(MASK, #MASK, ", " MSG)
1931 #define CHECK_RESERVED_BITS_DESC(MASK, DESC)                                   \
1932   CHECK_RESERVED_BITS_IMPL(MASK, DESC, "")
1933 #define CHECK_RESERVED_BITS_DESC_MSG(MASK, DESC, MSG)                          \
1934   CHECK_RESERVED_BITS_IMPL(MASK, DESC, ", " MSG)
1935 
1936 // NOLINTNEXTLINE(readability-identifier-naming)
1937 Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
1938     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1939   using namespace amdhsa;
1940   StringRef Indent = "\t";
1941 
1942   // We cannot accurately backward compute #VGPRs used from
1943   // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same
1944   // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we
1945   // simply calculate the inverse of what the assembler does.
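  // For illustration (assuming a VGPR encoding granule of 4, e.g. wave64):
  // an encoded GRANULATED_WORKITEM_VGPR_COUNT of 5 is printed back as
  // ".amdhsa_next_free_vgpr 24", i.e. (5 + 1) * 4.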
1946 
1947   uint32_t GranulatedWorkitemVGPRCount =
1948       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT);
1949 
1950   uint32_t NextFreeVGPR =
1951       (GranulatedWorkitemVGPRCount + 1) *
1952       AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32);
1953 
1954   KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
1955 
1956   // We cannot backward compute values used to calculate
1957   // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following
1958   // directives can't be computed:
1959   // .amdhsa_reserve_vcc
1960   // .amdhsa_reserve_flat_scratch
1961   // .amdhsa_reserve_xnack_mask
1962   // They take their respective default values if not specified in the assembly.
1963   //
1964   // GRANULATED_WAVEFRONT_SGPR_COUNT
1965   //    = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
1966   //
1967   // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
1968   // are set to 0. So while disassembling we consider that:
1969   //
1970   // GRANULATED_WAVEFRONT_SGPR_COUNT
1971   //    = f(NEXT_FREE_SGPR + 0 + 0 + 0)
1972   //
1973   // The disassembler cannot recover the original values of those 3 directives.
1974 
1975   uint32_t GranulatedWavefrontSGPRCount =
1976       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT);
1977 
1978   if (isGFX10Plus())
1979     CHECK_RESERVED_BITS_MSG(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
1980                             "must be zero on gfx10+");
1981 
1982   uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
1983                           AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
1984 
1985   KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
1986   if (!hasArchitectedFlatScratch())
1987     KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
1988   KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
1989   KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
1990 
1991   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_PRIORITY);
1992 
1993   PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
1994                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
1995   PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
1996                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
1997   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
1998                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
1999   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
2000                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
2001 
2002   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_PRIV);
2003 
2004   if (!isGFX12Plus())
2005     PRINT_DIRECTIVE(".amdhsa_dx10_clamp",
2006                     COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
2007 
2008   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_DEBUG_MODE);
2009 
2010   if (!isGFX12Plus())
2011     PRINT_DIRECTIVE(".amdhsa_ieee_mode",
2012                     COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
2013 
2014   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_BULKY);
2015   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_CDBG_USER);
2016 
2017   if (isGFX9Plus())
2018     PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL);
2019 
2020   if (!isGFX9Plus())
2021     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0,
2022                                  "COMPUTE_PGM_RSRC1", "must be zero pre-gfx9");
2023 
2024   CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC1_RESERVED1, "COMPUTE_PGM_RSRC1");
2025 
2026   if (!isGFX10Plus())
2027     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2,
2028                                  "COMPUTE_PGM_RSRC1", "must be zero pre-gfx10");
2029 
2030   if (isGFX10Plus()) {
2031     PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
2032                     COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
2033     PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
2034     PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
2035   }
2036 
2037   if (isGFX12Plus())
2038     PRINT_DIRECTIVE(".amdhsa_round_robin_scheduling",
2039                     COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
2040 
2041   return true;
2042 }
2043 
2044 // NOLINTNEXTLINE(readability-identifier-naming)
2045 Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
2046     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
2047   using namespace amdhsa;
2048   StringRef Indent = "\t";
2049   if (hasArchitectedFlatScratch())
2050     PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
2051                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
2052   else
2053     PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
2054                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
2055   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
2056                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
2057   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
2058                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
2059   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
2060                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
2061   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
2062                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
2063   PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
2064                   COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
2065 
2066   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH);
2067   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY);
2068   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE);
2069 
2070   PRINT_DIRECTIVE(
2071       ".amdhsa_exception_fp_ieee_invalid_op",
2072       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
2073   PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
2074                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
2075   PRINT_DIRECTIVE(
2076       ".amdhsa_exception_fp_ieee_div_zero",
2077       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
2078   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
2079                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
2080   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
2081                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
2082   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
2083                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
2084   PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
2085                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
2086 
2087   CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC2_RESERVED0, "COMPUTE_PGM_RSRC2");
2088 
2089   return true;
2090 }
2091 
2092 // NOLINTNEXTLINE(readability-identifier-naming)
2093 Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
2094     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
2095   using namespace amdhsa;
2096   StringRef Indent = "\t";
2097   if (isGFX90A()) {
2098     KdStream << Indent << ".amdhsa_accum_offset "
2099              << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
2100              << '\n';
2101 
2102     PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
2103 
2104     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX90A_RESERVED0,
2105                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx90a");
2106     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX90A_RESERVED1,
2107                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx90a");
2108   } else if (isGFX10Plus()) {
2109     // Bits [0-3].
2110     if (!isGFX12Plus()) {
2111       if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
2112         PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
2113                         COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
2114       } else {
2115         PRINT_PSEUDO_DIRECTIVE_COMMENT(
2116             "SHARED_VGPR_COUNT",
2117             COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
2118       }
2119     } else {
2120       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0,
2121                                    "COMPUTE_PGM_RSRC3",
2122                                    "must be zero on gfx12+");
2123     }
2124 
2125     // Bits [4-11].
2126     if (isGFX11()) {
2127       PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
2128                                      COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE);
2129       PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
2130                                      COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START);
2131       PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
2132                                      COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END);
2133     } else if (isGFX12Plus()) {
2134       PRINT_PSEUDO_DIRECTIVE_COMMENT(
2135           "INST_PREF_SIZE", COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE);
2136     } else {
2137       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED1,
2138                                    "COMPUTE_PGM_RSRC3",
2139                                    "must be zero on gfx10");
2140     }
2141 
2142     // Bits [12].
2143     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2,
2144                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
2145 
2146     // Bits [13].
2147     if (isGFX12Plus()) {
2148       PRINT_PSEUDO_DIRECTIVE_COMMENT("GLG_EN",
2149                                      COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN);
2150     } else {
2151       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3,
2152                                    "COMPUTE_PGM_RSRC3",
2153                                    "must be zero on gfx10 or gfx11");
2154     }
2155 
2156     // Bits [14-30].
2157     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
2158                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
2159 
2160     // Bits [31].
2161     if (isGFX11Plus()) {
2162       PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
2163                                      COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP);
2164     } else {
2165       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED5,
2166                                    "COMPUTE_PGM_RSRC3",
2167                                    "must be zero on gfx10");
2168     }
2169   } else if (FourByteBuffer) {
2170     return createStringError(
2171         std::errc::invalid_argument,
2172         "kernel descriptor COMPUTE_PGM_RSRC3 must be all zero before gfx9");
2173   }
2174   return true;
2175 }
2176 #undef PRINT_PSEUDO_DIRECTIVE_COMMENT
2177 #undef PRINT_DIRECTIVE
2178 #undef GET_FIELD
2179 #undef CHECK_RESERVED_BITS_IMPL
2180 #undef CHECK_RESERVED_BITS
2181 #undef CHECK_RESERVED_BITS_MSG
2182 #undef CHECK_RESERVED_BITS_DESC
2183 #undef CHECK_RESERVED_BITS_DESC_MSG
2184 
2185 /// Create an error object to return from onSymbolStart for reserved kernel
2186 /// descriptor bits being set.
2187 static Error createReservedKDBitsError(uint32_t Mask, unsigned BaseBytes,
2188                                        const char *Msg = "") {
2189   return createStringError(
2190       std::errc::invalid_argument, "kernel descriptor reserved %s set%s%s",
2191       getBitRangeFromMask(Mask, BaseBytes).c_str(), *Msg ? ", " : "", Msg);
2192 }
2193 
2194 /// Create an error object to return from onSymbolStart for reserved kernel
2195 /// descriptor bytes being set.
2196 static Error createReservedKDBytesError(unsigned BaseInBytes,
2197                                         unsigned WidthInBytes) {
2198   // Create an error comment in the same format as the "Kernel Descriptor"
2199   // table here: https://llvm.org/docs/AMDGPUUsage.html#kernel-descriptor .
2200   return createStringError(
2201       std::errc::invalid_argument,
2202       "kernel descriptor reserved bits in range (%u:%u) set",
2203       (BaseInBytes + WidthInBytes) * CHAR_BIT - 1, BaseInBytes * CHAR_BIT);
2204 }
2205 
2206 Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
2207     DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
2208     raw_string_ostream &KdStream) const {
2209 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
2210   do {                                                                         \
2211     KdStream << Indent << DIRECTIVE " "                                        \
2212              << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
2213   } while (0)
2214 
2215   uint16_t TwoByteBuffer = 0;
2216   uint32_t FourByteBuffer = 0;
2217 
2218   StringRef ReservedBytes;
2219   StringRef Indent = "\t";
2220 
2221   assert(Bytes.size() == 64);
2222   DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
2223 
2224   switch (Cursor.tell()) {
2225   case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
2226     FourByteBuffer = DE.getU32(Cursor);
2227     KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
2228              << '\n';
2229     return true;
2230 
2231   case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
2232     FourByteBuffer = DE.getU32(Cursor);
2233     KdStream << Indent << ".amdhsa_private_segment_fixed_size "
2234              << FourByteBuffer << '\n';
2235     return true;
2236 
2237   case amdhsa::KERNARG_SIZE_OFFSET:
2238     FourByteBuffer = DE.getU32(Cursor);
2239     KdStream << Indent << ".amdhsa_kernarg_size "
2240              << FourByteBuffer << '\n';
2241     return true;
2242 
2243   case amdhsa::RESERVED0_OFFSET:
2244     // 4 reserved bytes, must be 0.
2245     ReservedBytes = DE.getBytes(Cursor, 4);
2246     for (int I = 0; I < 4; ++I) {
2247       if (ReservedBytes[I] != 0)
2248         return createReservedKDBytesError(amdhsa::RESERVED0_OFFSET, 4);
2249     }
2250     return true;
2251 
2252   case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
2253     // KERNEL_CODE_ENTRY_BYTE_OFFSET
2254     // So far no directive controls this for Code Object V3, so simply skip for
2255     // disassembly.
2256     DE.skip(Cursor, 8);
2257     return true;
2258 
2259   case amdhsa::RESERVED1_OFFSET:
2260     // 20 reserved bytes, must be 0.
2261     ReservedBytes = DE.getBytes(Cursor, 20);
2262     for (int I = 0; I < 20; ++I) {
2263       if (ReservedBytes[I] != 0)
2264         return createReservedKDBytesError(amdhsa::RESERVED1_OFFSET, 20);
2265     }
2266     return true;
2267 
2268   case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
2269     FourByteBuffer = DE.getU32(Cursor);
2270     return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream);
2271 
2272   case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
2273     FourByteBuffer = DE.getU32(Cursor);
2274     return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream);
2275 
2276   case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
2277     FourByteBuffer = DE.getU32(Cursor);
2278     return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream);
2279 
2280   case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
2281     using namespace amdhsa;
2282     TwoByteBuffer = DE.getU16(Cursor);
2283 
2284     if (!hasArchitectedFlatScratch())
2285       PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
2286                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
2287     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
2288                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
2289     PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
2290                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
2291     PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
2292                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
2293     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
2294                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
2295     if (!hasArchitectedFlatScratch())
2296       PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
2297                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
2298     PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
2299                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
2300 
2301     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
2302       return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
2303                                        amdhsa::KERNEL_CODE_PROPERTIES_OFFSET);
2304 
2305     // Reserved for GFX9
2306     if (isGFX9() &&
2307         (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
2308       return createReservedKDBitsError(
2309           KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
2310           amdhsa::KERNEL_CODE_PROPERTIES_OFFSET, "must be zero on gfx9");
2311     }
2312     if (isGFX10Plus()) {
2313       PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
2314                       KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2315     }
2316 
2317     if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
2318       PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
2319                       KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
2320 
2321     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1) {
2322       return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED1,
2323                                        amdhsa::KERNEL_CODE_PROPERTIES_OFFSET);
2324     }
2325 
2326     return true;
2327 
2328   case amdhsa::KERNARG_PRELOAD_OFFSET:
2329     using namespace amdhsa;
2330     TwoByteBuffer = DE.getU16(Cursor);
2331     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) {
2332       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length",
2333                       KERNARG_PRELOAD_SPEC_LENGTH);
2334     }
2335 
2336     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) {
2337       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset",
2338                       KERNARG_PRELOAD_SPEC_OFFSET);
2339     }
2340     return true;
2341 
2342   case amdhsa::RESERVED3_OFFSET:
2343     // 4 bytes from here are reserved, must be 0.
2344     ReservedBytes = DE.getBytes(Cursor, 4);
2345     for (int I = 0; I < 4; ++I) {
2346       if (ReservedBytes[I] != 0)
2347         return createReservedKDBytesError(amdhsa::RESERVED3_OFFSET, 4);
2348     }
2349     return true;
2350 
2351   default:
2352     llvm_unreachable("Unhandled index. Case statements cover everything.");
2353     return true;
2354   }
2355 #undef PRINT_DIRECTIVE
2356 }
2357 
2358 Expected<bool> AMDGPUDisassembler::decodeKernelDescriptor(
2359     StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
2360 
2361   // CP microcode requires the kernel descriptor to be 64-byte aligned.
2362   if (Bytes.size() != 64 || KdAddress % 64 != 0)
2363     return createStringError(std::errc::invalid_argument,
2364                              "kernel descriptor must be 64-byte aligned");
2365 
2366   // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10
2367   // requires us to know the setting of .amdhsa_wavefront_size32 in order to
2368   // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong
2369   // order. Work around this by first looking up .amdhsa_wavefront_size32 here
2370   // when required.
2371   if (isGFX10Plus()) {
2372     uint16_t KernelCodeProperties =
2373         support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET],
2374                                 llvm::endianness::little);
2375     EnableWavefrontSize32 =
2376         AMDHSA_BITS_GET(KernelCodeProperties,
2377                         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2378   }
2379 
2380   std::string Kd;
2381   raw_string_ostream KdStream(Kd);
2382   KdStream << ".amdhsa_kernel " << KdName << '\n';
2383 
2384   DataExtractor::Cursor C(0);
2385   while (C && C.tell() < Bytes.size()) {
2386     Expected<bool> Res = decodeKernelDescriptorDirective(C, Bytes, KdStream);
2387 
2388     cantFail(C.takeError());
2389 
2390     if (!Res)
2391       return Res;
2392   }
2393   KdStream << ".end_amdhsa_kernel\n";
2394   outs() << KdStream.str();
2395   return true;
2396 }
2397 
2398 Expected<bool> AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
2399                                                  uint64_t &Size,
2400                                                  ArrayRef<uint8_t> Bytes,
2401                                                  uint64_t Address) const {
2402   // Right now only the kernel descriptor needs to be handled.
2403   // We ignore all other symbols for target-specific handling.
2404   // TODO:
2405   // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
2406   // Object V2 and V3 when symbols are marked protected.
2407 
2408   // amd_kernel_code_t for Code Object V2.
2409   if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
2410     Size = 256;
2411     return createStringError(std::errc::invalid_argument,
2412                              "code object v2 is not supported");
2413   }
2414 
2415   // Code Object V3 kernel descriptors.
2416   StringRef Name = Symbol.Name;
2417   if (Symbol.Type == ELF::STT_OBJECT && Name.ends_with(StringRef(".kd"))) {
2418     Size = 64; // Size = 64 regardless of success or failure.
2419     return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
2420   }
2421 
2422   return false;
2423 }
2424 
2425 const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id,
2426                                                            int64_t Val) {
2427   MCContext &Ctx = getContext();
2428   MCSymbol *Sym = Ctx.getOrCreateSymbol(Id);
2429   // Note: only set the value to Val on a new symbol, in case a disassembler
2430   // has already been initialized in this context.
2431   if (!Sym->isVariable()) {
2432     Sym->setVariableValue(MCConstantExpr::create(Val, Ctx));
2433   } else {
2434     int64_t Res = ~Val;
2435     bool Valid = Sym->getVariableValue()->evaluateAsAbsolute(Res);
2436     if (!Valid || Res != Val)
2437       Ctx.reportWarning(SMLoc(), "unsupported redefinition of " + Id);
2438   }
2439   return MCSymbolRefExpr::create(Sym, Ctx);
2440 }
2441 
2442 //===----------------------------------------------------------------------===//
2443 // AMDGPUSymbolizer
2444 //===----------------------------------------------------------------------===//
2445 
2446 // Try to find symbol name for specified label
2447 bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
2448     MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
2449     uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
2450     uint64_t /*OpSize*/, uint64_t /*InstSize*/) {
2451 
2452   if (!IsBranch) {
2453     return false;
2454   }
2455 
2456   auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
2457   if (!Symbols)
2458     return false;
2459 
2460   auto Result = llvm::find_if(*Symbols, [Value](const SymbolInfoTy &Val) {
2461     return Val.Addr == static_cast<uint64_t>(Value) &&
2462            Val.Type == ELF::STT_NOTYPE;
2463   });
2464   if (Result != Symbols->end()) {
2465     auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
2466     const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
2467     Inst.addOperand(MCOperand::createExpr(Add));
2468     return true;
2469   }
2470   // Add to list of referenced addresses, so caller can synthesize a label.
2471   ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
2472   return false;
2473 }
2474 
2475 void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
2476                                                        int64_t Value,
2477                                                        uint64_t Address) {
2478   llvm_unreachable("unimplemented");
2479 }
2480 
2481 //===----------------------------------------------------------------------===//
2482 // Initialization
2483 //===----------------------------------------------------------------------===//
2484 
2485 static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
2486                               LLVMOpInfoCallback /*GetOpInfo*/,
2487                               LLVMSymbolLookupCallback /*SymbolLookUp*/,
2488                               void *DisInfo,
2489                               MCContext *Ctx,
2490                               std::unique_ptr<MCRelocationInfo> &&RelInfo) {
2491   return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
2492 }
2493 
2494 static MCDisassembler *createAMDGPUDisassembler(const Target &T,
2495                                                 const MCSubtargetInfo &STI,
2496                                                 MCContext &Ctx) {
2497   return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
2498 }
2499 
2500 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
2501   TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
2502                                          createAMDGPUDisassembler);
2503   TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
2504                                        createAMDGPUSymbolizer);
2505 }
2506