1 //===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
10 //
11 /// \file
12 ///
/// This file contains the definition of the AMDGPU ISA disassembler.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 // ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
18 
19 #include "Disassembler/AMDGPUDisassembler.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIDefines.h"
22 #include "SIRegisterInfo.h"
23 #include "TargetInfo/AMDGPUTargetInfo.h"
24 #include "Utils/AMDGPUAsmUtils.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm-c/DisassemblerTypes.h"
27 #include "llvm/BinaryFormat/ELF.h"
28 #include "llvm/MC/MCAsmInfo.h"
29 #include "llvm/MC/MCContext.h"
30 #include "llvm/MC/MCDecoderOps.h"
31 #include "llvm/MC/MCExpr.h"
32 #include "llvm/MC/MCInstrDesc.h"
33 #include "llvm/MC/MCRegisterInfo.h"
34 #include "llvm/MC/MCSubtargetInfo.h"
35 #include "llvm/MC/TargetRegistry.h"
36 #include "llvm/Support/AMDHSAKernelDescriptor.h"
37 
38 using namespace llvm;
39 
40 #define DEBUG_TYPE "amdgpu-disassembler"
41 
42 #define SGPR_MAX                                                               \
43   (isGFX10Plus() ? AMDGPU::EncValues::SGPR_MAX_GFX10                           \
44                  : AMDGPU::EncValues::SGPR_MAX_SI)
45 
46 using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
47 
48 static const MCSubtargetInfo &addDefaultWaveSize(const MCSubtargetInfo &STI,
49                                                  MCContext &Ctx) {
50   if (!STI.hasFeature(AMDGPU::FeatureWavefrontSize64) &&
51       !STI.hasFeature(AMDGPU::FeatureWavefrontSize32)) {
52     MCSubtargetInfo &STICopy = Ctx.getSubtargetCopy(STI);
    // If there is no default wave size, this must be a generation before
    // gfx10: those subtargets already have FeatureWavefrontSize64 set in their
    // definition. For gfx10+, default to wave32.
56     STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize32);
57     return STICopy;
58   }
59 
60   return STI;
61 }
62 
63 AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
64                                        MCContext &Ctx, MCInstrInfo const *MCII)
65     : MCDisassembler(addDefaultWaveSize(STI, Ctx), Ctx), MCII(MCII),
66       MRI(*Ctx.getRegisterInfo()), MAI(*Ctx.getAsmInfo()),
67       TargetMaxInstBytes(MAI.getMaxInstLength(&STI)),
68       CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {
69   // ToDo: AMDGPUDisassembler supports only VI ISA.
70   if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
71     report_fatal_error("Disassembly not yet supported for subtarget");
72 
73   for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
74     createConstantSymbolExpr(Symbol, Code);
75 
76   UCVersionW64Expr = createConstantSymbolExpr("UC_VERSION_W64_BIT", 0x2000);
77   UCVersionW32Expr = createConstantSymbolExpr("UC_VERSION_W32_BIT", 0x4000);
78   UCVersionMDPExpr = createConstantSymbolExpr("UC_VERSION_MDP_BIT", 0x8000);
79 }
80 
81 void AMDGPUDisassembler::setABIVersion(unsigned Version) {
82   CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(Version);
83 }
84 
85 inline static MCDisassembler::DecodeStatus
86 addOperand(MCInst &Inst, const MCOperand& Opnd) {
87   Inst.addOperand(Opnd);
88   return Opnd.isValid() ?
89     MCDisassembler::Success :
90     MCDisassembler::Fail;
91 }
92 
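// Insert Op at the position of the operand named NameIdx, provided the opcode
// of MI has such a named operand; otherwise MI is left unchanged. Returns the
// operand index, or -1 if the opcode has no operand with that name.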
93 static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
94                                 uint16_t NameIdx) {
95   int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
96   if (OpIdx != -1) {
97     auto *I = MI.begin();
98     std::advance(I, OpIdx);
99     MI.insert(I, Op);
100   }
101   return OpIdx;
102 }
103 
104 static DecodeStatus decodeSOPPBrTarget(MCInst &Inst, unsigned Imm,
105                                        uint64_t Addr,
106                                        const MCDisassembler *Decoder) {
107   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
108 
109   // Our branches take a simm16.
110   int64_t Offset = SignExtend64<16>(Imm) * 4 + 4 + Addr;
111 
112   if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0))
113     return MCDisassembler::Success;
114   return addOperand(Inst, MCOperand::createImm(Imm));
115 }
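
// For illustration: decodeSOPPBrTarget computes the branch target as
// Addr + 4 + simm16 * 4. A branch encoded at address 0x100 with Imm = 0xFFFF
// (simm16 = -1) therefore resolves to 0x100 + 4 - 4 = 0x100, i.e. a branch to
// its own address.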
116 
117 static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
118                                      const MCDisassembler *Decoder) {
119   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
120   int64_t Offset;
121   if (DAsm->isGFX12Plus()) { // GFX12 supports 24-bit signed offsets.
122     Offset = SignExtend64<24>(Imm);
123   } else if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
124     Offset = Imm & 0xFFFFF;
125   } else { // GFX9+ supports 21-bit signed offsets.
126     Offset = SignExtend64<21>(Imm);
127   }
128   return addOperand(Inst, MCOperand::createImm(Offset));
129 }
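
// For example, an encoded offset of 0x1FFFFF decodes to -1 on GFX9 (21-bit
// signed) but to 1048575 on VI (20-bit unsigned), so the same bit pattern can
// disassemble differently across subtargets.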
130 
131 static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
132                                   const MCDisassembler *Decoder) {
133   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
134   return addOperand(Inst, DAsm->decodeBoolReg(Val));
135 }
136 
137 static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
138                                        uint64_t Addr,
139                                        const MCDisassembler *Decoder) {
140   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
141   return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
142 }
143 
144 static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr,
145                                  const MCDisassembler *Decoder) {
146   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
147   return addOperand(Inst, DAsm->decodeDpp8FI(Val));
148 }
149 
150 #define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
151   static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
152                                         uint64_t /*Addr*/,                     \
153                                         const MCDisassembler *Decoder) {       \
154     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
155     return addOperand(Inst, DAsm->DecoderName(Imm));                           \
156   }
157 
// Decoder for registers that are decoded directly using a RegClassID. The
// 8-bit Imm is the register number. Used by VGPR-only and AGPR-only operands.
160 #define DECODE_OPERAND_REG_8(RegClass)                                         \
161   static DecodeStatus Decode##RegClass##RegisterClass(                         \
162       MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,                           \
163       const MCDisassembler *Decoder) {                                         \
164     assert(Imm < (1 << 8) && "8-bit encoding");                                \
165     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
166     return addOperand(                                                         \
167         Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm));      \
168   }
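
// As a sketch of the expansion: DECODE_OPERAND_REG_8(VGPR_32) defines
//   static DecodeStatus DecodeVGPR_32RegisterClass(MCInst &, unsigned Imm, ...)
// which asserts Imm < 256 and adds the register operand built from
// AMDGPU::VGPR_32RegClassID and the encoded register number Imm.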
169 
170 #define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral,         \
171                      ImmWidth)                                                 \
172   static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,      \
173                            const MCDisassembler *Decoder) {                    \
174     assert(Imm < (1 << EncSize) && #EncSize "-bit encoding");                  \
175     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
176     return addOperand(Inst,                                                    \
177                       DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm,   \
178                                         MandatoryLiteral, ImmWidth));          \
179   }
180 
181 static DecodeStatus decodeSrcOp(MCInst &Inst, unsigned EncSize,
182                                 AMDGPUDisassembler::OpWidthTy OpWidth,
183                                 unsigned Imm, unsigned EncImm,
184                                 bool MandatoryLiteral, unsigned ImmWidth,
185                                 AMDGPU::OperandSemantics Sema,
186                                 const MCDisassembler *Decoder) {
187   assert(Imm < (1U << EncSize) && "Operand doesn't fit encoding!");
188   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
189   return addOperand(Inst, DAsm->decodeSrcOp(OpWidth, EncImm, MandatoryLiteral,
190                                             ImmWidth, Sema));
191 }
192 
// Decoder for registers. The 7-bit Imm is the register number; decodeSrcOp is
// used to get the register class. Used by SGPR-only operands.
195 #define DECODE_OPERAND_REG_7(RegClass, OpWidth)                                \
196   DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0)
197 
// Decoder for registers. Imm is 10 bits: Imm{7-0} is the register number,
// Imm{9} is acc (AGPR or VGPR), and Imm{8} should be 0 (see VOP3Pe_SMFMAC).
// Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp.
// Used by AV_ register classes (AGPR-only or VGPR-only register operands).
202 template <AMDGPUDisassembler::OpWidthTy OpWidth>
203 static DecodeStatus decodeAV10(MCInst &Inst, unsigned Imm, uint64_t /* Addr */,
204                                const MCDisassembler *Decoder) {
205   return decodeSrcOp(Inst, 10, OpWidth, Imm, Imm | AMDGPU::EncValues::IS_VGPR,
206                      false, 0, AMDGPU::OperandSemantics::INT, Decoder);
207 }
208 
209 // Decoder for Src(9-bit encoding) registers only.
210 template <AMDGPUDisassembler::OpWidthTy OpWidth>
211 static DecodeStatus decodeSrcReg9(MCInst &Inst, unsigned Imm,
212                                   uint64_t /* Addr */,
213                                   const MCDisassembler *Decoder) {
214   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, false, 0,
215                      AMDGPU::OperandSemantics::INT, Decoder);
216 }
217 
// Decoder for 9-bit-encoded AGPR Src operands: the register number is encoded
// in 9 bits; set Imm{9} (the acc bit) and decode using 'enum10' from
// decodeSrcOp. Registers only.
221 template <AMDGPUDisassembler::OpWidthTy OpWidth>
222 static DecodeStatus decodeSrcA9(MCInst &Inst, unsigned Imm, uint64_t /* Addr */,
223                                 const MCDisassembler *Decoder) {
224   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm | 512, false, 0,
225                      AMDGPU::OperandSemantics::INT, Decoder);
226 }
227 
// Decoder for 'enum10' from decodeSrcOp: Imm{8-0} is the 9-bit Src encoding
// and Imm{9} is acc. Registers only.
230 template <AMDGPUDisassembler::OpWidthTy OpWidth>
231 static DecodeStatus decodeSrcAV10(MCInst &Inst, unsigned Imm,
232                                   uint64_t /* Addr */,
233                                   const MCDisassembler *Decoder) {
234   return decodeSrcOp(Inst, 10, OpWidth, Imm, Imm, false, 0,
235                      AMDGPU::OperandSemantics::INT, Decoder);
236 }
237 
// Decoder for RegisterOperands using the 9-bit Src encoding. The operand can
// be a register from RegClass or an immediate. Registers that don't belong to
// RegClass are still decoded, and the InstPrinter will report a warning. An
// immediate is decoded into a constant of size ImmWidth, which should match
// the width of the immediate used by the OperandType (important for
// floating-point types).
243 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
244           unsigned OperandSemantics>
245 static DecodeStatus decodeSrcRegOrImm9(MCInst &Inst, unsigned Imm,
246                                        uint64_t /* Addr */,
247                                        const MCDisassembler *Decoder) {
248   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, false, ImmWidth,
249                      (AMDGPU::OperandSemantics)OperandSemantics, Decoder);
250 }
251 
252 // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
253 // and decode using 'enum10' from decodeSrcOp.
254 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
255           unsigned OperandSemantics>
256 static DecodeStatus decodeSrcRegOrImmA9(MCInst &Inst, unsigned Imm,
257                                         uint64_t /* Addr */,
258                                         const MCDisassembler *Decoder) {
259   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm | 512, false, ImmWidth,
260                      (AMDGPU::OperandSemantics)OperandSemantics, Decoder);
261 }
262 
263 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
264           unsigned OperandSemantics>
265 static DecodeStatus decodeSrcRegOrImmDeferred9(MCInst &Inst, unsigned Imm,
266                                                uint64_t /* Addr */,
267                                                const MCDisassembler *Decoder) {
268   return decodeSrcOp(Inst, 9, OpWidth, Imm, Imm, true, ImmWidth,
269                      (AMDGPU::OperandSemantics)OperandSemantics, Decoder);
270 }
271 
272 // Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
273 // when RegisterClass is used as an operand. Most often used for destination
274 // operands.
275 
276 DECODE_OPERAND_REG_8(VGPR_32)
277 DECODE_OPERAND_REG_8(VGPR_32_Lo128)
278 DECODE_OPERAND_REG_8(VReg_64)
279 DECODE_OPERAND_REG_8(VReg_96)
280 DECODE_OPERAND_REG_8(VReg_128)
281 DECODE_OPERAND_REG_8(VReg_256)
282 DECODE_OPERAND_REG_8(VReg_288)
283 DECODE_OPERAND_REG_8(VReg_352)
284 DECODE_OPERAND_REG_8(VReg_384)
285 DECODE_OPERAND_REG_8(VReg_512)
286 DECODE_OPERAND_REG_8(VReg_1024)
287 
288 DECODE_OPERAND_REG_7(SReg_32, OPW32)
289 DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
290 DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
291 DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
292 DECODE_OPERAND_REG_7(SReg_64, OPW64)
293 DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
294 DECODE_OPERAND_REG_7(SReg_64_XEXEC_XNULL, OPW64)
295 DECODE_OPERAND_REG_7(SReg_96, OPW96)
296 DECODE_OPERAND_REG_7(SReg_128, OPW128)
297 DECODE_OPERAND_REG_7(SReg_256, OPW256)
298 DECODE_OPERAND_REG_7(SReg_512, OPW512)
299 
300 DECODE_OPERAND_REG_8(AGPR_32)
301 DECODE_OPERAND_REG_8(AReg_64)
302 DECODE_OPERAND_REG_8(AReg_128)
303 DECODE_OPERAND_REG_8(AReg_256)
304 DECODE_OPERAND_REG_8(AReg_512)
305 DECODE_OPERAND_REG_8(AReg_1024)
306 
307 static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm,
308                                                uint64_t /*Addr*/,
309                                                const MCDisassembler *Decoder) {
310   assert(isUInt<10>(Imm) && "10-bit encoding expected");
311   assert((Imm & (1 << 8)) == 0 && "Imm{8} should not be used");
312 
313   bool IsHi = Imm & (1 << 9);
314   unsigned RegIdx = Imm & 0xff;
315   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
316   return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
317 }
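
// Example: for DecodeVGPR_16RegisterClass, Imm = 0x205 has bit 9 set and
// RegIdx = 5, so it decodes to the high 16-bit half of VGPR 5.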
318 
319 static DecodeStatus
320 DecodeVGPR_16_Lo128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
321                                  const MCDisassembler *Decoder) {
322   assert(isUInt<8>(Imm) && "8-bit encoding expected");
323 
324   bool IsHi = Imm & (1 << 7);
325   unsigned RegIdx = Imm & 0x7f;
326   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
327   return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
328 }
329 
330 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
331           unsigned OperandSemantics>
332 static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
333                                                 uint64_t /*Addr*/,
334                                                 const MCDisassembler *Decoder) {
335   assert(isUInt<9>(Imm) && "9-bit encoding expected");
336 
337   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
338   if (Imm & AMDGPU::EncValues::IS_VGPR) {
339     bool IsHi = Imm & (1 << 7);
340     unsigned RegIdx = Imm & 0x7f;
341     return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
342   }
343   return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
344                               OpWidth, Imm & 0xFF, false, ImmWidth,
345                               (AMDGPU::OperandSemantics)OperandSemantics));
346 }
347 
348 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
349           unsigned OperandSemantics>
350 static DecodeStatus
351 decodeOperand_VSrcT16_Lo128_Deferred(MCInst &Inst, unsigned Imm,
352                                      uint64_t /*Addr*/,
353                                      const MCDisassembler *Decoder) {
354   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
355   assert(isUInt<9>(Imm) && "9-bit encoding expected");
356 
357   if (Imm & AMDGPU::EncValues::IS_VGPR) {
358     bool IsHi = Imm & (1 << 7);
359     unsigned RegIdx = Imm & 0x7f;
360     return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
361   }
362   return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
363                               OpWidth, Imm & 0xFF, true, ImmWidth,
364                               (AMDGPU::OperandSemantics)OperandSemantics));
365 }
366 
367 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
368           unsigned OperandSemantics>
369 static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
370                                           uint64_t /*Addr*/,
371                                           const MCDisassembler *Decoder) {
372   assert(isUInt<10>(Imm) && "10-bit encoding expected");
373 
374   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
375   if (Imm & AMDGPU::EncValues::IS_VGPR) {
376     bool IsHi = Imm & (1 << 9);
377     unsigned RegIdx = Imm & 0xff;
378     return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
379   }
380   return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
381                               OpWidth, Imm & 0xFF, false, ImmWidth,
382                               (AMDGPU::OperandSemantics)OperandSemantics));
383 }
384 
385 static DecodeStatus decodeOperand_VGPR_16(MCInst &Inst, unsigned Imm,
386                                           uint64_t /*Addr*/,
387                                           const MCDisassembler *Decoder) {
388   assert(isUInt<10>(Imm) && "10-bit encoding expected");
389   assert(Imm & AMDGPU::EncValues::IS_VGPR && "VGPR expected");
390 
391   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
392 
393   bool IsHi = Imm & (1 << 9);
394   unsigned RegIdx = Imm & 0xff;
395   return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
396 }
397 
398 static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
399                                          uint64_t Addr,
400                                          const MCDisassembler *Decoder) {
401   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
402   return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
403 }
404 
405 static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
406                                           uint64_t Addr, const void *Decoder) {
407   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
408   return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val));
409 }
410 
411 static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
412                           const MCRegisterInfo *MRI) {
413   if (OpIdx < 0)
414     return false;
415 
416   const MCOperand &Op = Inst.getOperand(OpIdx);
417   if (!Op.isReg())
418     return false;
419 
420   MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
421   auto Reg = Sub ? Sub : Op.getReg();
422   return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
423 }
424 
425 static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm,
426                                  AMDGPUDisassembler::OpWidthTy Opw,
427                                  const MCDisassembler *Decoder) {
428   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
429   if (!DAsm->isGFX90A()) {
430     Imm &= 511;
431   } else {
    // If an atomic has both vdata and vdst, their register classes are tied.
    // The ACC bit is decoded along with vdst, the first operand, so we need to
    // change the register class to AGPR if vdst was an AGPR.
    // If a DS instruction has both data0 and data1, their register classes are
    // also tied.
437     unsigned Opc = Inst.getOpcode();
438     uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
439     uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
440                                                         : AMDGPU::OpName::vdata;
441     const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
442     int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
443     if ((int)Inst.getNumOperands() == DataIdx) {
444       int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
445       if (IsAGPROperand(Inst, DstIdx, MRI))
446         Imm |= 512;
447     }
448 
449     if (TSFlags & SIInstrFlags::DS) {
450       int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
451       if ((int)Inst.getNumOperands() == Data2Idx &&
452           IsAGPROperand(Inst, DataIdx, MRI))
453         Imm |= 512;
454     }
455   }
456   return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
457 }
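
// Note: the raw 9-bit register field is biased by 256 here because decodeSrcOp
// places VGPRs at values 256..511; on GFX90A the logic above additionally sets
// bit 9 (|= 512) to select the AGPR bank when the tied vdst/data operand was
// an AGPR.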
458 
459 template <AMDGPUDisassembler::OpWidthTy Opw>
460 static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm,
461                                  uint64_t /* Addr */,
462                                  const MCDisassembler *Decoder) {
463   return decodeAVLdSt(Inst, Imm, Opw, Decoder);
464 }
465 
466 static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm,
467                                            uint64_t Addr,
468                                            const MCDisassembler *Decoder) {
469   assert(Imm < (1 << 9) && "9-bit encoding");
470   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
471   return addOperand(Inst,
472                     DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64,
473                                       AMDGPU::OperandSemantics::FP64));
474 }
475 
476 #define DECODE_SDWA(DecName) \
477 DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
478 
479 DECODE_SDWA(Src32)
480 DECODE_SDWA(Src16)
481 DECODE_SDWA(VopcDst)
482 
483 static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
484                                      uint64_t /* Addr */,
485                                      const MCDisassembler *Decoder) {
486   const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
487   return addOperand(Inst, DAsm->decodeVersionImm(Imm));
488 }
489 
490 #include "AMDGPUGenDisassemblerTables.inc"
491 
492 //===----------------------------------------------------------------------===//
493 //
494 //===----------------------------------------------------------------------===//
495 
496 template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
497   assert(Bytes.size() >= sizeof(T));
498   const auto Res =
499       support::endian::read<T, llvm::endianness::little>(Bytes.data());
500   Bytes = Bytes.slice(sizeof(T));
501   return Res;
502 }
503 
504 static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
505   assert(Bytes.size() >= 12);
506   uint64_t Lo =
507       support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
508   Bytes = Bytes.slice(8);
509   uint64_t Hi =
510       support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
511   Bytes = Bytes.slice(4);
512   return DecoderUInt128(Lo, Hi);
513 }
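
// eat12Bytes assembles a 96-bit encoding: the first 8 bytes form the low half
// and the next 4 bytes the (zero-extended) high half of the DecoderUInt128.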
514 
515 static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
516   assert(Bytes.size() >= 16);
517   uint64_t Lo =
518       support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
519   Bytes = Bytes.slice(8);
520   uint64_t Hi =
521       support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
522   Bytes = Bytes.slice(8);
523   return DecoderUInt128(Lo, Hi);
524 }
525 
526 DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
527                                                 ArrayRef<uint8_t> Bytes_,
528                                                 uint64_t Address,
529                                                 raw_ostream &CS) const {
530   unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
531   Bytes = Bytes_.slice(0, MaxInstBytesNum);
532 
533   // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless
534   // there are fewer bytes left). This will be overridden on success.
535   Size = std::min((size_t)4, Bytes_.size());
536 
537   do {
538     // ToDo: better to switch encoding length using some bit predicate
539     // but it is unknown yet, so try all we can
540 
541     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
542     // encodings
    if (isGFX11Plus() && Bytes.size() >= 12) {
544       DecoderUInt128 DecW = eat12Bytes(Bytes);
545 
546       if (isGFX11() &&
547           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
548                         DecW, Address, CS))
549         break;
550 
551       if (isGFX12() &&
552           tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
553                         DecW, Address, CS))
554         break;
555 
556       if (isGFX12() &&
557           tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
558         break;
559 
560       // Reinitialize Bytes
561       Bytes = Bytes_.slice(0, MaxInstBytesNum);
562 
563     } else if (Bytes.size() >= 16 &&
564                STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
565       DecoderUInt128 DecW = eat16Bytes(Bytes);
566       if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
567         break;
568 
569       // Reinitialize Bytes
570       Bytes = Bytes_.slice(0, MaxInstBytesNum);
571     }
572 
573     if (Bytes.size() >= 8) {
574       const uint64_t QW = eatBytes<uint64_t>(Bytes);
575 
576       if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
577           tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS))
578         break;
579 
580       if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) &&
581           tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS))
582         break;
583 
584       // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
585       // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
586       // table first so we print the correct name.
587       if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) &&
588           tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS))
589         break;
590 
591       if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) &&
592           tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS))
593         break;
594 
595       if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
596           tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
597         break;
598 
599       if ((isVI() || isGFX9()) &&
600           tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
601         break;
602 
603       if (isGFX9() && tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
604         break;
605 
606       if (isGFX10() && tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
607         break;
608 
609       if (isGFX12() &&
610           tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
611                         Address, CS))
612         break;
613 
614       if (isGFX11() &&
615           tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
616                         Address, CS))
617         break;
618 
619       if (isGFX11() &&
620           tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
621         break;
622 
623       if (isGFX12() &&
624           tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
625         break;
626 
627       // Reinitialize Bytes
628       Bytes = Bytes_.slice(0, MaxInstBytesNum);
629     }
630 
    // Try to decode a 32-bit instruction.
632     if (Bytes.size() >= 4) {
633       const uint32_t DW = eatBytes<uint32_t>(Bytes);
634 
635       if ((isVI() || isGFX9()) &&
636           tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
637         break;
638 
639       if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
640         break;
641 
642       if (isGFX9() && tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
643         break;
644 
645       if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
646           tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS))
647         break;
648 
649       if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) &&
650           tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
651         break;
652 
653       if (isGFX10() && tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
654         break;
655 
656       if (isGFX11() &&
657           tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
658                         Address, CS))
659         break;
660 
661       if (isGFX12() &&
662           tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
663                         Address, CS))
664         break;
665     }
666 
667     return MCDisassembler::Fail;
668   } while (false);
669 
670   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) {
671     if (isMacDPP(MI))
672       convertMacDPPInst(MI);
673 
674     if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
675       convertVOP3PDPPInst(MI);
676     else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) ||
677              AMDGPU::isVOPC64DPP(MI.getOpcode()))
678       convertVOPCDPPInst(MI); // Special VOP3 case
679     else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) !=
680              -1)
681       convertDPP8Inst(MI);
682     else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3)
683       convertVOP3DPPInst(MI); // Regular VOP3 case
684   }
685 
686   convertTrue16OpSel(MI);
687 
688   if (AMDGPU::isMAC(MI.getOpcode())) {
689     // Insert dummy unused src2_modifiers.
690     insertNamedMCOperand(MI, MCOperand::createImm(0),
691                          AMDGPU::OpName::src2_modifiers);
692   }
693 
694   if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
695       MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) {
696     // Insert dummy unused src2_modifiers.
697     insertNamedMCOperand(MI, MCOperand::createImm(0),
698                          AMDGPU::OpName::src2_modifiers);
699   }
700 
701   if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
702       !AMDGPU::hasGDS(STI)) {
703     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
704   }
705 
706   if (MCII->get(MI.getOpcode()).TSFlags &
707       (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) {
708     int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
709                                              AMDGPU::OpName::cpol);
710     if (CPolPos != -1) {
711       unsigned CPol =
712           (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
713               AMDGPU::CPol::GLC : 0;
714       if (MI.getNumOperands() <= (unsigned)CPolPos) {
715         insertNamedMCOperand(MI, MCOperand::createImm(CPol),
716                              AMDGPU::OpName::cpol);
717       } else if (CPol) {
718         MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
719       }
720     }
721   }
722 
723   if ((MCII->get(MI.getOpcode()).TSFlags &
724        (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
725       (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
    // GFX90A lost TFE; its place is occupied by ACC.
727     int TFEOpIdx =
728         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
729     if (TFEOpIdx != -1) {
730       auto *TFEIter = MI.begin();
731       std::advance(TFEIter, TFEOpIdx);
732       MI.insert(TFEIter, MCOperand::createImm(0));
733     }
734   }
735 
736   if (MCII->get(MI.getOpcode()).TSFlags &
737       (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
738     int SWZOpIdx =
739         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
740     if (SWZOpIdx != -1) {
741       auto *SWZIter = MI.begin();
742       std::advance(SWZIter, SWZOpIdx);
743       MI.insert(SWZIter, MCOperand::createImm(0));
744     }
745   }
746 
747   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) {
748     int VAddr0Idx =
749         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
750     int RsrcIdx =
751         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
752     unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
753     if (VAddr0Idx >= 0 && NSAArgs > 0) {
754       unsigned NSAWords = (NSAArgs + 3) / 4;
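      // Each extra NSA address operand is encoded as one byte; e.g. 5 extra
      // operands occupy (5 + 3) / 4 = 2 trailing dwords, with the unused bytes
      // left as padding.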
755       if (Bytes.size() < 4 * NSAWords)
756         return MCDisassembler::Fail;
757       for (unsigned i = 0; i < NSAArgs; ++i) {
758         const unsigned VAddrIdx = VAddr0Idx + 1 + i;
759         auto VAddrRCID =
760             MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
761         MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i]));
762       }
763       Bytes = Bytes.slice(4 * NSAWords);
764     }
765 
766     convertMIMGInst(MI);
767   }
768 
769   if (MCII->get(MI.getOpcode()).TSFlags &
770       (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))
771     convertMIMGInst(MI);
772 
773   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)
774     convertEXPInst(MI);
775 
776   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)
777     convertVINTERPInst(MI);
778 
779   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)
780     convertSDWAInst(MI);
781 
782   if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsMAI)
783     convertMAIInst(MI);
784 
785   int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
786                                               AMDGPU::OpName::vdst_in);
787   if (VDstIn_Idx != -1) {
788     int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
789                            MCOI::OperandConstraint::TIED_TO);
790     if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
791          !MI.getOperand(VDstIn_Idx).isReg() ||
792          MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
793       if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
794         MI.erase(&MI.getOperand(VDstIn_Idx));
795       insertNamedMCOperand(MI,
796         MCOperand::createReg(MI.getOperand(Tied).getReg()),
797         AMDGPU::OpName::vdst_in);
798     }
799   }
800 
801   int ImmLitIdx =
802       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
803   bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
804   if (ImmLitIdx != -1 && !IsSOPK)
805     convertFMAanyK(MI, ImmLitIdx);
806 
807   Size = MaxInstBytesNum - Bytes.size();
808   return MCDisassembler::Success;
809 }
810 
811 void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
812   if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
813     // The MCInst still has these fields even though they are no longer encoded
814     // in the GFX11 instruction.
815     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
816     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
817   }
818 }
819 
820 void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
821   convertTrue16OpSel(MI);
822   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_t16_gfx11 ||
823       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_fake16_gfx11 ||
824       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_t16_gfx12 ||
825       MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_fake16_gfx12 ||
826       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_t16_gfx11 ||
827       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_fake16_gfx11 ||
828       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_t16_gfx12 ||
829       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_fake16_gfx12 ||
830       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_t16_gfx11 ||
831       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_fake16_gfx11 ||
832       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_t16_gfx12 ||
833       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_fake16_gfx12 ||
834       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_t16_gfx11 ||
835       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_fake16_gfx11 ||
836       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_t16_gfx12 ||
837       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_fake16_gfx12) {
838     // The MCInst has this field that is not directly encoded in the
839     // instruction.
840     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
841   }
842 }
843 
844 void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
845   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
846       STI.hasFeature(AMDGPU::FeatureGFX10)) {
847     if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
848       // VOPC - insert clamp
849       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
850   } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
851     int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
852     if (SDst != -1) {
853       // VOPC - insert VCC register as sdst
854       insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC),
855                            AMDGPU::OpName::sdst);
856     } else {
857       // VOP1/2 - insert omod if present in instruction
858       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
859     }
860   }
861 }
862 
863 /// Adjust the register values used by V_MFMA_F8F6F4_f8_f8 instructions to the
864 /// appropriate subregister for the used format width.
865 static void adjustMFMA_F8F6F4OpRegClass(const MCRegisterInfo &MRI,
866                                         MCOperand &MO, uint8_t NumRegs) {
867   switch (NumRegs) {
868   case 4:
869     return MO.setReg(MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3));
870   case 6:
871     return MO.setReg(
872         MRI.getSubReg(MO.getReg(), AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5));
873   case 8:
874     // No-op in cases where one operand is still f8/bf8.
875     return;
876   default:
877     llvm_unreachable("Unexpected size for mfma f8f6f4 operand");
878   }
879 }
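
// The 4- and 6-register cases narrow the operand to the subregisters actually
// read by the narrower formats (presumably fp4 and fp6/bf6 respectively);
// 8 registers means the operand kept its full fp8/bf8 width and needs no
// adjustment.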
880 
/// f8f6f4 instructions have different pseudos depending on the formats used.
/// In the disassembler table, we only have the variants with the largest
/// register classes, which assume an fp8/bf8 format for both operands. The
/// actual register class depends on the formats selected by the blgp and cbsz
/// operands. Adjust the register classes to match the formats used.
886 void AMDGPUDisassembler::convertMAIInst(MCInst &MI) const {
887   int BlgpIdx =
888       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::blgp);
889   if (BlgpIdx == -1)
890     return;
891 
892   int CbszIdx =
893       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::cbsz);
894 
895   unsigned CBSZ = MI.getOperand(CbszIdx).getImm();
896   unsigned BLGP = MI.getOperand(BlgpIdx).getImm();
897 
898   const AMDGPU::MFMA_F8F6F4_Info *AdjustedRegClassOpcode =
899       AMDGPU::getMFMA_F8F6F4_WithFormatArgs(CBSZ, BLGP, MI.getOpcode());
900   if (!AdjustedRegClassOpcode ||
901       AdjustedRegClassOpcode->Opcode == MI.getOpcode())
902     return;
903 
904   MI.setOpcode(AdjustedRegClassOpcode->Opcode);
905   int Src0Idx =
906       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
907   int Src1Idx =
908       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
909   adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src0Idx),
910                               AdjustedRegClassOpcode->NumRegsSrcA);
911   adjustMFMA_F8F6F4OpRegClass(MRI, MI.getOperand(Src1Idx),
912                               AdjustedRegClassOpcode->NumRegsSrcB);
913 }
914 
915 struct VOPModifiers {
916   unsigned OpSel = 0;
917   unsigned OpSelHi = 0;
918   unsigned NegLo = 0;
919   unsigned NegHi = 0;
920 };
921 
922 // Reconstruct values of VOP3/VOP3P operands such as op_sel.
923 // Note that these values do not affect disassembler output,
924 // so this is only necessary for consistency with src_modifiers.
925 static VOPModifiers collectVOPModifiers(const MCInst &MI,
926                                         bool IsVOP3P = false) {
927   VOPModifiers Modifiers;
928   unsigned Opc = MI.getOpcode();
929   const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
930                         AMDGPU::OpName::src1_modifiers,
931                         AMDGPU::OpName::src2_modifiers};
932   for (int J = 0; J < 3; ++J) {
933     int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
934     if (OpIdx == -1)
935       continue;
936 
937     unsigned Val = MI.getOperand(OpIdx).getImm();
938 
939     Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
940     if (IsVOP3P) {
941       Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
942       Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
943       Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
944     } else if (J == 0) {
945       Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
946     }
947   }
948 
949   return Modifiers;
950 }
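
// For example, in a VOP3P instruction whose src1_modifiers has OP_SEL_1 set,
// collectVOPModifiers sets bit 1 of OpSelHi; for non-VOP3P instructions the
// DST_OP_SEL bit of src0_modifiers lands in bit 3 of OpSel.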
951 
// Instructions decode the op_sel bits into the src_modifiers operands. Copy
// those bits into the src operands for true16 VGPRs.
954 void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const {
955   const unsigned Opc = MI.getOpcode();
956   const MCRegisterClass &ConversionRC =
957       MRI.getRegClass(AMDGPU::VGPR_16RegClassID);
958   constexpr std::array<std::tuple<int, int, unsigned>, 4> OpAndOpMods = {
959       {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers,
960         SISrcMods::OP_SEL_0},
961        {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers,
962         SISrcMods::OP_SEL_0},
963        {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers,
964         SISrcMods::OP_SEL_0},
965        {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
966         SISrcMods::DST_OP_SEL}}};
967   for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {
968     int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
969     int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName);
970     if (OpIdx == -1 || OpModsIdx == -1)
971       continue;
972     MCOperand &Op = MI.getOperand(OpIdx);
973     if (!Op.isReg())
974       continue;
975     if (!ConversionRC.contains(Op.getReg()))
976       continue;
977     unsigned OpEnc = MRI.getEncodingValue(Op.getReg());
978     const MCOperand &OpMods = MI.getOperand(OpModsIdx);
979     unsigned ModVal = OpMods.getImm();
980     if (ModVal & OpSelMask) { // isHi
981       unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK;
982       Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1));
983     }
984   }
985 }
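
// convertTrue16OpSel relies on the VGPR_16 class being laid out as interleaved
// lo/hi halves: when an operand's op_sel bit is set, index RegIdx * 2 + 1
// above rewrites a low-half operand to the high half of the same 32-bit VGPR.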
986 
987 // MAC opcodes have special old and src2 operands.
988 // src2 is tied to dst, while old is not tied (but assumed to be).
989 bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
990   constexpr int DST_IDX = 0;
991   auto Opcode = MI.getOpcode();
992   const auto &Desc = MCII->get(Opcode);
993   auto OldIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::old);
994 
995   if (OldIdx != -1 && Desc.getOperandConstraint(
996                           OldIdx, MCOI::OperandConstraint::TIED_TO) == -1) {
997     assert(AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2));
998     assert(Desc.getOperandConstraint(
999                AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2),
1000                MCOI::OperandConstraint::TIED_TO) == DST_IDX);
1001     (void)DST_IDX;
1002     return true;
1003   }
1004 
1005   return false;
1006 }
1007 
1008 // Create dummy old operand and insert dummy unused src2_modifiers
1009 void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
1010   assert(MI.getNumOperands() + 1 < MCII->get(MI.getOpcode()).getNumOperands());
1011   insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
1012   insertNamedMCOperand(MI, MCOperand::createImm(0),
1013                        AMDGPU::OpName::src2_modifiers);
1014 }
1015 
1016 void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
1017   unsigned Opc = MI.getOpcode();
1018 
1019   int VDstInIdx =
1020       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
1021   if (VDstInIdx != -1)
1022     insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
1023 
1024   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
1025   if (MI.getNumOperands() < DescNumOps &&
1026       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
1027     convertTrue16OpSel(MI);
1028     auto Mods = collectVOPModifiers(MI);
1029     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
1030                          AMDGPU::OpName::op_sel);
1031   } else {
1032     // Insert dummy unused src modifiers.
1033     if (MI.getNumOperands() < DescNumOps &&
1034         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
1035       insertNamedMCOperand(MI, MCOperand::createImm(0),
1036                            AMDGPU::OpName::src0_modifiers);
1037 
1038     if (MI.getNumOperands() < DescNumOps &&
1039         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
1040       insertNamedMCOperand(MI, MCOperand::createImm(0),
1041                            AMDGPU::OpName::src1_modifiers);
1042   }
1043 }
1044 
1045 void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
1046   convertTrue16OpSel(MI);
1047 
1048   int VDstInIdx =
1049       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
1050   if (VDstInIdx != -1)
1051     insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
1052 
1053   unsigned Opc = MI.getOpcode();
1054   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
1055   if (MI.getNumOperands() < DescNumOps &&
1056       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
1057     auto Mods = collectVOPModifiers(MI);
1058     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
1059                          AMDGPU::OpName::op_sel);
1060   }
1061 }
1062 
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show the address as if
// it had 1 dword, which may not actually be the case.
1066 void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
1067   auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
1068 
1069   int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1070                                            AMDGPU::OpName::vdst);
1071 
1072   int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1073                                             AMDGPU::OpName::vdata);
1074   int VAddr0Idx =
1075       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
1076   int RsrcOpName = (TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc
1077                                                   : AMDGPU::OpName::rsrc;
1078   int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName);
1079   int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1080                                             AMDGPU::OpName::dmask);
1081 
1082   int TFEIdx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1083                                             AMDGPU::OpName::tfe);
1084   int D16Idx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1085                                             AMDGPU::OpName::d16);
1086 
1087   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1088   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1089       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
1090 
1091   assert(VDataIdx != -1);
1092   if (BaseOpcode->BVH) {
1093     // Add A16 operand for intersect_ray instructions
1094     addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
1095     return;
1096   }
1097 
1098   bool IsAtomic = (VDstIdx != -1);
1099   bool IsGather4 = TSFlags & SIInstrFlags::Gather4;
1100   bool IsVSample = TSFlags & SIInstrFlags::VSAMPLE;
1101   bool IsNSA = false;
1102   bool IsPartialNSA = false;
1103   unsigned AddrSize = Info->VAddrDwords;
1104 
1105   if (isGFX10Plus()) {
1106     unsigned DimIdx =
1107         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
1108     int A16Idx =
1109         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
1110     const AMDGPU::MIMGDimInfo *Dim =
1111         AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
1112     const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());
1113 
1114     AddrSize =
1115         AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));
1116 
1117     // VSAMPLE insts that do not use vaddr3 behave the same as NSA forms.
1118     // VIMAGE insts other than BVH never use vaddr4.
1119     IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
1120             Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA ||
1121             Info->MIMGEncoding == AMDGPU::MIMGEncGfx12;
1122     if (!IsNSA) {
1123       if (!IsVSample && AddrSize > 12)
1124         AddrSize = 16;
1125     } else {
1126       if (AddrSize > Info->VAddrDwords) {
1127         if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
1128           // The NSA encoding does not contain enough operands for the
1129           // combination of base opcode / dimension. Should this be an error?
1130           return;
1131         }
1132         IsPartialNSA = true;
1133       }
1134     }
1135   }
1136 
1137   unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
1138   unsigned DstSize = IsGather4 ? 4 : std::max(llvm::popcount(DMask), 1);
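  // E.g. dmask = 0xb enables three channels, so DstSize is 3 dwords here (and
  // becomes 2 below when the result is packed D16, plus 1 if TFE is set).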
1139 
1140   bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
1141   if (D16 && AMDGPU::hasPackedD16(STI)) {
1142     DstSize = (DstSize + 1) / 2;
1143   }
1144 
1145   if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
1146     DstSize += 1;
1147 
1148   if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
1149     return;
1150 
1151   int NewOpcode =
1152       AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
1153   if (NewOpcode == -1)
1154     return;
1155 
1156   // Widen the register to the correct number of enabled channels.
1157   MCRegister NewVdata;
1158   if (DstSize != Info->VDataDwords) {
1159     auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass;
1160 
1161     // Get first subregister of VData
1162     MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg();
1163     MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
1164     Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
1165 
1166     NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
1167                                        &MRI.getRegClass(DataRCID));
1168     if (!NewVdata) {
1169       // It's possible to encode this such that the low register + enabled
1170       // components exceeds the register count.
1171       return;
1172     }
1173   }
1174 
1175   // If not using NSA on GFX10+, widen vaddr0 address register to correct size.
1176   // If using partial NSA on GFX11+ widen last address register.
1177   int VAddrSAIdx = IsPartialNSA ? (RsrcIdx - 1) : VAddr0Idx;
1178   MCRegister NewVAddrSA;
1179   if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) &&
1180       AddrSize != Info->VAddrDwords) {
1181     MCRegister VAddrSA = MI.getOperand(VAddrSAIdx).getReg();
1182     MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0);
1183     VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
1184 
1185     auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
1186     NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
1187                                         &MRI.getRegClass(AddrRCID));
1188     if (!NewVAddrSA)
1189       return;
1190   }
1191 
1192   MI.setOpcode(NewOpcode);
1193 
1194   if (NewVdata != AMDGPU::NoRegister) {
1195     MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
1196 
1197     if (IsAtomic) {
1198       // Atomic operations have an additional operand (a copy of data)
1199       MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
1200     }
1201   }
1202 
1203   if (NewVAddrSA) {
1204     MI.getOperand(VAddrSAIdx) = MCOperand::createReg(NewVAddrSA);
1205   } else if (IsNSA) {
1206     assert(AddrSize <= Info->VAddrDwords);
1207     MI.erase(MI.begin() + VAddr0Idx + AddrSize,
1208              MI.begin() + VAddr0Idx + Info->VAddrDwords);
1209   }
1210 }
1211 
// Opsel and neg bits are used both in src_modifiers and as standalone
// operands. The autogenerated decoder only adds them to src_modifiers, so
// manually add the bits to the standalone operands as well.
1215 void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
1216   unsigned Opc = MI.getOpcode();
1217   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
1218   auto Mods = collectVOPModifiers(MI, true);
1219 
1220   if (MI.getNumOperands() < DescNumOps &&
1221       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
1222     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
1223 
1224   if (MI.getNumOperands() < DescNumOps &&
1225       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel))
1226     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
1227                          AMDGPU::OpName::op_sel);
1228   if (MI.getNumOperands() < DescNumOps &&
1229       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel_hi))
1230     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
1231                          AMDGPU::OpName::op_sel_hi);
1232   if (MI.getNumOperands() < DescNumOps &&
1233       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_lo))
1234     insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
1235                          AMDGPU::OpName::neg_lo);
1236   if (MI.getNumOperands() < DescNumOps &&
1237       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
1238     insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
1239                          AMDGPU::OpName::neg_hi);
1240 }
1241 
1242 // Create dummy old operand and insert optional operands
1243 void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
1244   unsigned Opc = MI.getOpcode();
1245   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
1246 
1247   if (MI.getNumOperands() < DescNumOps &&
1248       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::old))
1249     insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
1250 
1251   if (MI.getNumOperands() < DescNumOps &&
1252       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
1253     insertNamedMCOperand(MI, MCOperand::createImm(0),
1254                          AMDGPU::OpName::src0_modifiers);
1255 
1256   if (MI.getNumOperands() < DescNumOps &&
1257       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
1258     insertNamedMCOperand(MI, MCOperand::createImm(0),
1259                          AMDGPU::OpName::src1_modifiers);
1260 }
1261 
1262 void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const {
1263   assert(HasLiteral && "Should have decoded a literal");
1264   const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
1265   unsigned DescNumOps = Desc.getNumOperands();
1266   insertNamedMCOperand(MI, MCOperand::createImm(Literal),
1267                        AMDGPU::OpName::immDeferred);
1268   assert(DescNumOps == MI.getNumOperands());
1269   for (unsigned I = 0; I < DescNumOps; ++I) {
1270     auto &Op = MI.getOperand(I);
1271     auto OpType = Desc.operands()[I].OperandType;
1272     bool IsDeferredOp = (OpType == AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED ||
1273                          OpType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED);
1274     if (Op.isImm() && Op.getImm() == AMDGPU::EncValues::LITERAL_CONST &&
1275         IsDeferredOp)
1276       Op.setImm(Literal);
1277   }
1278 }
1279 
1280 const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
1281   return getContext().getRegisterInfo()->
1282     getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
1283 }
1284 
1285 inline
1286 MCOperand AMDGPUDisassembler::errOperand(unsigned V,
1287                                          const Twine& ErrMsg) const {
1288   *CommentStream << "Error: " + ErrMsg;
1289 
1290   // ToDo: add support for error operands to MCInst.h
1291   // return MCOperand::createError(V);
1292   return MCOperand();
1293 }
1294 
1295 inline
1296 MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
1297   return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
1298 }
1299 
1300 inline
1301 MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
1302                                                unsigned Val) const {
1303   const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
1304   if (Val >= RegCl.getNumRegs())
1305     return errOperand(Val, Twine(getRegClassName(RegClassID)) +
1306                            ": unknown register " + Twine(Val));
1307   return createRegOperand(RegCl.getRegister(Val));
1308 }
1309 
1310 inline
1311 MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
1312                                                 unsigned Val) const {
1313   // ToDo: SI/CI have 104 SGPRs, VI has 102.
1314   // Valery: here we accept as much as we can and let the assembler sort it out.
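  // Illustration of the shift below: for 64-bit scalar classes the encoded Val
  // counts 32-bit registers, so Val = 4 selects index 2 of the 64-bit class
  // (i.e. s[4:5]); an odd Val would trip the alignment warning further down.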
1315   int shift = 0;
1316   switch (SRegClassID) {
1317   case AMDGPU::SGPR_32RegClassID:
1318   case AMDGPU::TTMP_32RegClassID:
1319     break;
1320   case AMDGPU::SGPR_64RegClassID:
1321   case AMDGPU::TTMP_64RegClassID:
1322     shift = 1;
1323     break;
1324   case AMDGPU::SGPR_96RegClassID:
1325   case AMDGPU::TTMP_96RegClassID:
1326   case AMDGPU::SGPR_128RegClassID:
1327   case AMDGPU::TTMP_128RegClassID:
1328   // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
1329   // this bundle?
1330   case AMDGPU::SGPR_256RegClassID:
1331   case AMDGPU::TTMP_256RegClassID:
1332   // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
1333   // this bundle?
1334   case AMDGPU::SGPR_288RegClassID:
1335   case AMDGPU::TTMP_288RegClassID:
1336   case AMDGPU::SGPR_320RegClassID:
1337   case AMDGPU::TTMP_320RegClassID:
1338   case AMDGPU::SGPR_352RegClassID:
1339   case AMDGPU::TTMP_352RegClassID:
1340   case AMDGPU::SGPR_384RegClassID:
1341   case AMDGPU::TTMP_384RegClassID:
1342   case AMDGPU::SGPR_512RegClassID:
1343   case AMDGPU::TTMP_512RegClassID:
1344     shift = 2;
1345     break;
1346   // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
1347   // this bundle?
1348   default:
1349     llvm_unreachable("unhandled register class");
1350   }
1351 
1352   if (Val % (1 << shift)) {
1353     *CommentStream << "Warning: " << getRegClassName(SRegClassID)
1354                    << ": scalar reg isn't aligned " << Val;
1355   }
1356 
1357   return createRegOperand(SRegClassID, Val >> shift);
1358 }
1359 
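// Build a 16-bit VGPR operand. A rough sketch, assuming the VGPR_16 class lists
// low/high halves in interleaved order: RegIdx = 3 with IsHi = true maps to
// index 7, i.e. the high half of v3.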
1360 MCOperand AMDGPUDisassembler::createVGPR16Operand(unsigned RegIdx,
1361                                                   bool IsHi) const {
1362   unsigned RegIdxInVGPR16 = RegIdx * 2 + (IsHi ? 1 : 0);
1363   return createRegOperand(AMDGPU::VGPR_16RegClassID, RegIdxInVGPR16);
1364 }
1365 
1366 // Decode literals for instructions which always have a literal in the encoding.
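// (E.g. both halves of a VOPD instruction may reference the same literal; a
// second, different literal is rejected below.)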
1367 MCOperand
1368 AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
1369   if (HasLiteral) {
1370     assert(
1371         AMDGPU::hasVOPD(STI) &&
1372         "Should only decode multiple kimm with VOPD, check VSrc operand types");
1373     if (Literal != Val)
1374       return errOperand(Val, "More than one unique literal is illegal");
1375   }
1376   HasLiteral = true;
1377   Literal = Val;
1378   return MCOperand::createImm(Literal);
1379 }
1380 
1381 MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
1382   // For now all literal constants are treated as unsigned integers.
1383   // ToDo: deal with signed/unsigned 64-bit integer constants
1384   // ToDo: deal with float/double constants
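  // When ExtendFP64 is set, the 32-bit literal forms the high half of the
  // 64-bit value; e.g. a literal of 0x3FF00000 becomes 0x3FF0000000000000,
  // which is the double 1.0 (sketch of the common case).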
1385   if (!HasLiteral) {
1386     if (Bytes.size() < 4) {
1387       return errOperand(0, "cannot read literal, inst bytes left " +
1388                         Twine(Bytes.size()));
1389     }
1390     HasLiteral = true;
1391     Literal = Literal64 = eatBytes<uint32_t>(Bytes);
1392     if (ExtendFP64)
1393       Literal64 <<= 32;
1394   }
1395   return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
1396 }
1397 
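// Map a 9-bit inline-integer encoding onto its signed value. As an illustration
// (assuming the usual INLINE_INTEGER_C_MIN == 128 and
// INLINE_INTEGER_C_POSITIVE_MAX == 192): Imm = 128 decodes to 0, Imm = 129 to 1
// and Imm = 193 to -1.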
1398 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
1399   using namespace AMDGPU::EncValues;
1400 
1401   assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
1402   return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
1403     (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
1404     (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
1405       // Cast prevents negative overflow.
1406 }
1407 
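// The helpers below return the raw IEEE-754 bit patterns of the inline FP
// constants; e.g. case 240 (0.5f) is 0x3F000000 and case 243 (-1.0f) is
// 0xBF800000.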
1408 static int64_t getInlineImmVal32(unsigned Imm) {
1409   switch (Imm) {
1410   case 240:
1411     return llvm::bit_cast<uint32_t>(0.5f);
1412   case 241:
1413     return llvm::bit_cast<uint32_t>(-0.5f);
1414   case 242:
1415     return llvm::bit_cast<uint32_t>(1.0f);
1416   case 243:
1417     return llvm::bit_cast<uint32_t>(-1.0f);
1418   case 244:
1419     return llvm::bit_cast<uint32_t>(2.0f);
1420   case 245:
1421     return llvm::bit_cast<uint32_t>(-2.0f);
1422   case 246:
1423     return llvm::bit_cast<uint32_t>(4.0f);
1424   case 247:
1425     return llvm::bit_cast<uint32_t>(-4.0f);
1426   case 248: // 1 / (2 * PI)
1427     return 0x3e22f983;
1428   default:
1429     llvm_unreachable("invalid fp inline imm");
1430   }
1431 }
1432 
1433 static int64_t getInlineImmVal64(unsigned Imm) {
1434   switch (Imm) {
1435   case 240:
1436     return llvm::bit_cast<uint64_t>(0.5);
1437   case 241:
1438     return llvm::bit_cast<uint64_t>(-0.5);
1439   case 242:
1440     return llvm::bit_cast<uint64_t>(1.0);
1441   case 243:
1442     return llvm::bit_cast<uint64_t>(-1.0);
1443   case 244:
1444     return llvm::bit_cast<uint64_t>(2.0);
1445   case 245:
1446     return llvm::bit_cast<uint64_t>(-2.0);
1447   case 246:
1448     return llvm::bit_cast<uint64_t>(4.0);
1449   case 247:
1450     return llvm::bit_cast<uint64_t>(-4.0);
1451   case 248: // 1 / (2 * PI)
1452     return 0x3fc45f306dc9c882;
1453   default:
1454     llvm_unreachable("invalid fp inline imm");
1455   }
1456 }
1457 
1458 static int64_t getInlineImmValF16(unsigned Imm) {
1459   switch (Imm) {
1460   case 240:
1461     return 0x3800;
1462   case 241:
1463     return 0xB800;
1464   case 242:
1465     return 0x3C00;
1466   case 243:
1467     return 0xBC00;
1468   case 244:
1469     return 0x4000;
1470   case 245:
1471     return 0xC000;
1472   case 246:
1473     return 0x4400;
1474   case 247:
1475     return 0xC400;
1476   case 248: // 1 / (2 * PI)
1477     return 0x3118;
1478   default:
1479     llvm_unreachable("invalid fp inline imm");
1480   }
1481 }
1482 
1483 static int64_t getInlineImmValBF16(unsigned Imm) {
1484   switch (Imm) {
1485   case 240:
1486     return 0x3F00;
1487   case 241:
1488     return 0xBF00;
1489   case 242:
1490     return 0x3F80;
1491   case 243:
1492     return 0xBF80;
1493   case 244:
1494     return 0x4000;
1495   case 245:
1496     return 0xC000;
1497   case 246:
1498     return 0x4080;
1499   case 247:
1500     return 0xC080;
1501   case 248: // 1 / (2 * PI)
1502     return 0x3E22;
1503   default:
1504     llvm_unreachable("invalid fp inline imm");
1505   }
1506 }
1507 
1508 static int64_t getInlineImmVal16(unsigned Imm, AMDGPU::OperandSemantics Sema) {
1509   return (Sema == AMDGPU::OperandSemantics::BF16) ? getInlineImmValBF16(Imm)
1510                                                   : getInlineImmValF16(Imm);
1511 }
1512 
1513 MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm,
1514                                             AMDGPU::OperandSemantics Sema) {
1515   assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN &&
1516          Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
1517 
1518   // ToDo: case 248: 1/(2*PI) - is allowed only on VI
1519   // ImmWidth 0 is the default case where the operand should not allow immediates.
1520   // The Imm value is still decoded into a 32-bit immediate operand; the inst
1521   // printer will use it to print a verbose error message.
1522   switch (ImmWidth) {
1523   case 0:
1524   case 32:
1525     return MCOperand::createImm(getInlineImmVal32(Imm));
1526   case 64:
1527     return MCOperand::createImm(getInlineImmVal64(Imm));
1528   case 16:
1529     return MCOperand::createImm(getInlineImmVal16(Imm, Sema));
1530   default:
1531     llvm_unreachable("implement me");
1532   }
1533 }
1534 
1535 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
1536   using namespace AMDGPU;
1537 
1538   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1539   switch (Width) {
1540   default: // fallthrough
1541   case OPW32:
1542   case OPW16:
1543   case OPWV216:
1544     return VGPR_32RegClassID;
1545   case OPW64:
1546   case OPWV232: return VReg_64RegClassID;
1547   case OPW96: return VReg_96RegClassID;
1548   case OPW128: return VReg_128RegClassID;
1549   case OPW160: return VReg_160RegClassID;
1550   case OPW256: return VReg_256RegClassID;
1551   case OPW288: return VReg_288RegClassID;
1552   case OPW320: return VReg_320RegClassID;
1553   case OPW352: return VReg_352RegClassID;
1554   case OPW384: return VReg_384RegClassID;
1555   case OPW512: return VReg_512RegClassID;
1556   case OPW1024: return VReg_1024RegClassID;
1557   }
1558 }
1559 
1560 unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
1561   using namespace AMDGPU;
1562 
1563   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1564   switch (Width) {
1565   default: // fallthrough
1566   case OPW32:
1567   case OPW16:
1568   case OPWV216:
1569     return AGPR_32RegClassID;
1570   case OPW64:
1571   case OPWV232: return AReg_64RegClassID;
1572   case OPW96: return AReg_96RegClassID;
1573   case OPW128: return AReg_128RegClassID;
1574   case OPW160: return AReg_160RegClassID;
1575   case OPW256: return AReg_256RegClassID;
1576   case OPW288: return AReg_288RegClassID;
1577   case OPW320: return AReg_320RegClassID;
1578   case OPW352: return AReg_352RegClassID;
1579   case OPW384: return AReg_384RegClassID;
1580   case OPW512: return AReg_512RegClassID;
1581   case OPW1024: return AReg_1024RegClassID;
1582   }
1583 }
1584 
1585 
1586 unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
1587   using namespace AMDGPU;
1588 
1589   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1590   switch (Width) {
1591   default: // fallthrough
1592   case OPW32:
1593   case OPW16:
1594   case OPWV216:
1595     return SGPR_32RegClassID;
1596   case OPW64:
1597   case OPWV232: return SGPR_64RegClassID;
1598   case OPW96: return SGPR_96RegClassID;
1599   case OPW128: return SGPR_128RegClassID;
1600   case OPW160: return SGPR_160RegClassID;
1601   case OPW256: return SGPR_256RegClassID;
1602   case OPW288: return SGPR_288RegClassID;
1603   case OPW320: return SGPR_320RegClassID;
1604   case OPW352: return SGPR_352RegClassID;
1605   case OPW384: return SGPR_384RegClassID;
1606   case OPW512: return SGPR_512RegClassID;
1607   }
1608 }
1609 
1610 unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
1611   using namespace AMDGPU;
1612 
1613   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1614   switch (Width) {
1615   default: // fallthrough
1616   case OPW32:
1617   case OPW16:
1618   case OPWV216:
1619     return TTMP_32RegClassID;
1620   case OPW64:
1621   case OPWV232: return TTMP_64RegClassID;
1622   case OPW128: return TTMP_128RegClassID;
1623   case OPW256: return TTMP_256RegClassID;
1624   case OPW288: return TTMP_288RegClassID;
1625   case OPW320: return TTMP_320RegClassID;
1626   case OPW352: return TTMP_352RegClassID;
1627   case OPW384: return TTMP_384RegClassID;
1628   case OPW512: return TTMP_512RegClassID;
1629   }
1630 }
1631 
1632 int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
1633   using namespace AMDGPU::EncValues;
1634 
1635   unsigned TTmpMin = isGFX9Plus() ? TTMP_GFX9PLUS_MIN : TTMP_VI_MIN;
1636   unsigned TTmpMax = isGFX9Plus() ? TTMP_GFX9PLUS_MAX : TTMP_VI_MAX;
1637 
1638   return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
1639 }
1640 
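// Decode a 10-bit (enum10) source operand. Bit 9 selects the AGPR file; the low
// nine bits then follow the usual VSrc encoding. For instance, assuming the
// common VGPR_MIN of 256: Val = 256 decodes to v0, Val = 768 to a0, and values
// below 256 are handled by decodeNonVGPRSrcOp.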
1641 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
1642                                           bool MandatoryLiteral,
1643                                           unsigned ImmWidth,
1644                                           AMDGPU::OperandSemantics Sema) const {
1645   using namespace AMDGPU::EncValues;
1646 
1647   assert(Val < 1024); // enum10
1648 
1649   bool IsAGPR = Val & 512;
1650   Val &= 511;
1651 
1652   if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
1653     return createRegOperand(IsAGPR ? getAgprClassId(Width)
1654                                    : getVgprClassId(Width), Val - VGPR_MIN);
1655   }
1656   return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
1657                             Sema);
1658 }
1659 
1660 MCOperand
1661 AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
1662                                        bool MandatoryLiteral, unsigned ImmWidth,
1663                                        AMDGPU::OperandSemantics Sema) const {
1664   // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been
1665   // decoded earlier.
1666   assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
1667   using namespace AMDGPU::EncValues;
1668 
1669   if (Val <= SGPR_MAX) {
1670     // "SGPR_MIN <= Val" is always true and causes compilation warning.
1671     static_assert(SGPR_MIN == 0);
1672     return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
1673   }
1674 
1675   int TTmpIdx = getTTmpIdx(Val);
1676   if (TTmpIdx >= 0) {
1677     return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
1678   }
1679 
1680   if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
1681     return decodeIntImmed(Val);
1682 
1683   if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
1684     return decodeFPImmed(ImmWidth, Val, Sema);
1685 
1686   if (Val == LITERAL_CONST) {
1687     if (MandatoryLiteral)
1688       // Keep a sentinel value for deferred setting
1689       return MCOperand::createImm(LITERAL_CONST);
1690     return decodeLiteralConstant(Sema == AMDGPU::OperandSemantics::FP64);
1691   }
1692 
1693   switch (Width) {
1694   case OPW32:
1695   case OPW16:
1696   case OPWV216:
1697     return decodeSpecialReg32(Val);
1698   case OPW64:
1699   case OPWV232:
1700     return decodeSpecialReg64(Val);
1701   default:
1702     llvm_unreachable("unexpected immediate type");
1703   }
1704 }
1705 
1706 // Bit 0 of DstY isn't stored in the instruction, because it's always the
1707 // opposite of bit 0 of DstX.
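// E.g. if vdstX decoded to an even VGPR (encoding bit 0 == 0), bit 0 of DstY is
// forced to 1 below, so X and Y always land on VGPRs of opposite parity.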
1708 MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
1709                                                unsigned Val) const {
1710   int VDstXInd =
1711       AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
1712   assert(VDstXInd != -1);
1713   assert(Inst.getOperand(VDstXInd).isReg());
1714   unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
1715   Val |= ~XDstReg & 1;
1716   auto Width = llvm::AMDGPUDisassembler::OPW32;
1717   return createRegOperand(getVgprClassId(Width), Val);
1718 }
1719 
1720 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
1721   using namespace AMDGPU;
1722 
1723   switch (Val) {
1724   // clang-format off
1725   case 102: return createRegOperand(FLAT_SCR_LO);
1726   case 103: return createRegOperand(FLAT_SCR_HI);
1727   case 104: return createRegOperand(XNACK_MASK_LO);
1728   case 105: return createRegOperand(XNACK_MASK_HI);
1729   case 106: return createRegOperand(VCC_LO);
1730   case 107: return createRegOperand(VCC_HI);
1731   case 108: return createRegOperand(TBA_LO);
1732   case 109: return createRegOperand(TBA_HI);
1733   case 110: return createRegOperand(TMA_LO);
1734   case 111: return createRegOperand(TMA_HI);
1735   case 124:
1736     return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
1737   case 125:
1738     return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
1739   case 126: return createRegOperand(EXEC_LO);
1740   case 127: return createRegOperand(EXEC_HI);
1741   case 235: return createRegOperand(SRC_SHARED_BASE_LO);
1742   case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
1743   case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
1744   case 238: return createRegOperand(SRC_PRIVATE_LIMIT_LO);
1745   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1746   case 251: return createRegOperand(SRC_VCCZ);
1747   case 252: return createRegOperand(SRC_EXECZ);
1748   case 253: return createRegOperand(SRC_SCC);
1749   case 254: return createRegOperand(LDS_DIRECT);
1750   default: break;
1751     // clang-format on
1752   }
1753   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1754 }
1755 
1756 MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
1757   using namespace AMDGPU;
1758 
1759   switch (Val) {
1760   case 102: return createRegOperand(FLAT_SCR);
1761   case 104: return createRegOperand(XNACK_MASK);
1762   case 106: return createRegOperand(VCC);
1763   case 108: return createRegOperand(TBA);
1764   case 110: return createRegOperand(TMA);
1765   case 124:
1766     if (isGFX11Plus())
1767       return createRegOperand(SGPR_NULL);
1768     break;
1769   case 125:
1770     if (!isGFX11Plus())
1771       return createRegOperand(SGPR_NULL);
1772     break;
1773   case 126: return createRegOperand(EXEC);
1774   case 235: return createRegOperand(SRC_SHARED_BASE);
1775   case 236: return createRegOperand(SRC_SHARED_LIMIT);
1776   case 237: return createRegOperand(SRC_PRIVATE_BASE);
1777   case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
1778   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1779   case 251: return createRegOperand(SRC_VCCZ);
1780   case 252: return createRegOperand(SRC_EXECZ);
1781   case 253: return createRegOperand(SRC_SCC);
1782   default: break;
1783   }
1784   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1785 }
1786 
1787 MCOperand
1788 AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, const unsigned Val,
1789                                   unsigned ImmWidth,
1790                                   AMDGPU::OperandSemantics Sema) const {
1791   using namespace AMDGPU::SDWA;
1792   using namespace AMDGPU::EncValues;
1793 
1794   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
1795       STI.hasFeature(AMDGPU::FeatureGFX10)) {
1796     // XXX: cast to int is needed to avoid a spurious compiler warning:
1797     // comparison with unsigned is always true
1798     if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
1799         Val <= SDWA9EncValues::SRC_VGPR_MAX) {
1800       return createRegOperand(getVgprClassId(Width),
1801                               Val - SDWA9EncValues::SRC_VGPR_MIN);
1802     }
1803     if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
1804         Val <= (isGFX10Plus() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
1805                               : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
1806       return createSRegOperand(getSgprClassId(Width),
1807                                Val - SDWA9EncValues::SRC_SGPR_MIN);
1808     }
1809     if (SDWA9EncValues::SRC_TTMP_MIN <= Val &&
1810         Val <= SDWA9EncValues::SRC_TTMP_MAX) {
1811       return createSRegOperand(getTtmpClassId(Width),
1812                                Val - SDWA9EncValues::SRC_TTMP_MIN);
1813     }
1814 
1815     const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
1816 
1817     if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
1818       return decodeIntImmed(SVal);
1819 
1820     if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
1821       return decodeFPImmed(ImmWidth, SVal, Sema);
1822 
1823     return decodeSpecialReg32(SVal);
1824   }
1825   if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands))
1826     return createRegOperand(getVgprClassId(Width), Val);
1827   llvm_unreachable("unsupported target");
1828 }
1829 
1830 MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
1831   return decodeSDWASrc(OPW16, Val, 16, AMDGPU::OperandSemantics::FP16);
1832 }
1833 
1834 MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
1835   return decodeSDWASrc(OPW32, Val, 32, AMDGPU::OperandSemantics::FP32);
1836 }
1837 
1838 MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
1839   using namespace AMDGPU::SDWA;
1840 
1841   assert((STI.hasFeature(AMDGPU::FeatureGFX9) ||
1842           STI.hasFeature(AMDGPU::FeatureGFX10)) &&
1843          "SDWAVopcDst should be present only on GFX9+");
1844 
1845   bool IsWave64 = STI.hasFeature(AMDGPU::FeatureWavefrontSize64);
1846 
1847   if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
1848     Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
1849 
1850     int TTmpIdx = getTTmpIdx(Val);
1851     if (TTmpIdx >= 0) {
1852       auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
1853       return createSRegOperand(TTmpClsId, TTmpIdx);
1854     }
1855     if (Val > SGPR_MAX) {
1856       return IsWave64 ? decodeSpecialReg64(Val) : decodeSpecialReg32(Val);
1857     }
1858     return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
1859   }
1860   return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
1861 }
1862 
1863 MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
1864   return STI.hasFeature(AMDGPU::FeatureWavefrontSize64)
1865              ? decodeSrcOp(OPW64, Val)
1866              : decodeSrcOp(OPW32, Val);
1867 }
1868 
1869 MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
1870   return decodeSrcOp(OPW32, Val);
1871 }
1872 
1873 MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const {
1874   if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1)
1875     return MCOperand();
1876   return MCOperand::createImm(Val);
1877 }
1878 
1879 MCOperand AMDGPUDisassembler::decodeVersionImm(unsigned Imm) const {
1880   using VersionField = AMDGPU::EncodingField<7, 0>;
1881   using W64Bit = AMDGPU::EncodingBit<13>;
1882   using W32Bit = AMDGPU::EncodingBit<14>;
1883   using MDPBit = AMDGPU::EncodingBit<15>;
1884   using Encoding = AMDGPU::EncodingFields<VersionField, W64Bit, W32Bit, MDPBit>;
1885 
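  // Per the field declarations above: bits [7:0] hold the version code, bit 13
  // the W64 flag, bit 14 the W32 flag and bit 15 the MDP flag. For example,
  // Imm = 0x2004 would decode to version code 4 with only W64 set (illustrative
  // value).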
1886   auto [Version, W64, W32, MDP] = Encoding::decode(Imm);
1887 
1888   // Decode into a plain immediate if any unused bits are set.
1889   if (Encoding::encode(Version, W64, W32, MDP) != Imm)
1890     return MCOperand::createImm(Imm);
1891 
1892   const auto &Versions = AMDGPU::UCVersion::getGFXVersions();
1893   const auto *I = find_if(
1894       Versions, [Version = Version](const AMDGPU::UCVersion::GFXVersion &V) {
1895         return V.Code == Version;
1896       });
1897   MCContext &Ctx = getContext();
1898   const MCExpr *E;
1899   if (I == Versions.end())
1900     E = MCConstantExpr::create(Version, Ctx);
1901   else
1902     E = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(I->Symbol), Ctx);
1903 
1904   if (W64)
1905     E = MCBinaryExpr::createOr(E, UCVersionW64Expr, Ctx);
1906   if (W32)
1907     E = MCBinaryExpr::createOr(E, UCVersionW32Expr, Ctx);
1908   if (MDP)
1909     E = MCBinaryExpr::createOr(E, UCVersionMDPExpr, Ctx);
1910 
1911   return MCOperand::createExpr(E);
1912 }
1913 
1914 bool AMDGPUDisassembler::isVI() const {
1915   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
1916 }
1917 
1918 bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
1919 
1920 bool AMDGPUDisassembler::isGFX90A() const {
1921   return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
1922 }
1923 
1924 bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
1925 
1926 bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
1927 
1928 bool AMDGPUDisassembler::isGFX10Plus() const {
1929   return AMDGPU::isGFX10Plus(STI);
1930 }
1931 
1932 bool AMDGPUDisassembler::isGFX11() const {
1933   return STI.hasFeature(AMDGPU::FeatureGFX11);
1934 }
1935 
1936 bool AMDGPUDisassembler::isGFX11Plus() const {
1937   return AMDGPU::isGFX11Plus(STI);
1938 }
1939 
1940 bool AMDGPUDisassembler::isGFX12() const {
1941   return STI.hasFeature(AMDGPU::FeatureGFX12);
1942 }
1943 
1944 bool AMDGPUDisassembler::isGFX12Plus() const {
1945   return AMDGPU::isGFX12Plus(STI);
1946 }
1947 
1948 bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
1949   return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
1950 }
1951 
1952 bool AMDGPUDisassembler::hasKernargPreload() const {
1953   return AMDGPU::hasKernargPreload(STI);
1954 }
1955 
1956 //===----------------------------------------------------------------------===//
1957 // AMDGPU specific symbol handling
1958 //===----------------------------------------------------------------------===//
1959 
1960 /// Print a string describing the reserved bit range specified by Mask with
1961 /// offset BaseBytes for use in error comments. Mask is a single continuous
1962 /// range of 1s surrounded by zeros. The format here is meant to align with the
1963 /// tables that describe these bits in llvm.org/docs/AMDGPUUsage.html.
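/// For example, Mask = 0x30 with BaseBytes = 0 yields "bits in range (5:4)",
/// and Mask = 0x1 with BaseBytes = 2 yields "bit (16)".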
1964 static SmallString<32> getBitRangeFromMask(uint32_t Mask, unsigned BaseBytes) {
1965   SmallString<32> Result;
1966   raw_svector_ostream S(Result);
1967 
1968   int TrailingZeros = llvm::countr_zero(Mask);
1969   int PopCount = llvm::popcount(Mask);
1970 
1971   if (PopCount == 1) {
1972     S << "bit (" << (TrailingZeros + BaseBytes * CHAR_BIT) << ')';
1973   } else {
1974     S << "bits in range ("
1975       << (TrailingZeros + PopCount - 1 + BaseBytes * CHAR_BIT) << ':'
1976       << (TrailingZeros + BaseBytes * CHAR_BIT) << ')';
1977   }
1978 
1979   return Result;
1980 }
1981 
1982 #define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK))
1983 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
1984   do {                                                                         \
1985     KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n';            \
1986   } while (0)
1987 #define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK)                        \
1988   do {                                                                         \
1989     KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " "       \
1990              << GET_FIELD(MASK) << '\n';                                       \
1991   } while (0)
1992 
1993 #define CHECK_RESERVED_BITS_IMPL(MASK, DESC, MSG)                              \
1994   do {                                                                         \
1995     if (FourByteBuffer & (MASK)) {                                             \
1996       return createStringError(std::errc::invalid_argument,                    \
1997                                "kernel descriptor " DESC                       \
1998                                " reserved %s set" MSG,                         \
1999                                getBitRangeFromMask((MASK), 0).c_str());        \
2000     }                                                                          \
2001   } while (0)
2002 
2003 #define CHECK_RESERVED_BITS(MASK) CHECK_RESERVED_BITS_IMPL(MASK, #MASK, "")
2004 #define CHECK_RESERVED_BITS_MSG(MASK, MSG)                                     \
2005   CHECK_RESERVED_BITS_IMPL(MASK, #MASK, ", " MSG)
2006 #define CHECK_RESERVED_BITS_DESC(MASK, DESC)                                   \
2007   CHECK_RESERVED_BITS_IMPL(MASK, DESC, "")
2008 #define CHECK_RESERVED_BITS_DESC_MSG(MASK, DESC, MSG)                          \
2009   CHECK_RESERVED_BITS_IMPL(MASK, DESC, ", " MSG)
2010 
2011 // NOLINTNEXTLINE(readability-identifier-naming)
2012 Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
2013     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
2014   using namespace amdhsa;
2015   StringRef Indent = "\t";
2016 
2017   // We cannot accurately backward compute #VGPRs used from
2018   // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same
2019   // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we
2020   // simply calculate the inverse of what the assembler does.
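  // For example, with a VGPR encoding granule of 4, a granulated count of 3
  // round-trips to .amdhsa_next_free_vgpr 16 ((3 + 1) * 4); the actual granule
  // depends on the subtarget and wavefront size.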
2021 
2022   uint32_t GranulatedWorkitemVGPRCount =
2023       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT);
2024 
2025   uint32_t NextFreeVGPR =
2026       (GranulatedWorkitemVGPRCount + 1) *
2027       AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32);
2028 
2029   KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
2030 
2031   // We cannot backward compute values used to calculate
2032   // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for the following
2033   // directives can't be computed:
2034   // .amdhsa_reserve_vcc
2035   // .amdhsa_reserve_flat_scratch
2036   // .amdhsa_reserve_xnack_mask
2037   // They take their respective default values if not specified in the assembly.
2038   //
2039   // GRANULATED_WAVEFRONT_SGPR_COUNT
2040   //    = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
2041   //
2042   // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
2043   // are set to 0. So while disassembling we consider that:
2044   //
2045   // GRANULATED_WAVEFRONT_SGPR_COUNT
2046   //    = f(NEXT_FREE_SGPR + 0 + 0 + 0)
2047   //
2048   // The disassembler cannot recover the original values of those 3 directives.
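  // E.g. with an SGPR encoding granule of 8, a granulated count of 3 is emitted
  // as .amdhsa_next_free_sgpr 32, regardless of how many of those SGPRs were
  // originally reserved for VCC/FLAT_SCRATCH/XNACK_MASK.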
2049 
2050   uint32_t GranulatedWavefrontSGPRCount =
2051       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT);
2052 
2053   if (isGFX10Plus())
2054     CHECK_RESERVED_BITS_MSG(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
2055                             "must be zero on gfx10+");
2056 
2057   uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
2058                           AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
2059 
2060   KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
2061   if (!hasArchitectedFlatScratch())
2062     KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
2063   KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
2064   KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
2065 
2066   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_PRIORITY);
2067 
2068   PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
2069                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
2070   PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
2071                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
2072   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
2073                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
2074   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
2075                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
2076 
2077   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_PRIV);
2078 
2079   if (!isGFX12Plus())
2080     PRINT_DIRECTIVE(".amdhsa_dx10_clamp",
2081                     COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
2082 
2083   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_DEBUG_MODE);
2084 
2085   if (!isGFX12Plus())
2086     PRINT_DIRECTIVE(".amdhsa_ieee_mode",
2087                     COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
2088 
2089   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_BULKY);
2090   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC1_CDBG_USER);
2091 
2092   if (isGFX9Plus())
2093     PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL);
2094 
2095   if (!isGFX9Plus())
2096     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0,
2097                                  "COMPUTE_PGM_RSRC1", "must be zero pre-gfx9");
2098 
2099   CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC1_RESERVED1, "COMPUTE_PGM_RSRC1");
2100 
2101   if (!isGFX10Plus())
2102     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2,
2103                                  "COMPUTE_PGM_RSRC1", "must be zero pre-gfx10");
2104 
2105   if (isGFX10Plus()) {
2106     PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
2107                     COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
2108     PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
2109     PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
2110   }
2111 
2112   if (isGFX12Plus())
2113     PRINT_DIRECTIVE(".amdhsa_round_robin_scheduling",
2114                     COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
2115 
2116   return true;
2117 }
2118 
2119 // NOLINTNEXTLINE(readability-identifier-naming)
2120 Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
2121     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
2122   using namespace amdhsa;
2123   StringRef Indent = "\t";
2124   if (hasArchitectedFlatScratch())
2125     PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
2126                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
2127   else
2128     PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
2129                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
2130   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
2131                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
2132   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
2133                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
2134   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
2135                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
2136   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
2137                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
2138   PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
2139                   COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
2140 
2141   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH);
2142   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY);
2143   CHECK_RESERVED_BITS(COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE);
2144 
2145   PRINT_DIRECTIVE(
2146       ".amdhsa_exception_fp_ieee_invalid_op",
2147       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
2148   PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
2149                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
2150   PRINT_DIRECTIVE(
2151       ".amdhsa_exception_fp_ieee_div_zero",
2152       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
2153   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
2154                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
2155   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
2156                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
2157   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
2158                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
2159   PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
2160                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
2161 
2162   CHECK_RESERVED_BITS_DESC(COMPUTE_PGM_RSRC2_RESERVED0, "COMPUTE_PGM_RSRC2");
2163 
2164   return true;
2165 }
2166 
2167 // NOLINTNEXTLINE(readability-identifier-naming)
2168 Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
2169     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
2170   using namespace amdhsa;
2171   StringRef Indent = "\t";
2172   if (isGFX90A()) {
2173     KdStream << Indent << ".amdhsa_accum_offset "
2174              << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
2175              << '\n';
2176 
2177     PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
2178 
2179     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX90A_RESERVED0,
2180                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx90a");
2181     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX90A_RESERVED1,
2182                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx90a");
2183   } else if (isGFX10Plus()) {
2184     // Bits [0-3].
2185     if (!isGFX12Plus()) {
2186       if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
2187         PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
2188                         COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
2189       } else {
2190         PRINT_PSEUDO_DIRECTIVE_COMMENT(
2191             "SHARED_VGPR_COUNT",
2192             COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
2193       }
2194     } else {
2195       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0,
2196                                    "COMPUTE_PGM_RSRC3",
2197                                    "must be zero on gfx12+");
2198     }
2199 
2200     // Bits [4-11].
2201     if (isGFX11()) {
2202       PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
2203                                      COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE);
2204       PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
2205                                      COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START);
2206       PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
2207                                      COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END);
2208     } else if (isGFX12Plus()) {
2209       PRINT_PSEUDO_DIRECTIVE_COMMENT(
2210           "INST_PREF_SIZE", COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE);
2211     } else {
2212       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED1,
2213                                    "COMPUTE_PGM_RSRC3",
2214                                    "must be zero on gfx10");
2215     }
2216 
2217     // Bits [12].
2218     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2,
2219                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
2220 
2221     // Bits [13].
2222     if (isGFX12Plus()) {
2223       PRINT_PSEUDO_DIRECTIVE_COMMENT("GLG_EN",
2224                                      COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN);
2225     } else {
2226       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3,
2227                                    "COMPUTE_PGM_RSRC3",
2228                                    "must be zero on gfx10 or gfx11");
2229     }
2230 
2231     // Bits [14-30].
2232     CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
2233                                  "COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
2234 
2235     // Bits [31].
2236     if (isGFX11Plus()) {
2237       PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
2238                                      COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP);
2239     } else {
2240       CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED5,
2241                                    "COMPUTE_PGM_RSRC3",
2242                                    "must be zero on gfx10");
2243     }
2244   } else if (FourByteBuffer) {
2245     return createStringError(
2246         std::errc::invalid_argument,
2247         "kernel descriptor COMPUTE_PGM_RSRC3 must be all zero before gfx9");
2248   }
2249   return true;
2250 }
2251 #undef PRINT_PSEUDO_DIRECTIVE_COMMENT
2252 #undef PRINT_DIRECTIVE
2253 #undef GET_FIELD
2254 #undef CHECK_RESERVED_BITS_IMPL
2255 #undef CHECK_RESERVED_BITS
2256 #undef CHECK_RESERVED_BITS_MSG
2257 #undef CHECK_RESERVED_BITS_DESC
2258 #undef CHECK_RESERVED_BITS_DESC_MSG
2259 
2260 /// Create an error object to return from onSymbolStart for reserved kernel
2261 /// descriptor bits being set.
2262 static Error createReservedKDBitsError(uint32_t Mask, unsigned BaseBytes,
2263                                        const char *Msg = "") {
2264   return createStringError(
2265       std::errc::invalid_argument, "kernel descriptor reserved %s set%s%s",
2266       getBitRangeFromMask(Mask, BaseBytes).c_str(), *Msg ? ", " : "", Msg);
2267 }
2268 
2269 /// Create an error object to return from onSymbolStart for reserved kernel
2270 /// descriptor bytes being set.
2271 static Error createReservedKDBytesError(unsigned BaseInBytes,
2272                                         unsigned WidthInBytes) {
2273   // Create an error comment in the same format as the "Kernel Descriptor"
2274   // table here: https://llvm.org/docs/AMDGPUUsage.html#kernel-descriptor .
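  // E.g. BaseInBytes = 12 and WidthInBytes = 4 would produce
  // "bits in range (127:96)" (illustrative values only).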
2275   return createStringError(
2276       std::errc::invalid_argument,
2277       "kernel descriptor reserved bits in range (%u:%u) set",
2278       (BaseInBytes + WidthInBytes) * CHAR_BIT - 1, BaseInBytes * CHAR_BIT);
2279 }
2280 
2281 Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
2282     DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
2283     raw_string_ostream &KdStream) const {
2284 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
2285   do {                                                                         \
2286     KdStream << Indent << DIRECTIVE " "                                        \
2287              << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
2288   } while (0)
2289 
2290   uint16_t TwoByteBuffer = 0;
2291   uint32_t FourByteBuffer = 0;
2292 
2293   StringRef ReservedBytes;
2294   StringRef Indent = "\t";
2295 
2296   assert(Bytes.size() == 64);
2297   DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
2298 
2299   switch (Cursor.tell()) {
2300   case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
2301     FourByteBuffer = DE.getU32(Cursor);
2302     KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
2303              << '\n';
2304     return true;
2305 
2306   case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
2307     FourByteBuffer = DE.getU32(Cursor);
2308     KdStream << Indent << ".amdhsa_private_segment_fixed_size "
2309              << FourByteBuffer << '\n';
2310     return true;
2311 
2312   case amdhsa::KERNARG_SIZE_OFFSET:
2313     FourByteBuffer = DE.getU32(Cursor);
2314     KdStream << Indent << ".amdhsa_kernarg_size "
2315              << FourByteBuffer << '\n';
2316     return true;
2317 
2318   case amdhsa::RESERVED0_OFFSET:
2319     // 4 reserved bytes, must be 0.
2320     ReservedBytes = DE.getBytes(Cursor, 4);
2321     for (int I = 0; I < 4; ++I) {
2322       if (ReservedBytes[I] != 0)
2323         return createReservedKDBytesError(amdhsa::RESERVED0_OFFSET, 4);
2324     }
2325     return true;
2326 
2327   case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
2328     // KERNEL_CODE_ENTRY_BYTE_OFFSET
2329     // So far no directive controls this for Code Object V3, so simply skip for
2330     // disassembly.
2331     DE.skip(Cursor, 8);
2332     return true;
2333 
2334   case amdhsa::RESERVED1_OFFSET:
2335     // 20 reserved bytes, must be 0.
2336     ReservedBytes = DE.getBytes(Cursor, 20);
2337     for (int I = 0; I < 20; ++I) {
2338       if (ReservedBytes[I] != 0)
2339         return createReservedKDBytesError(amdhsa::RESERVED1_OFFSET, 20);
2340     }
2341     return true;
2342 
2343   case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
2344     FourByteBuffer = DE.getU32(Cursor);
2345     return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream);
2346 
2347   case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
2348     FourByteBuffer = DE.getU32(Cursor);
2349     return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream);
2350 
2351   case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
2352     FourByteBuffer = DE.getU32(Cursor);
2353     return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream);
2354 
2355   case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
2356     using namespace amdhsa;
2357     TwoByteBuffer = DE.getU16(Cursor);
2358 
2359     if (!hasArchitectedFlatScratch())
2360       PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
2361                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
2362     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
2363                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
2364     PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
2365                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
2366     PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
2367                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
2368     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
2369                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
2370     if (!hasArchitectedFlatScratch())
2371       PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
2372                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
2373     PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
2374                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
2375 
2376     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
2377       return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
2378                                        amdhsa::KERNEL_CODE_PROPERTIES_OFFSET);
2379 
2380     // Reserved for GFX9
2381     if (isGFX9() &&
2382         (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
2383       return createReservedKDBitsError(
2384           KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
2385           amdhsa::KERNEL_CODE_PROPERTIES_OFFSET, "must be zero on gfx9");
2386     }
2387     if (isGFX10Plus()) {
2388       PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
2389                       KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2390     }
2391 
2392     if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
2393       PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
2394                       KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
2395 
2396     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1) {
2397       return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED1,
2398                                        amdhsa::KERNEL_CODE_PROPERTIES_OFFSET);
2399     }
2400 
2401     return true;
2402 
2403   case amdhsa::KERNARG_PRELOAD_OFFSET:
2404     using namespace amdhsa;
2405     TwoByteBuffer = DE.getU16(Cursor);
2406     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) {
2407       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length",
2408                       KERNARG_PRELOAD_SPEC_LENGTH);
2409     }
2410 
2411     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) {
2412       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset",
2413                       KERNARG_PRELOAD_SPEC_OFFSET);
2414     }
2415     return true;
2416 
2417   case amdhsa::RESERVED3_OFFSET:
2418     // 4 bytes from here are reserved, must be 0.
2419     ReservedBytes = DE.getBytes(Cursor, 4);
2420     for (int I = 0; I < 4; ++I) {
2421       if (ReservedBytes[I] != 0)
2422         return createReservedKDBytesError(amdhsa::RESERVED3_OFFSET, 4);
2423     }
2424     return true;
2425 
2426   default:
2427     llvm_unreachable("Unhandled index. Case statements cover everything.");
2428     return true;
2429   }
2430 #undef PRINT_DIRECTIVE
2431 }
2432 
2433 Expected<bool> AMDGPUDisassembler::decodeKernelDescriptor(
2434     StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
2435 
2436   // CP microcode requires the kernel descriptor to be 64-byte aligned.
2437   if (Bytes.size() != 64 || KdAddress % 64 != 0)
2438     return createStringError(std::errc::invalid_argument,
2439                              "kernel descriptor must be 64-byte aligned");
2440 
2441   // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10
2442   // requires us to know the setting of .amdhsa_wavefront_size32 in order to
2443   // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong
2444   // order. Workaround this by first looking up .amdhsa_wavefront_size32 here
2445   // when required.
2446   if (isGFX10Plus()) {
2447     uint16_t KernelCodeProperties =
2448         support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET],
2449                                 llvm::endianness::little);
2450     EnableWavefrontSize32 =
2451         AMDHSA_BITS_GET(KernelCodeProperties,
2452                         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2453   }
2454 
2455   std::string Kd;
2456   raw_string_ostream KdStream(Kd);
2457   KdStream << ".amdhsa_kernel " << KdName << '\n';
2458 
2459   DataExtractor::Cursor C(0);
2460   while (C && C.tell() < Bytes.size()) {
2461     Expected<bool> Res = decodeKernelDescriptorDirective(C, Bytes, KdStream);
2462 
2463     cantFail(C.takeError());
2464 
2465     if (!Res)
2466       return Res;
2467   }
2468   KdStream << ".end_amdhsa_kernel\n";
2469   outs() << KdStream.str();
2470   return true;
2471 }
2472 
2473 Expected<bool> AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
2474                                                  uint64_t &Size,
2475                                                  ArrayRef<uint8_t> Bytes,
2476                                                  uint64_t Address) const {
2477   // Right now only kernel descriptor needs to be handled.
2478   // We ignore all other symbols for target specific handling.
2479   // TODO:
2480   // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
2481   // Object V2 and V3 when symbols are marked protected.
2482 
2483   // amd_kernel_code_t for Code Object V2.
2484   if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
2485     Size = 256;
2486     return createStringError(std::errc::invalid_argument,
2487                              "code object v2 is not supported");
2488   }
2489 
2490   // Code Object V3 kernel descriptors.
2491   StringRef Name = Symbol.Name;
2492   if (Symbol.Type == ELF::STT_OBJECT && Name.ends_with(StringRef(".kd"))) {
2493     Size = 64; // Size = 64 regardless of success or failure.
2494     return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
2495   }
2496 
2497   return false;
2498 }
2499 
2500 const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id,
2501                                                            int64_t Val) {
2502   MCContext &Ctx = getContext();
2503   MCSymbol *Sym = Ctx.getOrCreateSymbol(Id);
2504   // Note: only set the value to Val on a new symbol, in case a disassembler
2505   // has already been initialized in this context.
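  // E.g. a second disassembler created in the same MCContext reuses the
  // existing UC_VERSION_* symbols and only warns if a different value would be
  // assigned.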
2506   if (!Sym->isVariable()) {
2507     Sym->setVariableValue(MCConstantExpr::create(Val, Ctx));
2508   } else {
2509     int64_t Res = ~Val;
2510     bool Valid = Sym->getVariableValue()->evaluateAsAbsolute(Res);
2511     if (!Valid || Res != Val)
2512       Ctx.reportWarning(SMLoc(), "unsupported redefinition of " + Id);
2513   }
2514   return MCSymbolRefExpr::create(Sym, Ctx);
2515 }
2516 
2517 //===----------------------------------------------------------------------===//
2518 // AMDGPUSymbolizer
2519 //===----------------------------------------------------------------------===//
2520 
2521 // Try to find the symbol name for the specified label.
2522 bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
2523     MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
2524     uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
2525     uint64_t /*OpSize*/, uint64_t /*InstSize*/) {
2526 
2527   if (!IsBranch) {
2528     return false;
2529   }
2530 
2531   auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
2532   if (!Symbols)
2533     return false;
2534 
2535   auto Result = llvm::find_if(*Symbols, [Value](const SymbolInfoTy &Val) {
2536     return Val.Addr == static_cast<uint64_t>(Value) &&
2537            Val.Type == ELF::STT_NOTYPE;
2538   });
2539   if (Result != Symbols->end()) {
2540     auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
2541     const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
2542     Inst.addOperand(MCOperand::createExpr(Add));
2543     return true;
2544   }
2545   // Add to list of referenced addresses, so caller can synthesize a label.
2546   ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
2547   return false;
2548 }
2549 
2550 void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
2551                                                        int64_t Value,
2552                                                        uint64_t Address) {
2553   llvm_unreachable("unimplemented");
2554 }
2555 
2556 //===----------------------------------------------------------------------===//
2557 // Initialization
2558 //===----------------------------------------------------------------------===//
2559 
2560 static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
2561                               LLVMOpInfoCallback /*GetOpInfo*/,
2562                               LLVMSymbolLookupCallback /*SymbolLookUp*/,
2563                               void *DisInfo,
2564                               MCContext *Ctx,
2565                               std::unique_ptr<MCRelocationInfo> &&RelInfo) {
2566   return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
2567 }
2568 
2569 static MCDisassembler *createAMDGPUDisassembler(const Target &T,
2570                                                 const MCSubtargetInfo &STI,
2571                                                 MCContext &Ctx) {
2572   return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
2573 }
2574 
2575 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
2576   TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
2577                                          createAMDGPUDisassembler);
2578   TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
2579                                        createAMDGPUSymbolizer);
2580 }
2581