//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// This file contains the definition of the AMDGPU ISA disassembler.
//
//===----------------------------------------------------------------------===//

// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?

#include "Disassembler/AMDGPUDisassembler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/DisassemblerTypes.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDecoderOps.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-disassembler"

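// The highest SGPR encoding accepted by decodeSrcOp depends on the
// generation: GFX10+ allows more SGPR encodings than older subtargets.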
#define SGPR_MAX                                                               \
  (isGFX10Plus() ? AMDGPU::EncValues::SGPR_MAX_GFX10                           \
                 : AMDGPU::EncValues::SGPR_MAX_SI)

using DecodeStatus = llvm::MCDisassembler::DecodeStatus;

AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
                                       MCContext &Ctx,
                                       MCInstrInfo const *MCII) :
  MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
  TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) {

  // ToDo: AMDGPUDisassembler supports only VI ISA.
  if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10Plus())
    report_fatal_error("Disassembly not yet supported for subtarget");
}

inline static MCDisassembler::DecodeStatus
addOperand(MCInst &Inst, const MCOperand& Opnd) {
  Inst.addOperand(Opnd);
  return Opnd.isValid() ?
    MCDisassembler::Success :
    MCDisassembler::Fail;
}

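// Insert Op at the position that the named operand NameIdx occupies in this
// opcode's operand list. Returns -1 if the opcode has no such operand.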
static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
                                uint16_t NameIdx) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
  if (OpIdx != -1) {
    auto I = MI.begin();
    std::advance(I, OpIdx);
    MI.insert(I, Op);
  }
  return OpIdx;
}

static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
                                       uint64_t Addr,
                                       const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);

  // Our branches take a simm16, but we need two extra bits to account for the
  // factor of 4.
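  // The target is relative to the address of the next instruction (Addr + 4).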
  APInt SignedOffset(18, Imm * 4, true);
  int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();

  if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0))
    return MCDisassembler::Success;
  return addOperand(Inst, MCOperand::createImm(Imm));
}

static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
                                     const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
  int64_t Offset;
  if (DAsm->isVI()) {         // VI supports 20-bit unsigned offsets.
    Offset = Imm & 0xFFFFF;
  } else {                    // GFX9+ supports 21-bit signed offsets.
    Offset = SignExtend64<21>(Imm);
  }
  return addOperand(Inst, MCOperand::createImm(Offset));
}

static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
                                  const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
  return addOperand(Inst, DAsm->decodeBoolReg(Val));
}

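// Shorthand for declaring a static decoder entry point that simply forwards
// the raw immediate to one of AMDGPUDisassembler's decode* member functions.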
#define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
  static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
                                        uint64_t /*Addr*/,                     \
                                        const MCDisassembler *Decoder) {       \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(Inst, DAsm->DecoderName(Imm));                           \
  }

// Decoder for registers, decoding directly via RegClassID. Imm (8-bit) is the
// register number. Used by VGPR-only and AGPR-only operands.
#define DECODE_OPERAND_REG_8(RegClass)                                         \
  static DecodeStatus Decode##RegClass##RegisterClass(                         \
      MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,                           \
      const MCDisassembler *Decoder) {                                         \
    assert(Imm < (1 << 8) && "8-bit encoding");                                \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(                                                         \
        Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm));      \
  }

#define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral,         \
                     ImmWidth)                                                 \
  static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,      \
                           const MCDisassembler *Decoder) {                    \
    assert(Imm < (1 << EncSize) && #EncSize "-bit encoding");                  \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(Inst,                                                    \
                      DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm,   \
                                        MandatoryLiteral, ImmWidth));          \
  }

// Decoder for registers. Imm (7-bit) is the register number; uses decodeSrcOp
// to get the register class. Used by SGPR-only operands.
#define DECODE_OPERAND_REG_7(RegClass, OpWidth)                                \
  DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0)

// Decoder for registers. Imm (10-bit): Imm{7-0} is the register number,
// Imm{9} is acc (agpr or vgpr); Imm{8} should be 0 (see VOP3Pe_SMFMAC).
// Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp.
// Used by AV_ register classes (AGPR or VGPR only register operands).
#define DECODE_OPERAND_REG_AV10(RegClass, OpWidth)                             \
  DECODE_SrcOp(Decode##RegClass##RegisterClass, 10, OpWidth,                   \
               Imm | AMDGPU::EncValues::IS_VGPR, false, 0)

// Decoder for Src (9-bit encoding) registers only.
#define DECODE_OPERAND_SRC_REG_9(RegClass, OpWidth)                            \
  DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm, false, 0)

// Decoder for Src (9-bit encoding) AGPR, with the register number encoded in
// 9 bits. Set Imm{9} to 1 (set acc) and decode using 'enum10' from
// decodeSrcOp. Registers only.
#define DECODE_OPERAND_SRC_REG_A9(RegClass, OpWidth)                           \
  DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm | 512, false, 0)

// Decoder for 'enum10' from decodeSrcOp: Imm{0-8} is the 9-bit Src encoding,
// Imm{9} is acc. Registers only.
#define DECODE_SRC_OPERAND_REG_AV10(RegClass, OpWidth)                         \
  DECODE_SrcOp(decodeOperand_##RegClass, 10, OpWidth, Imm, false, 0)

// Decoder for RegisterOperands using the 9-bit Src encoding. The operand can
// be a register from RegClass or an immediate. Registers that don't belong to
// RegClass will still be decoded, and InstPrinter will report a warning. An
// immediate will be decoded into a constant of size ImmWidth, which should
// match the width of the immediate used by the OperandType (important for
// floating-point types).
#define DECODE_OPERAND_SRC_REG_OR_IMM_9(RegClass, OpWidth, ImmWidth)           \
  DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm,      \
               false, ImmWidth)

// Decoder for Src (9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set
// acc) and decode using 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth)          \
  DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth,           \
               Imm | 512, false, ImmWidth)

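// Like DECODE_OPERAND_SRC_REG_OR_IMM_9, but for deferred literals: decodeSrcOp
// keeps a LITERAL_CONST sentinel that is replaced once the literal is read.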
#define DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(RegClass, OpWidth, ImmWidth)  \
  DECODE_SrcOp(decodeOperand_##RegClass##_Deferred##_Imm##ImmWidth, 9,         \
               OpWidth, Imm, true, ImmWidth)

// Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
// when RegisterClass is used as an operand. Most often used for destination
// operands.

DECODE_OPERAND_REG_8(VGPR_32)
DECODE_OPERAND_REG_8(VGPR_32_Lo128)
DECODE_OPERAND_REG_8(VReg_64)
DECODE_OPERAND_REG_8(VReg_96)
DECODE_OPERAND_REG_8(VReg_128)
DECODE_OPERAND_REG_8(VReg_256)
DECODE_OPERAND_REG_8(VReg_288)
DECODE_OPERAND_REG_8(VReg_352)
DECODE_OPERAND_REG_8(VReg_384)
DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)

DECODE_OPERAND_REG_7(SReg_32, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
DECODE_OPERAND_REG_7(SReg_128, OPW128)
DECODE_OPERAND_REG_7(SReg_256, OPW256)
DECODE_OPERAND_REG_7(SReg_512, OPW512)

DECODE_OPERAND_REG_8(AGPR_32)
DECODE_OPERAND_REG_8(AReg_64)
DECODE_OPERAND_REG_8(AReg_128)
DECODE_OPERAND_REG_8(AReg_256)
DECODE_OPERAND_REG_8(AReg_512)
DECODE_OPERAND_REG_8(AReg_1024)

DECODE_OPERAND_REG_AV10(AVDst_128, OPW128)
DECODE_OPERAND_REG_AV10(AVDst_512, OPW512)

// Decoders for register-only source RegisterOperands that use the 9-bit Src
// encoding: 'decodeOperand_<RegClass>'.

DECODE_OPERAND_SRC_REG_9(VGPR_32, OPW32)
DECODE_OPERAND_SRC_REG_9(VReg_64, OPW64)
DECODE_OPERAND_SRC_REG_9(VReg_128, OPW128)
DECODE_OPERAND_SRC_REG_9(VReg_256, OPW256)
DECODE_OPERAND_SRC_REG_9(VRegOrLds_32, OPW32)

DECODE_OPERAND_SRC_REG_A9(AGPR_32, OPW32)

DECODE_SRC_OPERAND_REG_AV10(AV_32, OPW32)
DECODE_SRC_OPERAND_REG_AV10(AV_64, OPW64)
DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128)

// Decoders for register or immediate RegisterOperands that use 9-bit Src
// encoding: 'decodeOperand_<RegClass>_Imm<ImmWidth>'.

DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)

static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm,
                                          uint64_t Addr,
                                          const MCDisassembler *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}

static DecodeStatus decodeOperand_f16kimm(MCInst &Inst, unsigned Imm,
                                          uint64_t Addr,
                                          const MCDisassembler *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}

static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
                                          uint64_t Addr, const void *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val));
}

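// Returns true if the operand at OpIdx is (or starts with) a register in the
// AGPR0-AGPR255 range.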
static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
                          const MCRegisterInfo *MRI) {
  if (OpIdx < 0)
    return false;

  const MCOperand &Op = Inst.getOperand(OpIdx);
  if (!Op.isReg())
    return false;

  unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
  auto Reg = Sub ? Sub : Op.getReg();
  return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
}

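// Decode an AGPR-or-VGPR load/store operand. Before GFX90A only VGPRs are
// valid here; on GFX90A the ACC bit is inferred from the tied vdst/data
// operand, as the comments below explain.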
static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
                                             AMDGPUDisassembler::OpWidthTy Opw,
                                             const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
  if (!DAsm->isGFX90A()) {
    Imm &= 511;
  } else {
    // If an atomic has both vdata and vdst, their register classes are tied.
    // The bit is decoded along with the vdst, the first operand. We need to
    // change the register class to AGPR if vdst was an AGPR.
    // If a DS instruction has both data0 and data1, their register classes
    // are also tied.
    unsigned Opc = Inst.getOpcode();
    uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
    uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
                                                        : AMDGPU::OpName::vdata;
    const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
    int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
    if ((int)Inst.getNumOperands() == DataIdx) {
      int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (IsAGPROperand(Inst, DstIdx, MRI))
        Imm |= 512;
    }

    if (TSFlags & SIInstrFlags::DS) {
      int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
      if ((int)Inst.getNumOperands() == Data2Idx &&
          IsAGPROperand(Inst, DataIdx, MRI))
        Imm |= 512;
    }
  }
  return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
}

static DecodeStatus
DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW32, Decoder);
}

static DecodeStatus
DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW64, Decoder);
}

static DecodeStatus
DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW96, Decoder);
}

static DecodeStatus
DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW128, Decoder);
}

static DecodeStatus
DecodeAVLdSt_160RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW160,
                                  Decoder);
}

#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)

DECODE_SDWA(Src32)
DECODE_SDWA(Src16)
DECODE_SDWA(VopcDst)

#include "AMDGPUGenDisassemblerTables.inc"

//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

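// Consume sizeof(T) little-endian bytes from the front of Bytes and return
// them as a value of type T.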
template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
  assert(Bytes.size() >= sizeof(T));
  const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data());
  Bytes = Bytes.slice(sizeof(T));
  return Res;
}

static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
  assert(Bytes.size() >= 12);
  uint64_t Lo = support::endian::read<uint64_t, support::endianness::little>(
      Bytes.data());
  Bytes = Bytes.slice(8);
  uint64_t Hi = support::endian::read<uint32_t, support::endianness::little>(
      Bytes.data());
  Bytes = Bytes.slice(4);
  return DecoderUInt128(Lo, Hi);
}

// The disassembler is greedy, so we need to check the FI operand value to
// avoid parsing a dpp8 instruction when the correct literal is not set. For
// dpp16, the autogenerated decoder checks the dpp literal.
static bool isValidDPP8(const MCInst &MI) {
  using namespace llvm::AMDGPU::DPP;
  int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
  assert(FiIdx != -1);
  if ((unsigned)FiIdx >= MI.getNumOperands())
    return false;
  unsigned Fi = MI.getOperand(FiIdx).getImm();
  return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
}

DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                ArrayRef<uint8_t> Bytes_,
                                                uint64_t Address,
                                                raw_ostream &CS) const {
  CommentStream = &CS;
  bool IsSDWA = false;

  unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
  Bytes = Bytes_.slice(0, MaxInstBytesNum);

  DecodeStatus Res = MCDisassembler::Fail;
  do {
    // ToDo: better to switch encoding length using some bit predicate
    // but it is unknown yet, so try all we can

    // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
    // encodings
    if (isGFX11Plus() && Bytes.size() >= 12) {
      DecoderUInt128 DecW = eat12Bytes(Bytes);
      Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear
      Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address);
      if (Res) {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
          convertVOP3PDPPInst(MI);
        else if (AMDGPU::isVOPC64DPP(MI.getOpcode()))
          convertVOPCDPPInst(MI); // Special VOP3 case
        else {
          assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
          convertVOP3DPPInst(MI); // Regular VOP3 case
        }
        break;
      }
      Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address);
      if (Res)
        break;
    }
    // Reinitialize Bytes
    Bytes = Bytes_.slice(0, MaxInstBytesNum);

    if (Bytes.size() >= 8) {
      const uint64_t QW = eatBytes<uint64_t>(Bytes);

      if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
        Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address);
        if (Res) {
          if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
              == -1)
            break;
          if (convertDPP8Inst(MI) == MCDisassembler::Success)
            break;
          MI = MCInst(); // clear
        }
      }

      Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
      if (Res) break;

      Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address);
      if (Res) {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
          convertVOPCDPPInst(MI);
        break;
      }

      Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
      if (Res) { IsSDWA = true; break; }

      Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
      if (Res) { IsSDWA = true; break; }

      Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address);
      if (Res) { IsSDWA = true; break; }

      if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
        Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
        if (Res)
          break;
      }

      // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
      // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
      // table first so we print the correct name.
      if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
        Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
        if (Res)
          break;
      }
    }

    // Reinitialize Bytes as DPP64 could have eaten too much
    Bytes = Bytes_.slice(0, MaxInstBytesNum);

    // Try decode 32-bit instruction
    if (Bytes.size() < 4) break;
    const uint32_t DW = eatBytes<uint32_t>(Bytes);
    Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
    if (Res) break;

    if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
      Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address);
      if (Res)
        break;
    }

    if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
      Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
      if (Res) break;
    }

    Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address);
    if (Res) break;

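    // Try to decode a 64-bit instruction by appending the next dword to the
    // dword already consumed above.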
    if (Bytes.size() < 4) break;
    const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;

    if (STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts]) {
      Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address);
      if (Res)
        break;
    }

    if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
      Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address);
      if (Res)
        break;
    }

    Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);
    if (Res)
      break;

    Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address);
  } while (false);

  if (Res && AMDGPU::isMAC(MI.getOpcode())) {
    // Insert dummy unused src2_modifiers.
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src2_modifiers);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
          (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
    int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::cpol);
    if (CPolPos != -1) {
      unsigned CPol =
          (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
              AMDGPU::CPol::GLC : 0;
      if (MI.getNumOperands() <= (unsigned)CPolPos) {
        insertNamedMCOperand(MI, MCOperand::createImm(CPol),
                             AMDGPU::OpName::cpol);
      } else if (CPol) {
        MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
      }
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
             (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) {
    // GFX90A lost TFE, its place is occupied by ACC.
    int TFEOpIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
    if (TFEOpIdx != -1) {
      auto TFEIter = MI.begin();
      std::advance(TFEIter, TFEOpIdx);
      MI.insert(TFEIter, MCOperand::createImm(0));
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
    int SWZOpIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (SWZOpIdx != -1) {
      auto SWZIter = MI.begin();
      std::advance(SWZIter, SWZOpIdx);
      MI.insert(SWZIter, MCOperand::createImm(0));
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
    int RsrcIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
    unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
    if (VAddr0Idx >= 0 && NSAArgs > 0) {
      unsigned NSAWords = (NSAArgs + 3) / 4;
      if (Bytes.size() < 4 * NSAWords) {
        Res = MCDisassembler::Fail;
      } else {
        for (unsigned i = 0; i < NSAArgs; ++i) {
          const unsigned VAddrIdx = VAddr0Idx + 1 + i;
          auto VAddrRCID =
              MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
          MI.insert(MI.begin() + VAddrIdx,
                    createRegOperand(VAddrRCID, Bytes[i]));
        }
        Bytes = Bytes.slice(4 * NSAWords);
      }
    }

    if (Res)
      Res = convertMIMGInst(MI);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
    Res = convertEXPInst(MI);

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
    Res = convertVINTERPInst(MI);

  if (Res && IsSDWA)
    Res = convertSDWAInst(MI);

  int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::vdst_in);
  if (VDstIn_Idx != -1) {
    int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
                           MCOI::OperandConstraint::TIED_TO);
    if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
         !MI.getOperand(VDstIn_Idx).isReg() ||
         MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
      if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
        MI.erase(&MI.getOperand(VDstIn_Idx));
      insertNamedMCOperand(MI,
        MCOperand::createReg(MI.getOperand(Tied).getReg()),
        AMDGPU::OpName::vdst_in);
    }
  }

  int ImmLitIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
  bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
  if (Res && ImmLitIdx != -1 && !IsSOPK)
    Res = convertFMAanyK(MI, ImmLitIdx);

686   // (unless there are fewer bytes left)
687   Size = Res ? (MaxInstBytesNum - Bytes.size())
688              : std::min((size_t)4, Bytes_.size());
689   return Res;
690 }
691 
692 DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
693   if (STI.getFeatureBits()[AMDGPU::FeatureGFX11]) {
694     // The MCInst still has these fields even though they are no longer encoded
695     // in the GFX11 instruction.
696     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
697     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
698   }
699   return MCDisassembler::Success;
700 }
701 
702 DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
703   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
704       MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
705       MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
706       MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) {
707     // The MCInst has this field that is not directly encoded in the
708     // instruction.
709     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
710   }
711   return MCDisassembler::Success;
712 }
713 
714 DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
715   if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
716       STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
717     if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
718       // VOPC - insert clamp
719       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
720   } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
721     int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
722     if (SDst != -1) {
723       // VOPC - insert VCC register as sdst
724       insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC),
725                            AMDGPU::OpName::sdst);
726     } else {
727       // VOP1/2 - insert omod if present in instruction
728       insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
729     }
730   }
731   return MCDisassembler::Success;
732 }
733 
734 struct VOPModifiers {
735   unsigned OpSel = 0;
736   unsigned OpSelHi = 0;
737   unsigned NegLo = 0;
738   unsigned NegHi = 0;
739 };
740 
741 // Reconstruct values of VOP3/VOP3P operands such as op_sel.
742 // Note that these values do not affect disassembler output,
743 // so this is only necessary for consistency with src_modifiers.
744 static VOPModifiers collectVOPModifiers(const MCInst &MI,
745                                         bool IsVOP3P = false) {
746   VOPModifiers Modifiers;
747   unsigned Opc = MI.getOpcode();
748   const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
749                         AMDGPU::OpName::src1_modifiers,
750                         AMDGPU::OpName::src2_modifiers};
751   for (int J = 0; J < 3; ++J) {
752     int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
753     if (OpIdx == -1)
754       continue;
755 
756     unsigned Val = MI.getOperand(OpIdx).getImm();
757 
758     Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
759     if (IsVOP3P) {
760       Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
761       Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
762       Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
763     } else if (J == 0) {
764       Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
765     }
766   }
767 
768   return Modifiers;
769 }
770 
771 // MAC opcodes have special old and src2 operands.
772 // src2 is tied to dst, while old is not tied (but assumed to be).
773 bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
774   constexpr int DST_IDX = 0;
775   auto Opcode = MI.getOpcode();
776   const auto &Desc = MCII->get(Opcode);
777   auto OldIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::old);
778 
779   if (OldIdx != -1 && Desc.getOperandConstraint(
780                           OldIdx, MCOI::OperandConstraint::TIED_TO) == -1) {
781     assert(AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2));
782     assert(Desc.getOperandConstraint(
783                AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2),
784                MCOI::OperandConstraint::TIED_TO) == DST_IDX);
785     (void)DST_IDX;
786     return true;
787   }
788 
789   return false;
790 }
791 
792 // Create dummy old operand and insert dummy unused src2_modifiers
793 void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
794   assert(MI.getNumOperands() + 1 < MCII->get(MI.getOpcode()).getNumOperands());
795   insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
796   insertNamedMCOperand(MI, MCOperand::createImm(0),
797                        AMDGPU::OpName::src2_modifiers);
798 }
799 
800 // We must check FI == literal to reject not genuine dpp8 insts, and we must
801 // first add optional MI operands to check FI
802 DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
803   unsigned Opc = MI.getOpcode();
804   if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
805     convertVOP3PDPPInst(MI);
806   } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
807              AMDGPU::isVOPC64DPP(Opc)) {
808     convertVOPCDPPInst(MI);
809   } else {
810     if (isMacDPP(MI))
811       convertMacDPPInst(MI);
812 
813     unsigned DescNumOps = MCII->get(Opc).getNumOperands();
814     if (MI.getNumOperands() < DescNumOps &&
815         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
816       auto Mods = collectVOPModifiers(MI);
817       insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
818                            AMDGPU::OpName::op_sel);
819     } else {
820       // Insert dummy unused src modifiers.
821       if (MI.getNumOperands() < DescNumOps &&
822           AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
823         insertNamedMCOperand(MI, MCOperand::createImm(0),
824                              AMDGPU::OpName::src0_modifiers);
825 
826       if (MI.getNumOperands() < DescNumOps &&
827           AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
828         insertNamedMCOperand(MI, MCOperand::createImm(0),
829                              AMDGPU::OpName::src1_modifiers);
830     }
831   }
832   return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
833 }
834 
835 DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
836   if (isMacDPP(MI))
837     convertMacDPPInst(MI);
838 
839   unsigned Opc = MI.getOpcode();
840   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
841   if (MI.getNumOperands() < DescNumOps &&
842       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
843     auto Mods = collectVOPModifiers(MI);
844     insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
845                          AMDGPU::OpName::op_sel);
846   }
847   return MCDisassembler::Success;
848 }
849 
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show the address as
// if it had one dword, which may not really be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
  int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::vdst);

  int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdata);
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::dmask);

  int TFEIdx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::tfe);
  int D16Idx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::d16);

  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);

  assert(VDataIdx != -1);
  if (BaseOpcode->BVH) {
    // Add A16 operand for intersect_ray instructions
    if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::a16))
      addOperand(MI, MCOperand::createImm(1));
    return MCDisassembler::Success;
  }

  bool IsAtomic = (VDstIdx != -1);
  bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
  bool IsNSA = false;
  unsigned AddrSize = Info->VAddrDwords;

  if (isGFX10Plus()) {
    unsigned DimIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
    int A16Idx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
    const AMDGPU::MIMGDimInfo *Dim =
        AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
    const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());

    AddrSize =
        AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));

    IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
            Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA;
    if (!IsNSA) {
      if (AddrSize > 12)
        AddrSize = 16;
    } else {
      if (AddrSize > Info->VAddrDwords) {
        // The NSA encoding does not contain enough operands for the combination
        // of base opcode / dimension. Should this be an error?
        return MCDisassembler::Success;
      }
    }
  }

  unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
  unsigned DstSize = IsGather4 ? 4 : std::max(llvm::popcount(DMask), 1);

  bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
  if (D16 && AMDGPU::hasPackedD16(STI)) {
    DstSize = (DstSize + 1) / 2;
  }

  if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
    DstSize += 1;

  if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
    return MCDisassembler::Success;

  int NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
  if (NewOpcode == -1)
    return MCDisassembler::Success;

  // Widen the register to the correct number of enabled channels.
  unsigned NewVdata = AMDGPU::NoRegister;
  if (DstSize != Info->VDataDwords) {
    auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass;

    // Get first subregister of VData
    unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
    unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
    Vdata0 = (VdataSub0 != 0) ? VdataSub0 : Vdata0;

    NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
                                       &MRI.getRegClass(DataRCID));
    if (NewVdata == AMDGPU::NoRegister) {
      // It's possible to encode this such that the low register + enabled
      // components exceeds the register count.
      return MCDisassembler::Success;
    }
  }

  // If not using NSA on GFX10+, widen address register to correct size.
  unsigned NewVAddr0 = AMDGPU::NoRegister;
  if (isGFX10Plus() && !IsNSA && AddrSize != Info->VAddrDwords) {
    unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg();
    unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0);
    VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0;

    auto AddrRCID = MCII->get(NewOpcode).operands()[VAddr0Idx].RegClass;
    NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0,
                                        &MRI.getRegClass(AddrRCID));
    if (NewVAddr0 == AMDGPU::NoRegister)
      return MCDisassembler::Success;
  }

  MI.setOpcode(NewOpcode);

  if (NewVdata != AMDGPU::NoRegister) {
    MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);

    if (IsAtomic) {
      // Atomic operations have an additional operand (a copy of data)
      MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
    }
  }

  if (NewVAddr0 != AMDGPU::NoRegister) {
    MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0);
  } else if (IsNSA) {
    assert(AddrSize <= Info->VAddrDwords);
    MI.erase(MI.begin() + VAddr0Idx + AddrSize,
             MI.begin() + VAddr0Idx + Info->VAddrDwords);
  }

  return MCDisassembler::Success;
}

// Opsel and neg bits are used in src_modifiers and standalone operands. The
// autogenerated decoder only adds them to src_modifiers, so manually add the
// bits to the other operands.
DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
  auto Mods = collectVOPModifiers(MI, true);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                         AMDGPU::OpName::op_sel);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel_hi))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
                         AMDGPU::OpName::op_sel_hi);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_lo))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
                         AMDGPU::OpName::neg_lo);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
                         AMDGPU::OpName::neg_hi);

  return MCDisassembler::Success;
}

// Create dummy old operand and insert optional operands
DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::old))
    insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src0_modifiers);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src1_modifiers);
  return MCDisassembler::Success;
}

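// For instructions that carry a mandatory literal (e.g. fmaak/fmamk),
// propagate the decoded literal into every deferred-literal operand that
// still holds the LITERAL_CONST sentinel.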
DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
                                                int ImmLitIdx) const {
  assert(HasLiteral && "Should have decoded a literal");
  const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
  unsigned DescNumOps = Desc.getNumOperands();
  insertNamedMCOperand(MI, MCOperand::createImm(Literal),
                       AMDGPU::OpName::immDeferred);
  assert(DescNumOps == MI.getNumOperands());
  for (unsigned I = 0; I < DescNumOps; ++I) {
    auto &Op = MI.getOperand(I);
    auto OpType = Desc.operands()[I].OperandType;
    bool IsDeferredOp = (OpType == AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED ||
                         OpType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED);
    if (Op.isImm() && Op.getImm() == AMDGPU::EncValues::LITERAL_CONST &&
        IsDeferredOp)
      Op.setImm(Literal);
  }
  return MCDisassembler::Success;
}

const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
  return getContext().getRegisterInfo()->
    getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
}

inline
MCOperand AMDGPUDisassembler::errOperand(unsigned V,
                                         const Twine& ErrMsg) const {
  *CommentStream << "Error: " + ErrMsg;

  // ToDo: add support for error operands to MCInst.h
  // return MCOperand::createError(V);
  return MCOperand();
}

inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
  return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
}

inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
                                               unsigned Val) const {
  const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
  if (Val >= RegCl.getNumRegs())
    return errOperand(Val, Twine(getRegClassName(RegClassID)) +
                           ": unknown register " + Twine(Val));
  return createRegOperand(RegCl.getRegister(Val));
}

inline
MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
                                                unsigned Val) const {
  // ToDo: SI/CI have 104 SGPRs, VI - 102
  // Valery: here we accept as much as we can; let the assembler sort it out
  int shift = 0;
  switch (SRegClassID) {
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::TTMP_32RegClassID:
    break;
  case AMDGPU::SGPR_64RegClassID:
  case AMDGPU::TTMP_64RegClassID:
    shift = 1;
    break;
  case AMDGPU::SGPR_128RegClassID:
  case AMDGPU::TTMP_128RegClassID:
  // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  case AMDGPU::SGPR_256RegClassID:
  case AMDGPU::TTMP_256RegClassID:
  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  case AMDGPU::SGPR_288RegClassID:
  case AMDGPU::TTMP_288RegClassID:
  case AMDGPU::SGPR_320RegClassID:
  case AMDGPU::TTMP_320RegClassID:
  case AMDGPU::SGPR_352RegClassID:
  case AMDGPU::TTMP_352RegClassID:
  case AMDGPU::SGPR_384RegClassID:
  case AMDGPU::TTMP_384RegClassID:
  case AMDGPU::SGPR_512RegClassID:
  case AMDGPU::TTMP_512RegClassID:
    shift = 2;
    break;
  // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  default:
    llvm_unreachable("unhandled register class");
  }

  if (Val % (1 << shift)) {
    *CommentStream << "Warning: " << getRegClassName(SRegClassID)
                   << ": scalar reg isn't aligned " << Val;
  }

  return createRegOperand(SRegClassID, Val >> shift);
}

// Decode literals for instructions which always have a literal in the
// encoding.
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
  if (HasLiteral) {
    assert(
        AMDGPU::hasVOPD(STI) &&
        "Should only decode multiple kimm with VOPD, check VSrc operand types");
    if (Literal != Val)
      return errOperand(Val, "More than one unique literal is illegal");
  }
  HasLiteral = true;
  Literal = Val;
  return MCOperand::createImm(Literal);
}

MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
  // For now all literal constants are supposed to be unsigned integers
  // ToDo: deal with signed/unsigned 64-bit integer constants
  // ToDo: deal with float/double constants
  if (!HasLiteral) {
    if (Bytes.size() < 4) {
      return errOperand(0, "cannot read literal, inst bytes left " +
                        Twine(Bytes.size()));
    }
    HasLiteral = true;
    Literal = eatBytes<uint32_t>(Bytes);
  }
  return MCOperand::createImm(Literal);
}

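// Inline integer encodings map onto 0..64 followed by -1..-16; see
// AMDGPU::EncValues for the underlying encoding ranges.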
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
  using namespace AMDGPU::EncValues;

  assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
  return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
    (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
    (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
      // Cast prevents negative overflow.
}

static int64_t getInlineImmVal32(unsigned Imm) {
  switch (Imm) {
  case 240:
    return FloatToBits(0.5f);
  case 241:
    return FloatToBits(-0.5f);
  case 242:
    return FloatToBits(1.0f);
  case 243:
    return FloatToBits(-1.0f);
  case 244:
    return FloatToBits(2.0f);
  case 245:
    return FloatToBits(-2.0f);
  case 246:
    return FloatToBits(4.0f);
  case 247:
    return FloatToBits(-4.0f);
  case 248: // 1 / (2 * PI)
    return 0x3e22f983;
  default:
    llvm_unreachable("invalid fp inline imm");
  }
}

static int64_t getInlineImmVal64(unsigned Imm) {
  switch (Imm) {
  case 240:
    return DoubleToBits(0.5);
  case 241:
    return DoubleToBits(-0.5);
  case 242:
    return DoubleToBits(1.0);
  case 243:
    return DoubleToBits(-1.0);
  case 244:
    return DoubleToBits(2.0);
  case 245:
    return DoubleToBits(-2.0);
  case 246:
    return DoubleToBits(4.0);
  case 247:
    return DoubleToBits(-4.0);
  case 248: // 1 / (2 * PI)
    return 0x3fc45f306dc9c882;
  default:
    llvm_unreachable("invalid fp inline imm");
  }
}

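// The 16-bit inline constants are the IEEE half-precision bit patterns of the
// same values as in the 32-bit table above.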
static int64_t getInlineImmVal16(unsigned Imm) {
  switch (Imm) {
  case 240:
    return 0x3800;
  case 241:
    return 0xB800;
  case 242:
    return 0x3C00;
  case 243:
    return 0xBC00;
  case 244:
    return 0x4000;
  case 245:
    return 0xC000;
  case 246:
    return 0x4400;
  case 247:
    return 0xC400;
  case 248: // 1 / (2 * PI)
    return 0x3118;
  default:
    llvm_unreachable("invalid fp inline imm");
  }
}

MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm) {
  assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
      && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);

  // ToDo: case 248: 1/(2*PI) - is allowed only on VI
  // ImmWidth 0 is a default case where the operand should not allow
  // immediates. The Imm value is still decoded into a 32-bit immediate
  // operand; the inst printer will use it to print a verbose error message.
  switch (ImmWidth) {
  case 0:
  case 32:
    return MCOperand::createImm(getInlineImmVal32(Imm));
  case 64:
    return MCOperand::createImm(getInlineImmVal64(Imm));
  case 16:
    return MCOperand::createImm(getInlineImmVal16(Imm));
  default:
    llvm_unreachable("implement me");
  }
}

1274 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
1275   using namespace AMDGPU;
1276 
1277   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1278   switch (Width) {
1279   default: // fall
1280   case OPW32:
1281   case OPW16:
1282   case OPWV216:
1283     return VGPR_32RegClassID;
1284   case OPW64:
1285   case OPWV232: return VReg_64RegClassID;
1286   case OPW96: return VReg_96RegClassID;
1287   case OPW128: return VReg_128RegClassID;
1288   case OPW160: return VReg_160RegClassID;
1289   case OPW256: return VReg_256RegClassID;
1290   case OPW288: return VReg_288RegClassID;
1291   case OPW320: return VReg_320RegClassID;
1292   case OPW352: return VReg_352RegClassID;
1293   case OPW384: return VReg_384RegClassID;
1294   case OPW512: return VReg_512RegClassID;
1295   case OPW1024: return VReg_1024RegClassID;
1296   }
1297 }
1298 
1299 unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
1300   using namespace AMDGPU;
1301 
1302   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1303   switch (Width) {
1304   default: // fall
1305   case OPW32:
1306   case OPW16:
1307   case OPWV216:
1308     return AGPR_32RegClassID;
1309   case OPW64:
1310   case OPWV232: return AReg_64RegClassID;
1311   case OPW96: return AReg_96RegClassID;
1312   case OPW128: return AReg_128RegClassID;
1313   case OPW160: return AReg_160RegClassID;
1314   case OPW256: return AReg_256RegClassID;
1315   case OPW288: return AReg_288RegClassID;
1316   case OPW320: return AReg_320RegClassID;
1317   case OPW352: return AReg_352RegClassID;
1318   case OPW384: return AReg_384RegClassID;
1319   case OPW512: return AReg_512RegClassID;
1320   case OPW1024: return AReg_1024RegClassID;
1321   }
1322 }
1323 
1324 
1325 unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
1326   using namespace AMDGPU;
1327 
1328   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1329   switch (Width) {
1330   default: // fall
1331   case OPW32:
1332   case OPW16:
1333   case OPWV216:
1334     return SGPR_32RegClassID;
1335   case OPW64:
1336   case OPWV232: return SGPR_64RegClassID;
1337   case OPW96: return SGPR_96RegClassID;
1338   case OPW128: return SGPR_128RegClassID;
1339   case OPW160: return SGPR_160RegClassID;
1340   case OPW256: return SGPR_256RegClassID;
1341   case OPW288: return SGPR_288RegClassID;
1342   case OPW320: return SGPR_320RegClassID;
1343   case OPW352: return SGPR_352RegClassID;
1344   case OPW384: return SGPR_384RegClassID;
1345   case OPW512: return SGPR_512RegClassID;
1346   }
1347 }
1348 
1349 unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
1350   using namespace AMDGPU;
1351 
1352   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1353   switch (Width) {
1354   default: // fall
1355   case OPW32:
1356   case OPW16:
1357   case OPWV216:
1358     return TTMP_32RegClassID;
1359   case OPW64:
1360   case OPWV232: return TTMP_64RegClassID;
1361   case OPW128: return TTMP_128RegClassID;
1362   case OPW256: return TTMP_256RegClassID;
1363   case OPW288: return TTMP_288RegClassID;
1364   case OPW320: return TTMP_320RegClassID;
1365   case OPW352: return TTMP_352RegClassID;
1366   case OPW384: return TTMP_384RegClassID;
1367   case OPW512: return TTMP_512RegClassID;
1368   }
1369 }

int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
  using namespace AMDGPU::EncValues;

  unsigned TTmpMin = isGFX9Plus() ? TTMP_GFX9PLUS_MIN : TTMP_VI_MIN;
  unsigned TTmpMax = isGFX9Plus() ? TTMP_GFX9PLUS_MAX : TTMP_VI_MAX;

  return (TTmpMin <= Val && Val <= TTmpMax) ? Val - TTmpMin : -1;
}
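
// Illustrative example (assuming the GFX9+ ttmp range starts at source
// encoding 108): getTTmpIdx(110) returns 2, i.e. the operand is ttmp2, while
// any value outside the ttmp window returns -1 and the caller falls through
// to the other decode paths.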

MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
                                          bool MandatoryLiteral,
                                          unsigned ImmWidth) const {
  using namespace AMDGPU::EncValues;

  assert(Val < 1024); // enum10

  bool IsAGPR = Val & 512;
  Val &= 511;

  if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
    return createRegOperand(IsAGPR ? getAgprClassId(Width)
                                   : getVgprClassId(Width), Val - VGPR_MIN);
  }
  if (Val <= SGPR_MAX) {
    // "SGPR_MIN <= Val" is always true and would trigger a compiler warning.
    static_assert(SGPR_MIN == 0);
    return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
  }

  int TTmpIdx = getTTmpIdx(Val);
  if (TTmpIdx >= 0) {
    return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
  }

  if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
    return decodeIntImmed(Val);

  if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
    return decodeFPImmed(ImmWidth, Val);

  if (Val == LITERAL_CONST) {
    if (MandatoryLiteral)
      // Keep a sentinel value for deferred setting.
      return MCOperand::createImm(LITERAL_CONST);
    return decodeLiteralConstant();
  }

  switch (Width) {
  case OPW32:
  case OPW16:
  case OPWV216:
    return decodeSpecialReg32(Val);
  case OPW64:
  case OPWV232:
    return decodeSpecialReg64(Val);
  default:
    llvm_unreachable("unexpected immediate type");
  }
}
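
// Illustrative decode walk-through for decodeSrcOp with Width = OPW32
// (assuming the usual SrcOp encoding ranges: SGPRs from 0, VGPRs from
// VGPR_MIN = 256, inline constants above the SGPR range, LITERAL_CONST =
// 255):
//   Val = 3   -> s3 (scalar path, Val <= SGPR_MAX)
//   Val = 258 -> v2 (vector path, Val - VGPR_MIN = 2)
//   Val = 770 -> a2 (bit 9 marks an AGPR; 770 & 511 = 258)
//   Val = 255 -> a 32-bit literal following the instruction, or a sentinel
//                immediate when MandatoryLiteral defers reading it.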

// Bit 0 of DstY isn't stored in the instruction, because it's always the
// opposite of bit 0 of DstX.
MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
                                               unsigned Val) const {
  int VDstXInd =
      AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
  assert(VDstXInd != -1);
  assert(Inst.getOperand(VDstXInd).isReg());
  unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
  Val |= ~XDstReg & 1;
  auto Width = llvm::AMDGPUDisassembler::OPW32;
  return createRegOperand(getVgprClassId(Width), Val);
}
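
// Illustrative example: if vdstX decoded to v2 (encoding value 2, bit 0
// clear), then ~XDstReg & 1 == 1, so bit 0 of Val is forced on and vdstY
// becomes an odd VGPR; had vdstX been odd, vdstY's bit 0 would stay clear.
// The two VOPD destinations therefore always differ in parity (which, on
// these targets, corresponds to different VGPR banks).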

MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
  using namespace AMDGPU;

  switch (Val) {
  // clang-format off
  case 102: return createRegOperand(FLAT_SCR_LO);
  case 103: return createRegOperand(FLAT_SCR_HI);
  case 104: return createRegOperand(XNACK_MASK_LO);
  case 105: return createRegOperand(XNACK_MASK_HI);
  case 106: return createRegOperand(VCC_LO);
  case 107: return createRegOperand(VCC_HI);
  case 108: return createRegOperand(TBA_LO);
  case 109: return createRegOperand(TBA_HI);
  case 110: return createRegOperand(TMA_LO);
  case 111: return createRegOperand(TMA_HI);
  case 124:
    return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
  case 125:
    return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
  case 126: return createRegOperand(EXEC_LO);
  case 127: return createRegOperand(EXEC_HI);
  case 235: return createRegOperand(SRC_SHARED_BASE_LO);
  case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
  case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
  case 238: return createRegOperand(SRC_PRIVATE_LIMIT_LO);
  case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
  case 251: return createRegOperand(SRC_VCCZ);
  case 252: return createRegOperand(SRC_EXECZ);
  case 253: return createRegOperand(SRC_SCC);
  case 254: return createRegOperand(LDS_DIRECT);
  default: break;
    // clang-format on
  }
  return errOperand(Val, "unknown operand encoding " + Twine(Val));
}
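
// Note, derived from the switch above and the 64-bit variant below: encodings
// 124 and 125 trade places on GFX11+, where 124 selects the null register and
// 125 selects M0; pre-GFX11 targets use the opposite assignment. The 64-bit
// decoder accepts only the null-register side of the pair, since M0 is a
// 32-bit register.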

MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
  using namespace AMDGPU;

  switch (Val) {
  case 102: return createRegOperand(FLAT_SCR);
  case 104: return createRegOperand(XNACK_MASK);
  case 106: return createRegOperand(VCC);
  case 108: return createRegOperand(TBA);
  case 110: return createRegOperand(TMA);
  case 124:
    if (isGFX11Plus())
      return createRegOperand(SGPR_NULL);
    break;
  case 125:
    if (!isGFX11Plus())
      return createRegOperand(SGPR_NULL);
    break;
  case 126: return createRegOperand(EXEC);
  case 235: return createRegOperand(SRC_SHARED_BASE);
  case 236: return createRegOperand(SRC_SHARED_LIMIT);
  case 237: return createRegOperand(SRC_PRIVATE_BASE);
  case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
  case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
  case 251: return createRegOperand(SRC_VCCZ);
  case 252: return createRegOperand(SRC_EXECZ);
  case 253: return createRegOperand(SRC_SCC);
  default: break;
  }
  return errOperand(Val, "unknown operand encoding " + Twine(Val));
}

MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
                                            const unsigned Val,
                                            unsigned ImmWidth) const {
  using namespace AMDGPU::SDWA;
  using namespace AMDGPU::EncValues;

  if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
      STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
    // The cast to int silences a tautological-compare warning: comparing the
    // unsigned Val against a lower bound of 0 is always true.
    if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
        Val <= SDWA9EncValues::SRC_VGPR_MAX) {
      return createRegOperand(getVgprClassId(Width),
                              Val - SDWA9EncValues::SRC_VGPR_MIN);
    }
    if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
        Val <= (isGFX10Plus() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
                              : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
      return createSRegOperand(getSgprClassId(Width),
                               Val - SDWA9EncValues::SRC_SGPR_MIN);
    }
    if (SDWA9EncValues::SRC_TTMP_MIN <= Val &&
        Val <= SDWA9EncValues::SRC_TTMP_MAX) {
      return createSRegOperand(getTtmpClassId(Width),
                               Val - SDWA9EncValues::SRC_TTMP_MIN);
    }

    const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;

    if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
      return decodeIntImmed(SVal);

    if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
      return decodeFPImmed(ImmWidth, SVal);

    return decodeSpecialReg32(SVal);
  }
  if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands])
    return createRegOperand(getVgprClassId(Width), Val);
  llvm_unreachable("unsupported target");
}
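
// How the SDWA9 source encoding above is laid out (a sketch inferred from the
// code rather than stated in it): VGPRs occupy the low range, and setting the
// scalar bit shifts the value up by SRC_SGPR_MIN. Subtracting SRC_SGPR_MIN
// therefore recovers a plain SrcOp-style value, which is why the
// inline-constant and special-register fallbacks all operate on SVal rather
// than Val.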

MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
  return decodeSDWASrc(OPW16, Val, 16);
}

MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
  return decodeSDWASrc(OPW32, Val, 32);
}

MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
  using namespace AMDGPU::SDWA;

  assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
          STI.getFeatureBits()[AMDGPU::FeatureGFX10]) &&
         "SDWAVopcDst should be present only on GFX9+");

  bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64];

  if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
    Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;

    int TTmpIdx = getTTmpIdx(Val);
    if (TTmpIdx >= 0) {
      auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
      return createSRegOperand(TTmpClsId, TTmpIdx);
    }
    if (Val > SGPR_MAX)
      return IsWave64 ? decodeSpecialReg64(Val) : decodeSpecialReg32(Val);
    return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
  }
  return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
}
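
// Illustrative behavior of decodeSDWAVopcDst: with VOPC_DST_VCC_MASK clear,
// the compare result goes to the implicit VCC (VCC_LO in wave32); with the
// mask bit set, the low bits name an explicit SGPR or ttmp destination, sized
// to the wavefront (a 64-bit pair in wave64, a single SGPR in wave32).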

MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
  return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64]
             ? decodeSrcOp(OPW64, Val)
             : decodeSrcOp(OPW32, Val);
}

bool AMDGPUDisassembler::isVI() const {
  return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}

bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }

bool AMDGPUDisassembler::isGFX90A() const {
  return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
}

bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }

bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }

bool AMDGPUDisassembler::isGFX10Plus() const {
  return AMDGPU::isGFX10Plus(STI);
}

bool AMDGPUDisassembler::isGFX11() const {
  return STI.getFeatureBits()[AMDGPU::FeatureGFX11];
}

bool AMDGPUDisassembler::isGFX11Plus() const {
  return AMDGPU::isGFX11Plus(STI);
}

bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
  return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
}

//===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling
//===----------------------------------------------------------------------===//
#define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
  do {                                                                         \
    KdStream << Indent << DIRECTIVE " "                                        \
             << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';           \
  } while (0)
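
// Illustrative expansion (FOO is a hypothetical field name): for a mask FOO
// with a companion FOO_SHIFT constant, PRINT_DIRECTIVE(".amdhsa_foo", FOO)
// expands to
//   KdStream << Indent << ".amdhsa_foo "
//            << ((FourByteBuffer & FOO) >> (FOO_SHIFT)) << '\n';
// where DIRECTIVE " " relies on string-literal concatenation, and the
// MASK##_SHIFT token paste is why every mask passed here must have a matching
// *_SHIFT definition in the amdhsa namespace.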

// NOLINTNEXTLINE(readability-identifier-naming)
MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
    uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
  using namespace amdhsa;
  StringRef Indent = "\t";

  // We cannot accurately recompute the number of VGPRs used from
  // GRANULATED_WORKITEM_VGPR_COUNT. All we need is for the reassembled binary
  // to encode the same GRANULATED_WORKITEM_VGPR_COUNT, so we simply compute
  // the inverse of what the assembler does.

  uint32_t GranulatedWorkitemVGPRCount =
      (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >>
      COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT;

  uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) *
                          AMDGPU::IsaInfo::getVGPREncodingGranule(&STI);

  KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
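
  // Worked example (assuming a VGPR encoding granule of 4, the common value
  // on pre-GFX10 targets): an encoded GranulatedWorkitemVGPRCount of 5 prints
  // ".amdhsa_next_free_vgpr 24", since (5 + 1) * 4 = 24; reassembling that
  // directive reproduces the same granulated count of 5.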

  // We cannot recover the values used to compute
  // GRANULATED_WAVEFRONT_SGPR_COUNT, so the original values of the following
  // directives cannot be reconstructed:
  // .amdhsa_reserve_vcc
  // .amdhsa_reserve_flat_scratch
  // .amdhsa_reserve_xnack_mask
  // They take their respective default values if not specified in the
  // assembly.
  //
  // GRANULATED_WAVEFRONT_SGPR_COUNT
  //    = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
  //
  // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
  // are set to 0. So while disassembling we consider that:
  //
  // GRANULATED_WAVEFRONT_SGPR_COUNT
  //    = f(NEXT_FREE_SGPR + 0 + 0 + 0)
  //
  // The disassembler cannot recover the original values of those 3 directives.

  uint32_t GranulatedWavefrontSGPRCount =
      (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >>
      COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT;

  if (isGFX10Plus() && GranulatedWavefrontSGPRCount)
    return MCDisassembler::Fail;

  uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
                          AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);

  KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
  if (!hasArchitectedFlatScratch())
    KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
  KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
  KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << '\n';
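
  // Worked example (assuming an SGPR encoding granule of 8): a granulated
  // count of 3 prints ".amdhsa_next_free_sgpr 32", since (3 + 1) * 8 = 32,
  // with the three reserve directives pinned to 0 as explained above.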

  if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY)
    return MCDisassembler::Fail;

  PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
                  COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
  PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
                  COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
  PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
                  COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
  PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
                  COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);

  if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
    return MCDisassembler::Fail;

  PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);

  if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
    return MCDisassembler::Fail;

  PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);

  if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
    return MCDisassembler::Fail;

  if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
    return MCDisassembler::Fail;

  PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL);

  if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0)
    return MCDisassembler::Fail;

  if (isGFX10Plus()) {
    PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
                    COMPUTE_PGM_RSRC1_WGP_MODE);
    PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED);
    PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS);
  }
  return MCDisassembler::Success;
}

// NOLINTNEXTLINE(readability-identifier-naming)
MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
    uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
  using namespace amdhsa;
  StringRef Indent = "\t";
  if (hasArchitectedFlatScratch())
    PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
                    COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
  else
    PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
                    COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
  PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
                  COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);

  if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH)
    return MCDisassembler::Fail;

  if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY)
    return MCDisassembler::Fail;

  if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE)
    return MCDisassembler::Fail;

  PRINT_DIRECTIVE(
      ".amdhsa_exception_fp_ieee_invalid_op",
      COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
  PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
  PRINT_DIRECTIVE(
      ".amdhsa_exception_fp_ieee_div_zero",
      COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
  PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
  PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
  PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
  PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);

  if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0)
    return MCDisassembler::Fail;

  return MCDisassembler::Success;
}

#undef PRINT_DIRECTIVE

MCDisassembler::DecodeStatus
AMDGPUDisassembler::decodeKernelDescriptorDirective(
    DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
    raw_string_ostream &KdStream) const {
#define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
  do {                                                                         \
    KdStream << Indent << DIRECTIVE " "                                        \
             << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
  } while (0)

  uint16_t TwoByteBuffer = 0;
  uint32_t FourByteBuffer = 0;

  StringRef ReservedBytes;
  StringRef Indent = "\t";

  assert(Bytes.size() == 64);
  DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);

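  // Each case below consumes exactly the bytes of the field at its offset, so
  // after a Success the cursor sits at the start of the next field and the
  // caller's loop re-enters this switch with the next amdhsa::*_OFFSET value.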
  switch (Cursor.tell()) {
  case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
             << '\n';
    return MCDisassembler::Success;

  case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    KdStream << Indent << ".amdhsa_private_segment_fixed_size "
             << FourByteBuffer << '\n';
    return MCDisassembler::Success;

  case amdhsa::KERNARG_SIZE_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    KdStream << Indent << ".amdhsa_kernarg_size "
             << FourByteBuffer << '\n';
    return MCDisassembler::Success;

  case amdhsa::RESERVED0_OFFSET:
    // 4 reserved bytes, must be 0.
    ReservedBytes = DE.getBytes(Cursor, 4);
    for (int I = 0; I < 4; ++I) {
      if (ReservedBytes[I] != 0) {
        return MCDisassembler::Fail;
      }
    }
    return MCDisassembler::Success;

  case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
    // KERNEL_CODE_ENTRY_BYTE_OFFSET
    // So far no directive controls this for Code Object V3, so simply skip
    // these 8 bytes for disassembly.
    DE.skip(Cursor, 8);
    return MCDisassembler::Success;

  case amdhsa::RESERVED1_OFFSET:
    // 20 reserved bytes, must be 0.
    ReservedBytes = DE.getBytes(Cursor, 20);
    for (int I = 0; I < 20; ++I) {
      if (ReservedBytes[I] != 0) {
        return MCDisassembler::Fail;
      }
    }
    return MCDisassembler::Success;

  case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
    // COMPUTE_PGM_RSRC3
    //  - Only set on GFX10+; on GFX6-9 it must be 0.
    //  - Currently no directives directly control this.
    FourByteBuffer = DE.getU32(Cursor);
    if (!isGFX10Plus() && FourByteBuffer) {
      return MCDisassembler::Fail;
    }
    return MCDisassembler::Success;

  case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) ==
        MCDisassembler::Fail) {
      return MCDisassembler::Fail;
    }
    return MCDisassembler::Success;

  case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) ==
        MCDisassembler::Fail) {
      return MCDisassembler::Fail;
    }
    return MCDisassembler::Success;

  case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
    using namespace amdhsa;
    TwoByteBuffer = DE.getU16(Cursor);

    if (!hasArchitectedFlatScratch())
      PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
                      KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
    if (!hasArchitectedFlatScratch())
      PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
                      KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);

    if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
      return MCDisassembler::Fail;

    // Reserved for GFX9
    if (isGFX9() &&
        (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
      return MCDisassembler::Fail;
    } else if (isGFX10Plus()) {
      PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
                      KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
    }

    if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5)
      PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
                      KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);

    if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
      return MCDisassembler::Fail;

    return MCDisassembler::Success;

  case amdhsa::RESERVED2_OFFSET:
    // 6 bytes from here are reserved, must be 0.
    ReservedBytes = DE.getBytes(Cursor, 6);
    for (int I = 0; I < 6; ++I) {
      if (ReservedBytes[I] != 0)
        return MCDisassembler::Fail;
    }
    return MCDisassembler::Success;

  default:
    llvm_unreachable("Unhandled index. Case statements cover everything.");
    return MCDisassembler::Fail;
  }
#undef PRINT_DIRECTIVE
}

MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
    StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
  // CP microcode requires the kernel descriptor to be 64-byte aligned.
  if (Bytes.size() != 64 || KdAddress % 64 != 0)
    return MCDisassembler::Fail;

  std::string Kd;
  raw_string_ostream KdStream(Kd);
  KdStream << ".amdhsa_kernel " << KdName << '\n';

  DataExtractor::Cursor C(0);
  while (C && C.tell() < Bytes.size()) {
    MCDisassembler::DecodeStatus Status =
        decodeKernelDescriptorDirective(C, Bytes, KdStream);

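    // C only carries an error if a read ran past the end of the buffer; the
    // 64-byte size check above and the fixed field sizes consumed by
    // decodeKernelDescriptorDirective rule that out, so any error here is a
    // programming bug and cantFail() asserts on it.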
    cantFail(C.takeError());

    if (Status == MCDisassembler::Fail)
      return MCDisassembler::Fail;
  }
  KdStream << ".end_amdhsa_kernel\n";
  outs() << KdStream.str();
  return MCDisassembler::Success;
}

std::optional<MCDisassembler::DecodeStatus>
AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
                                  ArrayRef<uint8_t> Bytes, uint64_t Address,
                                  raw_ostream &CStream) const {
  // Right now only kernel descriptors receive target-specific handling here;
  // all other symbols are ignored.
  // TODO:
  // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
  // Object V2 and V3 when symbols are marked protected.

  // amd_kernel_code_t for Code Object V2.
  if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
    Size = 256;
    return MCDisassembler::Fail;
  }

  // Code Object V3 kernel descriptors.
  StringRef Name = Symbol.Name;
  if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) {
    Size = 64; // Size = 64 regardless of success or failure.
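    // Illustrative example: an STT_OBJECT symbol named "my_kernel.kd" is
    // rendered as a ".amdhsa_kernel my_kernel" directive block instead of
    // being disassembled as instructions.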
    return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
  }
  return std::nullopt;
}

//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//

// Try to find a symbol name for the specified label.
bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
    MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
    uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
    uint64_t /*OpSize*/, uint64_t /*InstSize*/) {

  if (!IsBranch) {
    return false;
  }

  auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
  if (!Symbols)
    return false;

  auto Result = llvm::find_if(*Symbols, [Value](const SymbolInfoTy &Val) {
    return Val.Addr == static_cast<uint64_t>(Value) &&
           Val.Type == ELF::STT_NOTYPE;
  });
  if (Result != Symbols->end()) {
    auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
    const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
    Inst.addOperand(MCOperand::createExpr(Add));
    return true;
  }
  // Add to the list of referenced addresses, so the caller can synthesize a
  // label.
  ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
  return false;
}

void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
                                                       int64_t Value,
                                                       uint64_t Address) {
  llvm_unreachable("unimplemented");
}

//===----------------------------------------------------------------------===//
// Initialization
//===----------------------------------------------------------------------===//

static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
                              LLVMOpInfoCallback /*GetOpInfo*/,
                              LLVMSymbolLookupCallback /*SymbolLookUp*/,
                              void *DisInfo,
                              MCContext *Ctx,
                              std::unique_ptr<MCRelocationInfo> &&RelInfo) {
  return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
}

static MCDisassembler *createAMDGPUDisassembler(const Target &T,
                                                const MCSubtargetInfo &STI,
                                                MCContext &Ctx) {
  return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
}

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
  TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
                                         createAMDGPUDisassembler);
  TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
                                       createAMDGPUSymbolizer);
}