//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// This file contains the definition of the AMDGPU ISA disassembler.
//
//===----------------------------------------------------------------------===//

// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?

#include "Disassembler/AMDGPUDisassembler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/DisassemblerTypes.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDecoderOps.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-disassembler"

#define SGPR_MAX                                                               \
  (isGFX10Plus() ? AMDGPU::EncValues::SGPR_MAX_GFX10                           \
                 : AMDGPU::EncValues::SGPR_MAX_SI)

using DecodeStatus = llvm::MCDisassembler::DecodeStatus;

AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
                                       MCContext &Ctx, MCInstrInfo const *MCII)
    : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
      MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)) {
  // ToDo: AMDGPUDisassembler supports only VI ISA.
  if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
    report_fatal_error("Disassembly not yet supported for subtarget");
}

inline static MCDisassembler::DecodeStatus
addOperand(MCInst &Inst, const MCOperand &Opnd) {
  Inst.addOperand(Opnd);
  return Opnd.isValid() ? MCDisassembler::Success : MCDisassembler::Fail;
}

static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
                                uint16_t NameIdx) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
  if (OpIdx != -1) {
    auto I = MI.begin();
    std::advance(I, OpIdx);
    MI.insert(I, Op);
  }
  return OpIdx;
}

static DecodeStatus decodeSOPPBrTarget(MCInst &Inst, unsigned Imm,
                                       uint64_t Addr,
                                       const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);

  // Our branches take a simm16, but we need two extra bits to account for the
  // factor of 4.
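  // The offset is relative to the instruction that follows the branch, hence
  // the "+ 4 + Addr" in the target computation below.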
  APInt SignedOffset(18, Imm * 4, true);
  int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();

  if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0))
    return MCDisassembler::Success;
  return addOperand(Inst, MCOperand::createImm(Imm));
}

static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
                                     const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  int64_t Offset;
  if (DAsm->isVI()) {         // VI supports 20-bit unsigned offsets.
    Offset = Imm & 0xFFFFF;
  } else {                    // GFX9+ supports 21-bit signed offsets.
    Offset = SignExtend64<21>(Imm);
  }
  return addOperand(Inst, MCOperand::createImm(Offset));
}

static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
                                  const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeBoolReg(Val));
}

#define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
  static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
                                        uint64_t /*Addr*/,                     \
                                        const MCDisassembler *Decoder) {       \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(Inst, DAsm->DecoderName(Imm));                           \
  }
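
// For example, DECODE_SDWA(Src32) near the end of this block expands to a
// static decodeSDWASrc32() wrapper that forwards Imm to
// AMDGPUDisassembler::decodeSDWASrc32().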

// Decoder for registers; decodes directly using RegClassID. Imm (8-bit) is
// the register number. Used by VGPR-only and AGPR-only operands.
#define DECODE_OPERAND_REG_8(RegClass)                                         \
  static DecodeStatus Decode##RegClass##RegisterClass(                         \
      MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,                           \
      const MCDisassembler *Decoder) {                                         \
    assert(Imm < (1 << 8) && "8-bit encoding");                                \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(                                                         \
        Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm));      \
  }

#define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral,         \
                     ImmWidth)                                                 \
  static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,      \
                           const MCDisassembler *Decoder) {                    \
    assert(Imm < (1 << EncSize) && #EncSize "-bit encoding");                  \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(Inst,                                                    \
                      DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm,   \
                                        MandatoryLiteral, ImmWidth));          \
  }

// Decoder for registers. Imm (7-bit) is the register number; uses decodeSrcOp
// to get the register class. Used by SGPR-only operands.
#define DECODE_OPERAND_REG_7(RegClass, OpWidth)                                \
  DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0)

// Decoder for registers. Imm (10-bit): Imm{7-0} is the register number,
// Imm{9} is acc (AGPR or VGPR), and Imm{8} should be 0 (see VOP3Pe_SMFMAC).
// Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp.
// Used by AV_ register classes (AGPR-or-VGPR-only register operands).
#define DECODE_OPERAND_REG_AV10(RegClass, OpWidth)                             \
  DECODE_SrcOp(Decode##RegClass##RegisterClass, 10, OpWidth,                   \
               Imm | AMDGPU::EncValues::IS_VGPR, false, 0)

// Decoder for Src (9-bit encoding) registers only.
#define DECODE_OPERAND_SRC_REG_9(RegClass, OpWidth)                            \
  DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm, false, 0)

// Decoder for Src (9-bit encoding) AGPRs, registers only. The register number
// is encoded in 9 bits; set Imm{9} to 1 (the acc bit) and decode using
// 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_A9(RegClass, OpWidth)                           \
  DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm | 512, false, 0)

// Decoder for 'enum10' from decodeSrcOp: Imm{0-8} is the 9-bit Src encoding
// and Imm{9} is acc. Registers only.
#define DECODE_SRC_OPERAND_REG_AV10(RegClass, OpWidth)                         \
  DECODE_SrcOp(decodeOperand_##RegClass, 10, OpWidth, Imm, false, 0)

// Decoder for RegisterOperands using the 9-bit Src encoding. The operand can
// be a register from RegClass or an immediate. Registers that don't belong to
// RegClass will still be decoded, and the InstPrinter will report a warning.
// An immediate will be decoded into a constant of size ImmWidth, which should
// match the width of the immediate used by the OperandType (important for
// floating-point types).
#define DECODE_OPERAND_SRC_REG_OR_IMM_9(RegClass, OpWidth, ImmWidth)           \
  DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm,      \
               false, ImmWidth)

// Decoder for Src (9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (the
// acc bit) and decode using 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth)          \
  DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth,           \
               Imm | 512, false, ImmWidth)

#define DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(RegClass, OpWidth, ImmWidth)  \
  DECODE_SrcOp(decodeOperand_##RegClass##_Deferred##_Imm##ImmWidth, 9,         \
               OpWidth, Imm, true, ImmWidth)

// Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
// when RegisterClass is used as an operand. Most often used for destination
// operands.

DECODE_OPERAND_REG_8(VGPR_32)
DECODE_OPERAND_REG_8(VGPR_32_Lo128)
DECODE_OPERAND_REG_8(VReg_64)
DECODE_OPERAND_REG_8(VReg_96)
DECODE_OPERAND_REG_8(VReg_128)
DECODE_OPERAND_REG_8(VReg_256)
DECODE_OPERAND_REG_8(VReg_288)
DECODE_OPERAND_REG_8(VReg_352)
DECODE_OPERAND_REG_8(VReg_384)
DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)

DECODE_OPERAND_REG_7(SReg_32, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
DECODE_OPERAND_REG_7(SReg_128, OPW128)
DECODE_OPERAND_REG_7(SReg_256, OPW256)
DECODE_OPERAND_REG_7(SReg_512, OPW512)

DECODE_OPERAND_REG_8(AGPR_32)
DECODE_OPERAND_REG_8(AReg_64)
DECODE_OPERAND_REG_8(AReg_128)
DECODE_OPERAND_REG_8(AReg_256)
DECODE_OPERAND_REG_8(AReg_512)
DECODE_OPERAND_REG_8(AReg_1024)

DECODE_OPERAND_REG_AV10(AVDst_128, OPW128)
DECODE_OPERAND_REG_AV10(AVDst_512, OPW512)

// Decoders for register-only source RegisterOperands that use the 9-bit Src
// encoding: 'decodeOperand_<RegClass>'.

DECODE_OPERAND_SRC_REG_9(VGPR_32, OPW32)
DECODE_OPERAND_SRC_REG_9(VReg_64, OPW64)
DECODE_OPERAND_SRC_REG_9(VReg_128, OPW128)
DECODE_OPERAND_SRC_REG_9(VReg_256, OPW256)
DECODE_OPERAND_SRC_REG_9(VRegOrLds_32, OPW32)

DECODE_OPERAND_SRC_REG_A9(AGPR_32, OPW32)

DECODE_SRC_OPERAND_REG_AV10(AV_32, OPW32)
DECODE_SRC_OPERAND_REG_AV10(AV_64, OPW64)
DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128)

// Decoders for register or immediate RegisterOperands that use the 9-bit Src
// encoding: 'decodeOperand_<RegClass>_Imm<ImmWidth>'.

DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(SReg_32, OPW32, 32)

static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
                                         uint64_t Addr,
                                         const MCDisassembler *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}

static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
                                          uint64_t Addr, const void *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val));
}

static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
                          const MCRegisterInfo *MRI) {
  if (OpIdx < 0)
    return false;

  const MCOperand &Op = Inst.getOperand(OpIdx);
  if (!Op.isReg())
    return false;

  unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
  auto Reg = Sub ? Sub : Op.getReg();
  return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
}

static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
                                             AMDGPUDisassembler::OpWidthTy Opw,
                                             const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  if (!DAsm->isGFX90A()) {
    Imm &= 511;
  } else {
    // If an atomic has both vdata and vdst, their register classes are tied.
    // The bit is decoded along with the vdst, the first operand. We need to
    // change the register class to AGPR if the vdst was an AGPR.
    // If a DS instruction has both data0 and data1, their register classes
    // are also tied.
    unsigned Opc = Inst.getOpcode();
    uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
    uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
                                                        : AMDGPU::OpName::vdata;
    const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
    int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
    if ((int)Inst.getNumOperands() == DataIdx) {
      int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (IsAGPROperand(Inst, DstIdx, MRI))
        Imm |= 512;
    }

    if (TSFlags & SIInstrFlags::DS) {
      int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
      if ((int)Inst.getNumOperands() == Data2Idx &&
          IsAGPROperand(Inst, DataIdx, MRI))
        Imm |= 512;
    }
  }
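  // OR in the IS_VGPR bit (256) so decodeSrcOp maps the value into the VGPR
  // range (or, when bit 512 was set above, the AGPR range).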
  return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
}

static DecodeStatus
DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW32, Decoder);
}

static DecodeStatus
DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW64, Decoder);
}

static DecodeStatus
DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW96, Decoder);
}

static DecodeStatus
DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm,
                                  AMDGPUDisassembler::OPW128, Decoder);
}

static DecodeStatus
DecodeAVLdSt_160RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW160,
                                  Decoder);
}

#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)

DECODE_SDWA(Src32)
DECODE_SDWA(Src16)
DECODE_SDWA(VopcDst)

#include "AMDGPUGenDisassemblerTables.inc"

//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

template <typename T> static inline T eatBytes(ArrayRef<uint8_t> &Bytes) {
  assert(Bytes.size() >= sizeof(T));
  const auto Res =
      support::endian::read<T, support::endianness::little>(Bytes.data());
  Bytes = Bytes.slice(sizeof(T));
  return Res;
}

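// Consume a 96-bit GFX11 encoding as a little-endian pair: the low 64 bits
// followed by the high 32 bits.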
static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
  assert(Bytes.size() >= 12);
  uint64_t Lo = support::endian::read<uint64_t, support::endianness::little>(
      Bytes.data());
  Bytes = Bytes.slice(8);
  uint64_t Hi = support::endian::read<uint32_t, support::endianness::little>(
      Bytes.data());
  Bytes = Bytes.slice(4);
  return DecoderUInt128(Lo, Hi);
}

// The disassembler is greedy, so we need to check the FI operand value to
// avoid parsing a dpp8 instruction when the correct literal is not set. For
// dpp16, the autogenerated decoder checks the dpp literal.
static bool isValidDPP8(const MCInst &MI) {
  using namespace llvm::AMDGPU::DPP;
  int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
  assert(FiIdx != -1);
  if ((unsigned)FiIdx >= MI.getNumOperands())
    return false;
  unsigned Fi = MI.getOperand(FiIdx).getImm();
  return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
}

DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                ArrayRef<uint8_t> Bytes_,
                                                uint64_t Address,
                                                raw_ostream &CS) const {
  bool IsSDWA = false;

  unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
  Bytes = Bytes_.slice(0, MaxInstBytesNum);

  DecodeStatus Res = MCDisassembler::Fail;
  do {
    // ToDo: it would be better to switch on encoding length using some bit
    // predicate, but that is not known yet, so try everything we can.

    // Try to decode DPP and SDWA first to solve the conflict with the VOP1
    // and VOP2 encodings.
    if (isGFX11Plus() && Bytes.size() >= 12) {
      DecoderUInt128 DecW = eat12Bytes(Bytes);
      Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear
      Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address, CS);
      if (Res) {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
          convertVOP3PDPPInst(MI);
        else if (AMDGPU::isVOPC64DPP(MI.getOpcode()))
          convertVOPCDPPInst(MI); // Special VOP3 case
        else {
          assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
          convertVOP3DPPInst(MI); // Regular VOP3 case
        }
        break;
      }
      Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
      if (Res)
        break;
    }
    // Reinitialize Bytes.
    Bytes = Bytes_.slice(0, MaxInstBytesNum);

    if (Bytes.size() >= 8) {
      const uint64_t QW = eatBytes<uint64_t>(Bytes);

      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
        Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
        if (Res) {
          if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
              == -1)
            break;
          if (convertDPP8Inst(MI) == MCDisassembler::Success)
            break;
          MI = MCInst(); // clear
        }
      }

      Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
      if (Res) break;

      Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address, CS);
      if (Res) {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
          convertVOPCDPPInst(MI);
        break;
      }

      Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address, CS);
      if (Res) { IsSDWA = true; break; }

      Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address, CS);
      if (Res) { IsSDWA = true; break; }

      Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address, CS);
      if (Res) { IsSDWA = true; break; }

      if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
        Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
        if (Res)
          break;
      }

      // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
      // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
      // table first so we print the correct name.
      if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
        Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
        if (Res)
          break;
      }
    }

    // Reinitialize Bytes, as DPP64 could have eaten too much.
    Bytes = Bytes_.slice(0, MaxInstBytesNum);

    // Try to decode a 32-bit instruction.
    if (Bytes.size() < 4) break;
    const uint32_t DW = eatBytes<uint32_t>(Bytes);
    Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
    if (Res) break;

    if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
      Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
      if (Res)
        break;
    }

    if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
      Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
      if (Res) break;
    }

    Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address, CS);
    if (Res) break;

    if (Bytes.size() < 4) break;
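    // The dword already consumed is the low half; read one more dword for the
    // high half of a 64-bit encoding.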
    const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;

    if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
      Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
      if (Res)
        break;
    }

    if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
      Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
      if (Res)
        break;
    }

    Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address, CS);
    if (Res)
      break;

    Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
  } while (false);

  if (Res && AMDGPU::isMAC(MI.getOpcode())) {
    // Insert dummy unused src2_modifiers.
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src2_modifiers);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
          (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
    int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::cpol);
    if (CPolPos != -1) {
      unsigned CPol =
          (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
              AMDGPU::CPol::GLC : 0;
      if (MI.getNumOperands() <= (unsigned)CPolPos) {
        insertNamedMCOperand(MI, MCOperand::createImm(CPol),
                             AMDGPU::OpName::cpol);
      } else if (CPol) {
        MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
      }
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
             (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
    // GFX90A lost TFE, its place is occupied by ACC.
    int TFEOpIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
    if (TFEOpIdx != -1) {
      auto TFEIter = MI.begin();
      std::advance(TFEIter, TFEOpIdx);
      MI.insert(TFEIter, MCOperand::createImm(0));
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
    int SWZOpIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (SWZOpIdx != -1) {
      auto SWZIter = MI.begin();
      std::advance(SWZIter, SWZOpIdx);
      MI.insert(SWZIter, MCOperand::createImm(0));
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
    int RsrcIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
    unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
    if (VAddr0Idx >= 0 && NSAArgs > 0) {
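      // Extra NSA addresses are encoded as one byte per VGPR, four per dword.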
      unsigned NSAWords = (NSAArgs + 3) / 4;
      if (Bytes.size() < 4 * NSAWords) {
        Res = MCDisassembler::Fail;
      } else {
        for (unsigned i = 0; i < NSAArgs; ++i) {
          const unsigned VAddrIdx = VAddr0Idx + 1 + i;
          auto VAddrRCID =
              MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
          MI.insert(MI.begin() + VAddrIdx,
                    createRegOperand(VAddrRCID, Bytes[i]));
        }
        Bytes = Bytes.slice(4 * NSAWords);
      }
    }

    if (Res)
      Res = convertMIMGInst(MI);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
    Res = convertEXPInst(MI);

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
    Res = convertVINTERPInst(MI);

  if (Res && IsSDWA)
    Res = convertSDWAInst(MI);

  int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::vdst_in);
  if (VDstIn_Idx != -1) {
    int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
                           MCOI::OperandConstraint::TIED_TO);
    if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
         !MI.getOperand(VDstIn_Idx).isReg() ||
         MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
      if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
        MI.erase(&MI.getOperand(VDstIn_Idx));
      insertNamedMCOperand(MI,
        MCOperand::createReg(MI.getOperand(Tied).getReg()),
        AMDGPU::OpName::vdst_in);
    }
  }

  int ImmLitIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
  bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
  if (Res && ImmLitIdx != -1 && !IsSOPK)
    Res = convertFMAanyK(MI, ImmLitIdx);

  // If the opcode was not recognized, assume a Size of 4 bytes (unless there
  // are fewer bytes left).
  Size = Res ? (MaxInstBytesNum - Bytes.size())
             : std::min((size_t)4, Bytes_.size());
  return Res;
}

DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
  if (STI.hasFeature(AMDGPU::FeatureGFX11)) {
    // The MCInst still has these fields even though they are no longer encoded
    // in the GFX11 instruction.
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
  }
  return MCDisassembler::Success;
}

DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
  if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) {
    // The MCInst has this field that is not directly encoded in the
    // instruction.
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
  }
  return MCDisassembler::Success;
}

DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
  if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
      STI.hasFeature(AMDGPU::FeatureGFX10)) {
    if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
      // VOPC - insert clamp
      insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
  } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
    int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
    if (SDst != -1) {
      // VOPC - insert VCC register as sdst
      insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC),
                           AMDGPU::OpName::sdst);
    } else {
      // VOP1/2 - insert omod if present in instruction
      insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
    }
  }
  return MCDisassembler::Success;
}

struct VOPModifiers {
  unsigned OpSel = 0;
  unsigned OpSelHi = 0;
  unsigned NegLo = 0;
  unsigned NegHi = 0;
};

// Reconstruct values of VOP3/VOP3P operands such as op_sel.
// Note that these values do not affect disassembler output,
// so this is only necessary for consistency with src_modifiers.
static VOPModifiers collectVOPModifiers(const MCInst &MI,
                                        bool IsVOP3P = false) {
  VOPModifiers Modifiers;
  unsigned Opc = MI.getOpcode();
  const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
                        AMDGPU::OpName::src1_modifiers,
                        AMDGPU::OpName::src2_modifiers};
  for (int J = 0; J < 3; ++J) {
    int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
    if (OpIdx == -1)
      continue;

    unsigned Val = MI.getOperand(OpIdx).getImm();

    Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
    if (IsVOP3P) {
      Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
      Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
      Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
    } else if (J == 0) {
      Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
    }
  }

  return Modifiers;
}

// MAC opcodes have special old and src2 operands.
// src2 is tied to dst, while old is not tied (but assumed to be).
bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
  constexpr int DST_IDX = 0;
  auto Opcode = MI.getOpcode();
  const auto &Desc = MCII->get(Opcode);
  auto OldIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::old);

  if (OldIdx != -1 && Desc.getOperandConstraint(
                          OldIdx, MCOI::OperandConstraint::TIED_TO) == -1) {
    assert(AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2));
    assert(Desc.getOperandConstraint(
               AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2),
               MCOI::OperandConstraint::TIED_TO) == DST_IDX);
    (void)DST_IDX;
    return true;
  }

  return false;
}

// Create dummy old operand and insert dummy unused src2_modifiers.
void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
  assert(MI.getNumOperands() + 1 < MCII->get(MI.getOpcode()).getNumOperands());
  insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
  insertNamedMCOperand(MI, MCOperand::createImm(0),
                       AMDGPU::OpName::src2_modifiers);
}

// We must check FI == literal to reject non-genuine dpp8 instructions, and we
// must first add the optional MI operands in order to check FI.
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
    convertVOP3PDPPInst(MI);
  } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
             AMDGPU::isVOPC64DPP(Opc)) {
    convertVOPCDPPInst(MI);
  } else {
    if (isMacDPP(MI))
      convertMacDPPInst(MI);

    unsigned DescNumOps = MCII->get(Opc).getNumOperands();
    if (MI.getNumOperands() < DescNumOps &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
      auto Mods = collectVOPModifiers(MI);
      insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                           AMDGPU::OpName::op_sel);
    } else {
      // Insert dummy unused src modifiers.
      if (MI.getNumOperands() < DescNumOps &&
          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
        insertNamedMCOperand(MI, MCOperand::createImm(0),
                             AMDGPU::OpName::src0_modifiers);

      if (MI.getNumOperands() < DescNumOps &&
          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
        insertNamedMCOperand(MI, MCOperand::createImm(0),
                             AMDGPU::OpName::src1_modifiers);
    }
  }
  return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}

DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
  if (isMacDPP(MI))
    convertMacDPPInst(MI);

  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
    auto Mods = collectVOPModifiers(MI);
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                         AMDGPU::OpName::op_sel);
  }
  return MCDisassembler::Success;
}

// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show the address as
// if it had 1 dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
  int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::vdst);

  int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdata);
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  int RsrcIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::dmask);

  int TFEIdx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::tfe);
  int D16Idx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::d16);

  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);

  assert(VDataIdx != -1);
  if (BaseOpcode->BVH) {
    // Add A16 operand for intersect_ray instructions.
    addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
    return MCDisassembler::Success;
  }

  bool IsAtomic = (VDstIdx != -1);
  bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
  bool IsNSA = false;
  bool IsPartialNSA = false;
  unsigned AddrSize = Info->VAddrDwords;

  if (isGFX10Plus()) {
    unsigned DimIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
    int A16Idx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
    const AMDGPU::MIMGDimInfo *Dim =
        AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
    const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());

    AddrSize =
        AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));

    IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
            Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA;
    if (!IsNSA) {
      if (AddrSize > 12)
        AddrSize = 16;
    } else {
      if (AddrSize > Info->VAddrDwords) {
        if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
          // The NSA encoding does not contain enough operands for the
          // combination of base opcode / dimension. Should this be an error?
          return MCDisassembler::Success;
        }
        IsPartialNSA = true;
      }
    }
  }

  unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
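  // Gather4 instructions always return four dwords regardless of dmask.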
  unsigned DstSize = IsGather4 ? 4 : std::max(llvm::popcount(DMask), 1);

  bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
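  // Packed D16 stores two 16-bit components per dword, halving the count.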
  if (D16 && AMDGPU::hasPackedD16(STI)) {
    DstSize = (DstSize + 1) / 2;
  }

  if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
    DstSize += 1;

  if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
    return MCDisassembler::Success;

  int NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding,
                                        DstSize, AddrSize);
  if (NewOpcode == -1)
    return MCDisassembler::Success;

  // Widen the register to the correct number of enabled channels.
  unsigned NewVdata = AMDGPU::NoRegister;
  if (DstSize != Info->VDataDwords) {
    auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass;

    // Get first subregister of VData.
    unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
    unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
    Vdata0 = (VdataSub0 != 0) ? VdataSub0 : Vdata0;

    NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
                                       &MRI.getRegClass(DataRCID));
    if (NewVdata == AMDGPU::NoRegister) {
      // It's possible to encode this such that the low register + enabled
      // components exceeds the register count.
      return MCDisassembler::Success;
    }
  }

  // If not using NSA on GFX10+, widen vaddr0 address register to correct size.
  // If using partial NSA on GFX11+, widen the last address register.
  int VAddrSAIdx = IsPartialNSA ? (RsrcIdx - 1) : VAddr0Idx;
  unsigned NewVAddrSA = AMDGPU::NoRegister;
  if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) &&
      AddrSize != Info->VAddrDwords) {
    unsigned VAddrSA = MI.getOperand(VAddrSAIdx).getReg();
    unsigned VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0);
    VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;

    auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
    NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
                                         &MRI.getRegClass(AddrRCID));
    if (!NewVAddrSA)
      return MCDisassembler::Success;
  }

  MI.setOpcode(NewOpcode);

  if (NewVdata != AMDGPU::NoRegister) {
    MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);

    if (IsAtomic) {
      // Atomic operations have an additional operand (a copy of data).
      MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
    }
  }

  if (NewVAddrSA) {
    MI.getOperand(VAddrSAIdx) = MCOperand::createReg(NewVAddrSA);
  } else if (IsNSA) {
    assert(AddrSize <= Info->VAddrDwords);
    MI.erase(MI.begin() + VAddr0Idx + AddrSize,
             MI.begin() + VAddr0Idx + Info->VAddrDwords);
  }

  return MCDisassembler::Success;
}

// The op_sel and neg bits are used both in src_modifiers and in standalone
// operands. The autogenerated decoder only adds them to src_modifiers, so
// manually add the bits to the other operands.
DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
  auto Mods = collectVOPModifiers(MI, true);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                         AMDGPU::OpName::op_sel);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel_hi))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
                         AMDGPU::OpName::op_sel_hi);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_lo))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
                         AMDGPU::OpName::neg_lo);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
                         AMDGPU::OpName::neg_hi);

  return MCDisassembler::Success;
}

// Create dummy old operand and insert optional operands.
DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::old))
    insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src0_modifiers);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src1_modifiers);
  return MCDisassembler::Success;
}

DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
                                                int ImmLitIdx) const {
  assert(HasLiteral && "Should have decoded a literal");
  const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
  unsigned DescNumOps = Desc.getNumOperands();
  insertNamedMCOperand(MI, MCOperand::createImm(Literal),
                       AMDGPU::OpName::immDeferred);
  assert(DescNumOps == MI.getNumOperands());
  for (unsigned I = 0; I < DescNumOps; ++I) {
    auto &Op = MI.getOperand(I);
    auto OpType = Desc.operands()[I].OperandType;
    bool IsDeferredOp = (OpType == AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED ||
                         OpType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED);
    if (Op.isImm() && Op.getImm() == AMDGPU::EncValues::LITERAL_CONST &&
        IsDeferredOp)
      Op.setImm(Literal);
  }
  return MCDisassembler::Success;
}

const char *AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
  return getContext().getRegisterInfo()->
    getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
}

inline
MCOperand AMDGPUDisassembler::errOperand(unsigned V,
                                         const Twine &ErrMsg) const {
  *CommentStream << "Error: " + ErrMsg;

  // ToDo: add support for error operands to MCInst.h
  // return MCOperand::createError(V);
  return MCOperand();
}

inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
  return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
}

inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
                                               unsigned Val) const {
  const auto &RegCl = AMDGPUMCRegisterClasses[RegClassID];
  if (Val >= RegCl.getNumRegs())
    return errOperand(Val, Twine(getRegClassName(RegClassID)) +
                           ": unknown register " + Twine(Val));
  return createRegOperand(RegCl.getRegister(Val));
}

inline
MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
                                                unsigned Val) const {
  // ToDo: SI/CI have 104 SGPRs, VI has 102.
  // Valery: here we accept as much as we can and let the assembler sort it out.
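  // Wide SGPR tuples must be aligned, so the encoded register number is
  // divided by the tuple alignment (2 or 4) to get the register-class index.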
  int shift = 0;
  switch (SRegClassID) {
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::TTMP_32RegClassID:
    break;
  case AMDGPU::SGPR_64RegClassID:
  case AMDGPU::TTMP_64RegClassID:
    shift = 1;
    break;
  case AMDGPU::SGPR_128RegClassID:
  case AMDGPU::TTMP_128RegClassID:
  // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  case AMDGPU::SGPR_256RegClassID:
  case AMDGPU::TTMP_256RegClassID:
  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  case AMDGPU::SGPR_288RegClassID:
  case AMDGPU::TTMP_288RegClassID:
  case AMDGPU::SGPR_320RegClassID:
  case AMDGPU::TTMP_320RegClassID:
  case AMDGPU::SGPR_352RegClassID:
  case AMDGPU::TTMP_352RegClassID:
  case AMDGPU::SGPR_384RegClassID:
  case AMDGPU::TTMP_384RegClassID:
  case AMDGPU::SGPR_512RegClassID:
  case AMDGPU::TTMP_512RegClassID:
    shift = 2;
    break;
  // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  default:
    llvm_unreachable("unhandled register class");
  }

  if (Val % (1 << shift)) {
    *CommentStream << "Warning: " << getRegClassName(SRegClassID)
                   << ": scalar reg isn't aligned " << Val;
  }

  return createRegOperand(SRegClassID, Val >> shift);
}

// Decode literals for instructions that always have a literal in the encoding.
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
  if (HasLiteral) {
    assert(
        AMDGPU::hasVOPD(STI) &&
        "Should only decode multiple kimm with VOPD, check VSrc operand types");
    if (Literal != Val)
      return errOperand(Val, "More than one unique literal is illegal");
  }
  HasLiteral = true;
  Literal = Val;
  return MCOperand::createImm(Literal);
}

MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
  // For now, all literal constants are assumed to be unsigned integers.
  // ToDo: deal with signed/unsigned 64-bit integer constants.
  // ToDo: deal with float/double constants.
  if (!HasLiteral) {
    if (Bytes.size() < 4) {
      return errOperand(0, "cannot read literal, inst bytes left " +
                        Twine(Bytes.size()));
    }
    HasLiteral = true;
    Literal = eatBytes<uint32_t>(Bytes);
  }
  return MCOperand::createImm(Literal);
}

MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
  using namespace AMDGPU::EncValues;

  assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
  return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
    (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
    (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
      // Cast prevents negative overflow.
}

static int64_t getInlineImmVal32(unsigned Imm) {
  switch (Imm) {
  case 240:
    return llvm::bit_cast<uint32_t>(0.5f);
  case 241:
    return llvm::bit_cast<uint32_t>(-0.5f);
  case 242:
    return llvm::bit_cast<uint32_t>(1.0f);
  case 243:
    return llvm::bit_cast<uint32_t>(-1.0f);
  case 244:
    return llvm::bit_cast<uint32_t>(2.0f);
  case 245:
    return llvm::bit_cast<uint32_t>(-2.0f);
  case 246:
    return llvm::bit_cast<uint32_t>(4.0f);
  case 247:
    return llvm::bit_cast<uint32_t>(-4.0f);
  case 248: // 1 / (2 * PI)
    return 0x3e22f983;
  default:
    llvm_unreachable("invalid fp inline imm");
  }
}

static int64_t getInlineImmVal64(unsigned Imm) {
  switch (Imm) {
  case 240:
    return llvm::bit_cast<uint64_t>(0.5);
  case 241:
    return llvm::bit_cast<uint64_t>(-0.5);
  case 242:
    return llvm::bit_cast<uint64_t>(1.0);
  case 243:
    return llvm::bit_cast<uint64_t>(-1.0);
  case 244:
    return llvm::bit_cast<uint64_t>(2.0);
  case 245:
    return llvm::bit_cast<uint64_t>(-2.0);
  case 246:
    return llvm::bit_cast<uint64_t>(4.0);
  case 247:
    return llvm::bit_cast<uint64_t>(-4.0);
  case 248: // 1 / (2 * PI)
    return 0x3fc45f306dc9c882;
  default:
    llvm_unreachable("invalid fp inline imm");
  }
}

static int64_t getInlineImmVal16(unsigned Imm) {
  switch (Imm) {
  case 240:
    return 0x3800;
  case 241:
    return 0xB800;
  case 242:
    return 0x3C00;
  case 243:
    return 0xBC00;
  case 244:
    return 0x4000;
  case 245:
    return 0xC000;
  case 246:
    return 0x4400;
  case 247:
    return 0xC400;
  case 248: // 1 / (2 * PI)
    return 0x3118;
  default:
    llvm_unreachable("invalid fp inline imm");
  }
}

MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm) {
  assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
      && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);

  // ToDo: case 248: 1/(2*PI) - is allowed only on VI.
  // ImmWidth 0 is the default case, where the operand should not allow
  // immediates. The Imm value is still decoded into a 32-bit immediate
  // operand, and the inst printer will use it to print a verbose error
  // message.
1260   switch (ImmWidth) {
1261   case 0:
1262   case 32:
1263     return MCOperand::createImm(getInlineImmVal32(Imm));
1264   case 64:
1265     return MCOperand::createImm(getInlineImmVal64(Imm));
1266   case 16:
1267     return MCOperand::createImm(getInlineImmVal16(Imm));
1268   default:
1269     llvm_unreachable("implement me");
1270   }
1271 }
1272 
1273 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
1274   using namespace AMDGPU;
1275 
1276   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1277   switch (Width) {
1278   default: // fall
1279   case OPW32:
1280   case OPW16:
1281   case OPWV216:
1282     return VGPR_32RegClassID;
1283   case OPW64:
1284   case OPWV232: return VReg_64RegClassID;
1285   case OPW96: return VReg_96RegClassID;
1286   case OPW128: return VReg_128RegClassID;
1287   case OPW160: return VReg_160RegClassID;
1288   case OPW256: return VReg_256RegClassID;
1289   case OPW288: return VReg_288RegClassID;
1290   case OPW320: return VReg_320RegClassID;
1291   case OPW352: return VReg_352RegClassID;
1292   case OPW384: return VReg_384RegClassID;
1293   case OPW512: return VReg_512RegClassID;
1294   case OPW1024: return VReg_1024RegClassID;
1295   }
1296 }
1297 
1298 unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
1299   using namespace AMDGPU;
1300 
1301   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1302   switch (Width) {
1303   default: // fall
1304   case OPW32:
1305   case OPW16:
1306   case OPWV216:
1307     return AGPR_32RegClassID;
1308   case OPW64:
1309   case OPWV232: return AReg_64RegClassID;
1310   case OPW96: return AReg_96RegClassID;
1311   case OPW128: return AReg_128RegClassID;
1312   case OPW160: return AReg_160RegClassID;
1313   case OPW256: return AReg_256RegClassID;
1314   case OPW288: return AReg_288RegClassID;
1315   case OPW320: return AReg_320RegClassID;
1316   case OPW352: return AReg_352RegClassID;
1317   case OPW384: return AReg_384RegClassID;
1318   case OPW512: return AReg_512RegClassID;
1319   case OPW1024: return AReg_1024RegClassID;
1320   }
1321 }
1322 
1323 
1324 unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
1325   using namespace AMDGPU;
1326 
1327   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1328   switch (Width) {
1329   default: // fall
1330   case OPW32:
1331   case OPW16:
1332   case OPWV216:
1333     return SGPR_32RegClassID;
1334   case OPW64:
1335   case OPWV232: return SGPR_64RegClassID;
1336   case OPW96: return SGPR_96RegClassID;
1337   case OPW128: return SGPR_128RegClassID;
1338   case OPW160: return SGPR_160RegClassID;
1339   case OPW256: return SGPR_256RegClassID;
1340   case OPW288: return SGPR_288RegClassID;
1341   case OPW320: return SGPR_320RegClassID;
1342   case OPW352: return SGPR_352RegClassID;
1343   case OPW384: return SGPR_384RegClassID;
1344   case OPW512: return SGPR_512RegClassID;
1345   }
1346 }
1347 
1348 unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
1349   using namespace AMDGPU;
1350 
1351   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1352   switch (Width) {
1353   default: // fall
1354   case OPW32:
1355   case OPW16:
1356   case OPWV216:
1357     return TTMP_32RegClassID;
1358   case OPW64:
1359   case OPWV232: return TTMP_64RegClassID;
1360   case OPW128: return TTMP_128RegClassID;
1361   case OPW256: return TTMP_256RegClassID;
1362   case OPW288: return TTMP_288RegClassID;
1363   case OPW320: return TTMP_320RegClassID;
1364   case OPW352: return TTMP_352RegClassID;
1365   case OPW384: return TTMP_384RegClassID;
1366   case OPW512: return TTMP_512RegClassID;
1367   }
1368 }
1369 
1370 int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
1371   using namespace AMDGPU::EncValues;
1372 
1373   unsigned TTmpMin = isGFX9Plus() ? TTMP_GFX9PLUS_MIN : TTMP_VI_MIN;
1374   unsigned TTmpMax = isGFX9Plus() ? TTMP_GFX9PLUS_MAX : TTMP_VI_MAX;
1375 
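       // For example (illustrative): on GFX9+, where ttmp0 encodes at 108,
       // Val == 110 yields ttmp index 2.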
1376   return (TTmpMin <= Val && Val <= TTmpMax) ? Val - TTmpMin : -1;
1377 }
1378 
1379 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
1380                                           bool MandatoryLiteral,
1381                                           unsigned ImmWidth) const {
1382   using namespace AMDGPU::EncValues;
1383 
1384   assert(Val < 1024); // enum10
1385 
1386   bool IsAGPR = Val & 512;
1387   Val &= 511;
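       // For example (illustrative): an enum10 value of 773 (512 | 261) selects
       // AGPR a5, while 261 alone selects VGPR v5.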
1388 
1389   if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
1390     return createRegOperand(IsAGPR ? getAgprClassId(Width)
1391                                    : getVgprClassId(Width), Val - VGPR_MIN);
1392   }
1393   if (Val <= SGPR_MAX) {
1394     // "SGPR_MIN <= Val" is always true and causes compilation warning.
1395     static_assert(SGPR_MIN == 0);
1396     return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
1397   }
1398 
1399   int TTmpIdx = getTTmpIdx(Val);
1400   if (TTmpIdx >= 0) {
1401     return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
1402   }
1403 
1404   if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
1405     return decodeIntImmed(Val);
1406 
1407   if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
1408     return decodeFPImmed(ImmWidth, Val);
1409 
1410   if (Val == LITERAL_CONST) {
1411     if (MandatoryLiteral)
1412       // Keep a sentinel value for deferred setting
1413       return MCOperand::createImm(LITERAL_CONST);
1414     else
1415       return decodeLiteralConstant();
1416   }
1417 
1418   switch (Width) {
1419   case OPW32:
1420   case OPW16:
1421   case OPWV216:
1422     return decodeSpecialReg32(Val);
1423   case OPW64:
1424   case OPWV232:
1425     return decodeSpecialReg64(Val);
1426   default:
1427     llvm_unreachable("unexpected immediate type");
1428   }
1429 }
1430 
1431 // Bit 0 of DstY isn't stored in the instruction, because it's always the
1432 // opposite of bit 0 of DstX.
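     // For example (illustrative): if vdstX decoded to the even register v2
     // (encoding bit 0 clear), the omitted bit is forced to 1 below and DstY
     // becomes an odd register such as v3.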
1433 MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
1434                                                unsigned Val) const {
1435   int VDstXInd =
1436       AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
1437   assert(VDstXInd != -1);
1438   assert(Inst.getOperand(VDstXInd).isReg());
1439   unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
1440   Val |= ~XDstReg & 1;
1441   auto Width = llvm::AMDGPUDisassembler::OPW32;
1442   return createRegOperand(getVgprClassId(Width), Val);
1443 }
1444 
1445 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
1446   using namespace AMDGPU;
1447 
1448   switch (Val) {
1449   // clang-format off
1450   case 102: return createRegOperand(FLAT_SCR_LO);
1451   case 103: return createRegOperand(FLAT_SCR_HI);
1452   case 104: return createRegOperand(XNACK_MASK_LO);
1453   case 105: return createRegOperand(XNACK_MASK_HI);
1454   case 106: return createRegOperand(VCC_LO);
1455   case 107: return createRegOperand(VCC_HI);
1456   case 108: return createRegOperand(TBA_LO);
1457   case 109: return createRegOperand(TBA_HI);
1458   case 110: return createRegOperand(TMA_LO);
1459   case 111: return createRegOperand(TMA_HI);
1460   case 124:
1461     return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
1462   case 125:
1463     return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
1464   case 126: return createRegOperand(EXEC_LO);
1465   case 127: return createRegOperand(EXEC_HI);
1466   case 235: return createRegOperand(SRC_SHARED_BASE_LO);
1467   case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
1468   case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
1469   case 238: return createRegOperand(SRC_PRIVATE_LIMIT_LO);
1470   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1471   case 251: return createRegOperand(SRC_VCCZ);
1472   case 252: return createRegOperand(SRC_EXECZ);
1473   case 253: return createRegOperand(SRC_SCC);
1474   case 254: return createRegOperand(LDS_DIRECT);
1475   default: break;
1476     // clang-format on
1477   }
1478   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1479 }
1480 
1481 MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
1482   using namespace AMDGPU;
1483 
1484   switch (Val) {
1485   case 102: return createRegOperand(FLAT_SCR);
1486   case 104: return createRegOperand(XNACK_MASK);
1487   case 106: return createRegOperand(VCC);
1488   case 108: return createRegOperand(TBA);
1489   case 110: return createRegOperand(TMA);
1490   case 124:
1491     if (isGFX11Plus())
1492       return createRegOperand(SGPR_NULL);
1493     break;
1494   case 125:
1495     if (!isGFX11Plus())
1496       return createRegOperand(SGPR_NULL);
1497     break;
1498   case 126: return createRegOperand(EXEC);
1499   case 235: return createRegOperand(SRC_SHARED_BASE);
1500   case 236: return createRegOperand(SRC_SHARED_LIMIT);
1501   case 237: return createRegOperand(SRC_PRIVATE_BASE);
1502   case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
1503   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1504   case 251: return createRegOperand(SRC_VCCZ);
1505   case 252: return createRegOperand(SRC_EXECZ);
1506   case 253: return createRegOperand(SRC_SCC);
1507   default: break;
1508   }
1509   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1510 }
1511 
1512 MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
1513                                             const unsigned Val,
1514                                             unsigned ImmWidth) const {
1515   using namespace AMDGPU::SDWA;
1516   using namespace AMDGPU::EncValues;
1517 
1518   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
1519       STI.hasFeature(AMDGPU::FeatureGFX10)) {
1520     // XXX: The cast to int avoids a compiler warning: SRC_VGPR_MIN is 0,
1521     // so the unsigned comparison would always be true.
1522     if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
1523         Val <= SDWA9EncValues::SRC_VGPR_MAX) {
1524       return createRegOperand(getVgprClassId(Width),
1525                               Val - SDWA9EncValues::SRC_VGPR_MIN);
1526     }
1527     if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
1528         Val <= (isGFX10Plus() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
1529                               : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
1530       return createSRegOperand(getSgprClassId(Width),
1531                                Val - SDWA9EncValues::SRC_SGPR_MIN);
1532     }
1533     if (SDWA9EncValues::SRC_TTMP_MIN <= Val &&
1534         Val <= SDWA9EncValues::SRC_TTMP_MAX) {
1535       return createSRegOperand(getTtmpClassId(Width),
1536                                Val - SDWA9EncValues::SRC_TTMP_MIN);
1537     }
1538 
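         // SDWA9 encodes non-VGPR sources at an offset of SRC_SGPR_MIN from
         // the regular operand encoding, so rebase Val before decoding inline
         // constants and special registers.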
1539     const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
1540 
1541     if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
1542       return decodeIntImmed(SVal);
1543 
1544     if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
1545       return decodeFPImmed(ImmWidth, SVal);
1546 
1547     return decodeSpecialReg32(SVal);
1548   } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
1549     return createRegOperand(getVgprClassId(Width), Val);
1550   }
1551   llvm_unreachable("unsupported target");
1552 }
1553 
1554 MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
1555   return decodeSDWASrc(OPW16, Val, 16);
1556 }
1557 
1558 MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
1559   return decodeSDWASrc(OPW32, Val, 32);
1560 }
1561 
1562 MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
1563   using namespace AMDGPU::SDWA;
1564 
1565   assert((STI.hasFeature(AMDGPU::FeatureGFX9) ||
1566           STI.hasFeature(AMDGPU::FeatureGFX10)) &&
1567          "SDWAVopcDst should be present only on GFX9+");
1568 
1569   bool IsWave64 = STI.hasFeature(AMDGPU::FeatureWavefrontSize64);
1570 
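       // For example (illustrative): with the VOPC_DST_VCC_MASK bit clear the
       // destination is the implicit VCC (VCC_LO in wave32); with it set, a
       // value of 0x82 selects the SGPR pair s[2:3] in wave64.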
1571   if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
1572     Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
1573 
1574     int TTmpIdx = getTTmpIdx(Val);
1575     if (TTmpIdx >= 0) {
1576       auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
1577       return createSRegOperand(TTmpClsId, TTmpIdx);
1578     } else if (Val > SGPR_MAX) {
1579       return IsWave64 ? decodeSpecialReg64(Val)
1580                       : decodeSpecialReg32(Val);
1581     } else {
1582       return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
1583     }
1584   } else {
1585     return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
1586   }
1587 }
1588 
1589 MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
1590   return STI.hasFeature(AMDGPU::FeatureWavefrontSize64)
1591              ? decodeSrcOp(OPW64, Val)
1592              : decodeSrcOp(OPW32, Val);
1593 }
1594 
1595 bool AMDGPUDisassembler::isVI() const {
1596   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
1597 }
1598 
1599 bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
1600 
1601 bool AMDGPUDisassembler::isGFX90A() const {
1602   return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
1603 }
1604 
1605 bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
1606 
1607 bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
1608 
1609 bool AMDGPUDisassembler::isGFX10Plus() const {
1610   return AMDGPU::isGFX10Plus(STI);
1611 }
1612 
1613 bool AMDGPUDisassembler::isGFX11() const {
1614   return STI.hasFeature(AMDGPU::FeatureGFX11);
1615 }
1616 
1617 bool AMDGPUDisassembler::isGFX11Plus() const {
1618   return AMDGPU::isGFX11Plus(STI);
1619 }
1620 
1621 
1622 bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
1623   return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
1624 }
1625 
1626 bool AMDGPUDisassembler::hasKernargPreload() const {
1627   return AMDGPU::hasKernargPreload(STI);
1628 }
1629 
1630 //===----------------------------------------------------------------------===//
1631 // AMDGPU specific symbol handling
1632 //===----------------------------------------------------------------------===//
1633 #define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK))
1634 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
1635   do {                                                                         \
1636     KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n';            \
1637   } while (0)
1638 #define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK)                        \
1639   do {                                                                         \
1640     KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " "       \
1641              << GET_FIELD(MASK) << '\n';                                       \
1642   } while (0)
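     // For example, PRINT_DIRECTIVE(".amdhsa_ieee_mode",
     // COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE) extracts the field from
     // FourByteBuffer and emits a line such as "\t.amdhsa_ieee_mode 1".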
1643 
1644 // NOLINTNEXTLINE(readability-identifier-naming)
1645 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
1646     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1647   using namespace amdhsa;
1648   StringRef Indent = "\t";
1649 
1650   // We cannot accurately backward compute the number of VGPRs used from
1651   // GRANULATED_WORKITEM_VGPR_COUNT; we only need the reassembled binary to
1652   // contain the same GRANULATED_WORKITEM_VGPR_COUNT, so we simply calculate
1653   // the inverse of what the assembler does.
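       // For example (illustrative): with a VGPR encoding granule of 4,
       // GRANULATED_WORKITEM_VGPR_COUNT == 2 prints .amdhsa_next_free_vgpr 12,
       // which the assembler granulates back to 2.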
1654 
1655   uint32_t GranulatedWorkitemVGPRCount =
1656       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT);
1657 
1658   uint32_t NextFreeVGPR =
1659       (GranulatedWorkitemVGPRCount + 1) *
1660       AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32);
1661 
1662   KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
1663 
1664   // We cannot backward compute the values used to calculate
1665   // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for the
1666   // following directives can't be computed:
1667   // .amdhsa_reserve_vcc
1668   // .amdhsa_reserve_flat_scratch
1669   // .amdhsa_reserve_xnack_mask
1670   // They take their respective default values if not specified in the assembly.
1671   //
1672   // GRANULATED_WAVEFRONT_SGPR_COUNT
1673   //    = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
1674   //
1675   // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
1676   // are set to 0. So while disassembling we consider that:
1677   //
1678   // GRANULATED_WAVEFRONT_SGPR_COUNT
1679   //    = f(NEXT_FREE_SGPR + 0 + 0 + 0)
1680   //
1681   // The disassembler cannot recover the original values of those 3 directives.
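       // For example (illustrative): with the SGPR encoding granule of 8,
       // GRANULATED_WAVEFRONT_SGPR_COUNT == 3 prints .amdhsa_next_free_sgpr 32.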
1682 
1683   uint32_t GranulatedWavefrontSGPRCount =
1684       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT);
1685 
1686   if (isGFX10Plus() && GranulatedWavefrontSGPRCount)
1687     return MCDisassembler::Fail;
1688 
1689   uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
1690                           AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
1691 
1692   KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
1693   if (!hasArchitectedFlatScratch())
1694     KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
1695   KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
1696   KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << '\n';
1697 
1698   if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY)
1699     return MCDisassembler::Fail;
1700 
1701   PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
1702                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
1703   PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
1704                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
1705   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
1706                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
1707   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
1708                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
1709 
1710   if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
1711     return MCDisassembler::Fail;
1712 
1713   PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
1714 
1715   if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
1716     return MCDisassembler::Fail;
1717 
1718   PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
1719 
1720   if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
1721     return MCDisassembler::Fail;
1722 
1723   if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
1724     return MCDisassembler::Fail;
1725 
1726   PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL);
1727 
1728   if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0)
1729     return MCDisassembler::Fail;
1730 
1731   if (isGFX10Plus()) {
1732     PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
1733                     COMPUTE_PGM_RSRC1_WGP_MODE);
1734     PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED);
1735     PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS);
1736   }
1737   return MCDisassembler::Success;
1738 }
1739 
1740 // NOLINTNEXTLINE(readability-identifier-naming)
1741 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
1742     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1743   using namespace amdhsa;
1744   StringRef Indent = "\t";
1745   if (hasArchitectedFlatScratch())
1746     PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
1747                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
1748   else
1749     PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
1750                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
1751   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
1752                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
1753   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
1754                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
1755   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
1756                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
1757   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
1758                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
1759   PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
1760                   COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
1761 
1762   if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH)
1763     return MCDisassembler::Fail;
1764 
1765   if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY)
1766     return MCDisassembler::Fail;
1767 
1768   if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE)
1769     return MCDisassembler::Fail;
1770 
1771   PRINT_DIRECTIVE(
1772       ".amdhsa_exception_fp_ieee_invalid_op",
1773       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
1774   PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
1775                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
1776   PRINT_DIRECTIVE(
1777       ".amdhsa_exception_fp_ieee_div_zero",
1778       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
1779   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
1780                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
1781   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
1782                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
1783   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
1784                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
1785   PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
1786                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
1787 
1788   if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0)
1789     return MCDisassembler::Fail;
1790 
1791   return MCDisassembler::Success;
1792 }
1793 
1794 // NOLINTNEXTLINE(readability-identifier-naming)
1795 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
1796     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1797   using namespace amdhsa;
1798   StringRef Indent = "\t";
1799   if (isGFX90A()) {
1800     KdStream << Indent << ".amdhsa_accum_offset "
1801              << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
1802              << '\n';
1803     if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED0)
1804       return MCDisassembler::Fail;
1805     PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
1806     if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1)
1807       return MCDisassembler::Fail;
1808   } else if (isGFX10Plus()) {
1809     if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
1810       PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
1811                       COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
1812     } else {
1813       PRINT_PSEUDO_DIRECTIVE_COMMENT(
1814           "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
1815     }
1816     PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
1817                                    COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE);
1818     PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
1819                                    COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START);
1820     PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
1821                                    COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_END);
1822     if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED0)
1823       return MCDisassembler::Fail;
1824     PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
1825                                    COMPUTE_PGM_RSRC3_GFX10_PLUS_IMAGE_OP);
1826   } else if (FourByteBuffer) {
1827     return MCDisassembler::Fail;
1828   }
1829   return MCDisassembler::Success;
1830 }
1831 #undef PRINT_PSEUDO_DIRECTIVE_COMMENT
1832 #undef PRINT_DIRECTIVE
1833 #undef GET_FIELD
1834 
1835 MCDisassembler::DecodeStatus
1836 AMDGPUDisassembler::decodeKernelDescriptorDirective(
1837     DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
1838     raw_string_ostream &KdStream) const {
1839 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
1840   do {                                                                         \
1841     KdStream << Indent << DIRECTIVE " "                                        \
1842              << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
1843   } while (0)
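       // For example, with MASK == KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
       // this expands to (TwoByteBuffer & MASK) >>
       // KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, using the *_SHIFT
       // constants defined alongside each mask in AMDHSAKernelDescriptor.h.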
1844 
1845   uint16_t TwoByteBuffer = 0;
1846   uint32_t FourByteBuffer = 0;
1847 
1848   StringRef ReservedBytes;
1849   StringRef Indent = "\t";
1850 
1851   assert(Bytes.size() == 64);
1852   DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
1853 
1854   switch (Cursor.tell()) {
1855   case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
1856     FourByteBuffer = DE.getU32(Cursor);
1857     KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
1858              << '\n';
1859     return MCDisassembler::Success;
1860 
1861   case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
1862     FourByteBuffer = DE.getU32(Cursor);
1863     KdStream << Indent << ".amdhsa_private_segment_fixed_size "
1864              << FourByteBuffer << '\n';
1865     return MCDisassembler::Success;
1866 
1867   case amdhsa::KERNARG_SIZE_OFFSET:
1868     FourByteBuffer = DE.getU32(Cursor);
1869     KdStream << Indent << ".amdhsa_kernarg_size "
1870              << FourByteBuffer << '\n';
1871     return MCDisassembler::Success;
1872 
1873   case amdhsa::RESERVED0_OFFSET:
1874     // 4 reserved bytes, must be 0.
1875     ReservedBytes = DE.getBytes(Cursor, 4);
1876     for (int I = 0; I < 4; ++I) {
1877       if (ReservedBytes[I] != 0) {
1878         return MCDisassembler::Fail;
1879       }
1880     }
1881     return MCDisassembler::Success;
1882 
1883   case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
1884     // KERNEL_CODE_ENTRY_BYTE_OFFSET
1885     // So far no directive controls this for Code Object V3, so simply skip
1886     // it during disassembly.
1887     DE.skip(Cursor, 8);
1888     return MCDisassembler::Success;
1889 
1890   case amdhsa::RESERVED1_OFFSET:
1891     // 20 reserved bytes, must be 0.
1892     ReservedBytes = DE.getBytes(Cursor, 20);
1893     for (int I = 0; I < 20; ++I) {
1894       if (ReservedBytes[I] != 0) {
1895         return MCDisassembler::Fail;
1896       }
1897     }
1898     return MCDisassembler::Success;
1899 
1900   case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
1901     FourByteBuffer = DE.getU32(Cursor);
1902     return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream);
1903 
1904   case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
1905     FourByteBuffer = DE.getU32(Cursor);
1906     return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream);
1907 
1908   case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
1909     FourByteBuffer = DE.getU32(Cursor);
1910     return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream);
1911 
1912   case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
1913     using namespace amdhsa;
1914     TwoByteBuffer = DE.getU16(Cursor);
1915 
1916     if (!hasArchitectedFlatScratch())
1917       PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
1918                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
1919     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
1920                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
1921     PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
1922                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
1923     PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
1924                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
1925     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
1926                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
1927     if (!hasArchitectedFlatScratch())
1928       PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
1929                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
1930     PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
1931                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
1932 
1933     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
1934       return MCDisassembler::Fail;
1935 
1936     // Reserved for GFX9
1937     if (isGFX9() &&
1938         (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
1939       return MCDisassembler::Fail;
1940     } else if (isGFX10Plus()) {
1941       PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
1942                       KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
1943     }
1944 
1945     if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5)
1946       PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
1947                       KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
1948 
1949     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
1950       return MCDisassembler::Fail;
1951 
1952     return MCDisassembler::Success;
1953 
1954   case amdhsa::KERNARG_PRELOAD_OFFSET:
1955     using namespace amdhsa;
1956     TwoByteBuffer = DE.getU16(Cursor);
1957     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) {
1958       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length",
1959                       KERNARG_PRELOAD_SPEC_LENGTH);
1960     }
1961 
1962     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) {
1963       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset",
1964                       KERNARG_PRELOAD_SPEC_OFFSET);
1965     }
1966     return MCDisassembler::Success;
1967 
1968   case amdhsa::RESERVED3_OFFSET:
1969     // 4 bytes from here are reserved, must be 0.
1970     ReservedBytes = DE.getBytes(Cursor, 4);
1971     for (int I = 0; I < 4; ++I) {
1972       if (ReservedBytes[I] != 0)
1973         return MCDisassembler::Fail;
1974     }
1975     return MCDisassembler::Success;
1976 
1977   default:
1978     llvm_unreachable("Unhandled index. Case statements cover everything.");
1979     return MCDisassembler::Fail;
1980   }
1981 #undef PRINT_DIRECTIVE
1982 }
1983 
1984 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
1985     StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
1986   // CP microcode requires the kernel descriptor to be 64-byte aligned.
1987   if (Bytes.size() != 64 || KdAddress % 64 != 0)
1988     return MCDisassembler::Fail;
1989 
1990   // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10
1991   // requires us to know the setting of .amdhsa_wavefront_size32 in order to
1992   // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong
1993   // order. Work around this by first looking up .amdhsa_wavefront_size32 here
1994   // when required.
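       // (KERNEL_CODE_PROPERTIES_OFFSET is byte 56 of the 64-byte descriptor,
       // so it can be read directly from Bytes ahead of the sequential walk
       // below.)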
1995   if (isGFX10Plus()) {
1996     uint16_t KernelCodeProperties =
1997         support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET],
1998                                 support::endianness::little);
1999     EnableWavefrontSize32 =
2000         AMDHSA_BITS_GET(KernelCodeProperties,
2001                         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2002   }
2003 
2004   std::string Kd;
2005   raw_string_ostream KdStream(Kd);
2006   KdStream << ".amdhsa_kernel " << KdName << '\n';
2007 
2008   DataExtractor::Cursor C(0);
2009   while (C && C.tell() < Bytes.size()) {
2010     MCDisassembler::DecodeStatus Status =
2011         decodeKernelDescriptorDirective(C, Bytes, KdStream);
2012 
2013     cantFail(C.takeError());
2014 
2015     if (Status == MCDisassembler::Fail)
2016       return MCDisassembler::Fail;
2017   }
2018   KdStream << ".end_amdhsa_kernel\n";
2019   outs() << KdStream.str();
2020   return MCDisassembler::Success;
2021 }
2022 
2023 std::optional<MCDisassembler::DecodeStatus>
2024 AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
2025                                   ArrayRef<uint8_t> Bytes, uint64_t Address,
2026                                   raw_ostream &CStream) const {
2027   // Right now only the kernel descriptor needs to be handled; all other
2028   // symbols are ignored for target-specific handling.
2029   // TODO:
2030   // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
2031   // Object V2 and V3 when symbols are marked protected.
2032 
2033   // amd_kernel_code_t for Code Object V2.
2034   if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
2035     Size = 256;
2036     return MCDisassembler::Fail;
2037   }
2038 
2039   // Code Object V3 kernel descriptors.
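       // For example (illustrative): a symbol "foo.kd" of type STT_OBJECT is
       // decoded as the kernel descriptor of kernel "foo".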
2040   StringRef Name = Symbol.Name;
2041   if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) {
2042     Size = 64; // Size = 64 regardless of success or failure.
2043     return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
2044   }
2045   return std::nullopt;
2046 }
2047 
2048 //===----------------------------------------------------------------------===//
2049 // AMDGPUSymbolizer
2050 //===----------------------------------------------------------------------===//
2051 
2052 // Try to find a symbol name for the specified label.
2053 bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
2054     MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
2055     uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
2056     uint64_t /*OpSize*/, uint64_t /*InstSize*/) {
2057 
2058   if (!IsBranch) {
2059     return false;
2060   }
2061 
2062   auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
2063   if (!Symbols)
2064     return false;
2065 
2066   auto Result = llvm::find_if(*Symbols, [Value](const SymbolInfoTy &Val) {
2067     return Val.Addr == static_cast<uint64_t>(Value) &&
2068            Val.Type == ELF::STT_NOTYPE;
2069   });
2070   if (Result != Symbols->end()) {
2071     auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
2072     const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
2073     Inst.addOperand(MCOperand::createExpr(Add));
2074     return true;
2075   }
2076   // Add to list of referenced addresses, so caller can synthesize a label.
2077   ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
2078   return false;
2079 }
2080 
2081 void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
2082                                                        int64_t Value,
2083                                                        uint64_t Address) {
2084   llvm_unreachable("unimplemented");
2085 }
2086 
2087 //===----------------------------------------------------------------------===//
2088 // Initialization
2089 //===----------------------------------------------------------------------===//
2090 
2091 static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
2092                               LLVMOpInfoCallback /*GetOpInfo*/,
2093                               LLVMSymbolLookupCallback /*SymbolLookUp*/,
2094                               void *DisInfo,
2095                               MCContext *Ctx,
2096                               std::unique_ptr<MCRelocationInfo> &&RelInfo) {
2097   return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
2098 }
2099 
2100 static MCDisassembler *createAMDGPUDisassembler(const Target &T,
2101                                                 const MCSubtargetInfo &STI,
2102                                                 MCContext &Ctx) {
2103   return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
2104 }
2105 
2106 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
2107   TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
2108                                          createAMDGPUDisassembler);
2109   TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
2110                                        createAMDGPUSymbolizer);
2111 }
2112