//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// This file contains the definition of the AMDGPU ISA disassembler.
//
//===----------------------------------------------------------------------===//

// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?

#include "Disassembler/AMDGPUDisassembler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/DisassemblerTypes.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDecoderOps.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-disassembler"

#define SGPR_MAX                                                               \
  (isGFX10Plus() ? AMDGPU::EncValues::SGPR_MAX_GFX10                           \
                 : AMDGPU::EncValues::SGPR_MAX_SI)

using DecodeStatus = llvm::MCDisassembler::DecodeStatus;

AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
                                       MCContext &Ctx, MCInstrInfo const *MCII)
    : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
      MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)) {
  // ToDo: AMDGPUDisassembler supports only VI ISA.
  if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
    report_fatal_error("Disassembly not yet supported for subtarget");
}

static inline MCDisassembler::DecodeStatus addOperand(MCInst &Inst,
                                                      const MCOperand &Opnd) {
  Inst.addOperand(Opnd);
  return Opnd.isValid() ? MCDisassembler::Success : MCDisassembler::Fail;
}

static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
                                uint16_t NameIdx) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
  if (OpIdx != -1) {
    auto I = MI.begin();
    std::advance(I, OpIdx);
    MI.insert(I, Op);
  }
  return OpIdx;
}

static DecodeStatus decodeSOPPBrTarget(MCInst &Inst, unsigned Imm,
                                       uint64_t Addr,
                                       const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);

  // Our branches take a simm16, but we need two extra bits to account for the
  // factor of 4.
  APInt SignedOffset(18, Imm * 4, true);
  int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();

  if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0))
    return MCDisassembler::Success;
  return addOperand(Inst, MCOperand::createImm(Imm));
}
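
// Worked example (illustrative, not from the source): a raw field value of
// Imm == 0xFFFC (simm16 == -4) decoded at Addr == 0x100 gives an 18-bit
// SignedOffset of -16, so the resolved branch target is 0x100 + 4 - 16 == 0xF4.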

static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
                                     const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  int64_t Offset;
  if (DAsm->isGFX12Plus()) { // GFX12 supports 24-bit signed offsets.
    Offset = SignExtend64<24>(Imm);
  } else if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
    Offset = Imm & 0xFFFFF;
  } else { // GFX9+ supports 21-bit signed offsets.
    Offset = SignExtend64<21>(Imm);
  }
  return addOperand(Inst, MCOperand::createImm(Offset));
}
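
// Worked example (illustrative): on GFX9+, an encoded Imm of 0x1FFFFF
// sign-extends from 21 bits to Offset == -1, while on VI the same bits are
// masked to the unsigned value 0xFFFFF.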

static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
                                  const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeBoolReg(Val));
}

static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
                                       uint64_t Addr,
                                       const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
}

#define DECODE_OPERAND(StaticDecoderName, DecoderName)                         \
  static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,            \
                                        uint64_t /*Addr*/,                     \
                                        const MCDisassembler *Decoder) {       \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(Inst, DAsm->DecoderName(Imm));                           \
  }

// Decoder for registers, decoded directly using RegClassID. Imm (8-bit) is
// the register number. Used by VGPR-only and AGPR-only operands.
#define DECODE_OPERAND_REG_8(RegClass)                                         \
  static DecodeStatus Decode##RegClass##RegisterClass(                         \
      MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,                           \
      const MCDisassembler *Decoder) {                                         \
    assert(Imm < (1 << 8) && "8-bit encoding");                                \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(                                                         \
        Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm));      \
  }

#define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral,         \
                     ImmWidth)                                                 \
  static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,      \
                           const MCDisassembler *Decoder) {                    \
    assert(Imm < (1 << EncSize) && #EncSize "-bit encoding");                  \
    auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);              \
    return addOperand(Inst,                                                    \
                      DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm,   \
                                        MandatoryLiteral, ImmWidth));          \
  }

// Decoder for registers. Imm (7-bit) is the register number; uses decodeSrcOp
// to get the register class. Used by SGPR-only operands.
#define DECODE_OPERAND_REG_7(RegClass, OpWidth)                                \
  DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0)

// Decoder for registers. Imm (10-bit): Imm{7-0} is the register number,
// Imm{9} is acc (AGPR or VGPR), and Imm{8} should be 0 (see VOP3Pe_SMFMAC).
// Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp.
// Used by AV_ register classes (AGPR-only or VGPR-only register operands).
#define DECODE_OPERAND_REG_AV10(RegClass, OpWidth)                             \
  DECODE_SrcOp(Decode##RegClass##RegisterClass, 10, OpWidth,                   \
               Imm | AMDGPU::EncValues::IS_VGPR, false, 0)

// Decoder for Src (9-bit encoding) registers only.
#define DECODE_OPERAND_SRC_REG_9(RegClass, OpWidth)                            \
  DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm, false, 0)

// Decoder for Src (9-bit encoding) AGPRs: the register number is encoded in
// 9 bits; set Imm{9} to 1 (set acc) and decode using 'enum10' from
// decodeSrcOp. Registers only.
#define DECODE_OPERAND_SRC_REG_A9(RegClass, OpWidth)                           \
  DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm | 512, false, 0)

// Decoder for 'enum10' from decodeSrcOp: Imm{8-0} is the 9-bit Src encoding
// and Imm{9} is acc. Registers only.
#define DECODE_SRC_OPERAND_REG_AV10(RegClass, OpWidth)                         \
  DECODE_SrcOp(decodeOperand_##RegClass, 10, OpWidth, Imm, false, 0)

// Decoder for RegisterOperands using the 9-bit Src encoding. The operand can
// be a register from RegClass or an immediate. Registers that don't belong to
// RegClass are still decoded, and the InstPrinter reports a warning. An
// immediate is decoded into a constant of size ImmWidth, which should match
// the width of the immediate used by the OperandType (important for floating
// point types).
#define DECODE_OPERAND_SRC_REG_OR_IMM_9(RegClass, OpWidth, ImmWidth)           \
  DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm,      \
               false, ImmWidth)

// Decoder for Src (9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set
// acc) and decode using 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth)          \
  DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth,           \
               Imm | 512, false, ImmWidth)

#define DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(RegClass, OpWidth, ImmWidth)  \
  DECODE_SrcOp(decodeOperand_##RegClass##_Deferred##_Imm##ImmWidth, 9,         \
               OpWidth, Imm, true, ImmWidth)
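
// For reference, a sketch (not verbatim) of what one of these macros expands
// to, using an invocation that appears below:
//
//   DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
//
// defines approximately:
//
//   static DecodeStatus decodeOperand_SReg_64_Imm64(
//       MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
//       const MCDisassembler *Decoder) {
//     assert(Imm < (1 << 9) && "9-bit encoding");
//     auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
//     return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64,
//                                               Imm, false, 64));
//   }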

// Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
// when RegisterClass is used as an operand. Most often used for destination
// operands.

DECODE_OPERAND_REG_8(VGPR_32)
DECODE_OPERAND_REG_8(VGPR_32_Lo128)
DECODE_OPERAND_REG_8(VReg_64)
DECODE_OPERAND_REG_8(VReg_96)
DECODE_OPERAND_REG_8(VReg_128)
DECODE_OPERAND_REG_8(VReg_256)
DECODE_OPERAND_REG_8(VReg_288)
DECODE_OPERAND_REG_8(VReg_352)
DECODE_OPERAND_REG_8(VReg_384)
DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)

DECODE_OPERAND_REG_7(SReg_32, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
DECODE_OPERAND_REG_7(SReg_96, OPW96)
DECODE_OPERAND_REG_7(SReg_128, OPW128)
DECODE_OPERAND_REG_7(SReg_256, OPW256)
DECODE_OPERAND_REG_7(SReg_512, OPW512)

DECODE_OPERAND_REG_8(AGPR_32)
DECODE_OPERAND_REG_8(AReg_64)
DECODE_OPERAND_REG_8(AReg_128)
DECODE_OPERAND_REG_8(AReg_256)
DECODE_OPERAND_REG_8(AReg_512)
DECODE_OPERAND_REG_8(AReg_1024)

DECODE_OPERAND_REG_AV10(AVDst_128, OPW128)
DECODE_OPERAND_REG_AV10(AVDst_512, OPW512)

// Decoders for register-only source RegisterOperands that use the 9-bit Src
// encoding: 'decodeOperand_<RegClass>'.

DECODE_OPERAND_SRC_REG_9(VGPR_32, OPW32)
DECODE_OPERAND_SRC_REG_9(VReg_64, OPW64)
DECODE_OPERAND_SRC_REG_9(VReg_128, OPW128)
DECODE_OPERAND_SRC_REG_9(VReg_256, OPW256)
DECODE_OPERAND_SRC_REG_9(VRegOrLds_32, OPW32)

DECODE_OPERAND_SRC_REG_A9(AGPR_32, OPW32)

DECODE_SRC_OPERAND_REG_AV10(AV_32, OPW32)
DECODE_SRC_OPERAND_REG_AV10(AV_64, OPW64)
DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128)

// Decoders for register-or-immediate RegisterOperands that use the 9-bit Src
// encoding: 'decodeOperand_<RegClass>_Imm<ImmWidth>'.

DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(SReg_32, OPW32, 32)

static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm,
                                               uint64_t /*Addr*/,
                                               const MCDisassembler *Decoder) {
  assert(isUInt<10>(Imm) && "10-bit encoding expected");
  assert((Imm & (1 << 8)) == 0 && "Imm{8} should not be used");

  bool IsHi = Imm & (1 << 9);
  unsigned RegIdx = Imm & 0xff;
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
}

static DecodeStatus
DecodeVGPR_16_Lo128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
                                 const MCDisassembler *Decoder) {
  assert(isUInt<8>(Imm) && "8-bit encoding expected");

  bool IsHi = Imm & (1 << 7);
  unsigned RegIdx = Imm & 0x7f;
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
}

static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
                                                uint64_t /*Addr*/,
                                                const MCDisassembler *Decoder) {
  assert(isUInt<9>(Imm) && "9-bit encoding expected");

  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  bool IsVGPR = Imm & (1 << 8);
  if (IsVGPR) {
    bool IsHi = Imm & (1 << 7);
    unsigned RegIdx = Imm & 0x7f;
    return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
  }
  return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16,
                                                   Imm & 0xFF, false, 16));
}

static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
                                          uint64_t /*Addr*/,
                                          const MCDisassembler *Decoder) {
  assert(isUInt<10>(Imm) && "10-bit encoding expected");

  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  bool IsVGPR = Imm & (1 << 8);
  if (IsVGPR) {
    bool IsHi = Imm & (1 << 9);
    unsigned RegIdx = Imm & 0xff;
    return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
  }
  return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16,
                                                   Imm & 0xFF, false, 16));
}

static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
                                         uint64_t Addr,
                                         const MCDisassembler *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}

static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
                                          uint64_t Addr, const void *Decoder) {
  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val));
}

static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
                          const MCRegisterInfo *MRI) {
  if (OpIdx < 0)
    return false;

  const MCOperand &Op = Inst.getOperand(OpIdx);
  if (!Op.isReg())
    return false;

  unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
  auto Reg = Sub ? Sub : Op.getReg();
  return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
}

static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
                                             AMDGPUDisassembler::OpWidthTy Opw,
                                             const MCDisassembler *Decoder) {
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  if (!DAsm->isGFX90A()) {
    Imm &= 511;
  } else {
    // If an atomic has both vdata and vdst, their register classes are tied.
    // The bit is decoded along with the vdst, the first operand. We need to
    // change the register class to AGPR if vdst was an AGPR.
    // If a DS instruction has both data0 and data1, their register classes
    // are also tied.
    unsigned Opc = Inst.getOpcode();
    uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
    uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
                                                        : AMDGPU::OpName::vdata;
    const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
    int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
    if ((int)Inst.getNumOperands() == DataIdx) {
      int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (IsAGPROperand(Inst, DstIdx, MRI))
        Imm |= 512;
    }

    if (TSFlags & SIInstrFlags::DS) {
      int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
      if ((int)Inst.getNumOperands() == Data2Idx &&
          IsAGPROperand(Inst, DataIdx, MRI))
        Imm |= 512;
    }
  }
  return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
}
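
// Illustrative note: in the 'enum10' form consumed by decodeSrcOp, bit 8
// (| 256) selects the VGPR range (IS_VGPR) and bit 9 (| 512) selects the AGPR
// range, so `Imm | 256` above decodes a vector register by default, with bit 9
// possibly set by the tied-operand fix-up.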

static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm,
                                           uint64_t Addr,
                                           const MCDisassembler *Decoder) {
  assert(Imm < (1 << 9) && "9-bit encoding");
  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(
      Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64, true));
}

static DecodeStatus
DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW32,
                                  Decoder);
}

static DecodeStatus
DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW64,
                                  Decoder);
}

static DecodeStatus
DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                             const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW96,
                                  Decoder);
}

static DecodeStatus
DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW128,
                                  Decoder);
}

static DecodeStatus
DecodeAVLdSt_160RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
                              const MCDisassembler *Decoder) {
  return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW160,
                                  Decoder);
}

#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)

DECODE_SDWA(Src32)
DECODE_SDWA(Src16)
DECODE_SDWA(VopcDst)

#include "AMDGPUGenDisassemblerTables.inc"

//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

template <typename T> static inline T eatBytes(ArrayRef<uint8_t> &Bytes) {
  assert(Bytes.size() >= sizeof(T));
  const auto Res =
      support::endian::read<T, llvm::endianness::little>(Bytes.data());
  Bytes = Bytes.slice(sizeof(T));
  return Res;
}

static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
  assert(Bytes.size() >= 12);
  uint64_t Lo =
      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
  Bytes = Bytes.slice(8);
  uint64_t Hi =
      support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
  Bytes = Bytes.slice(4);
  return DecoderUInt128(Lo, Hi);
}
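
// Illustrative: for the byte sequence 00 11 22 33 44 55 66 77 88 99 AA BB,
// eat12Bytes yields Lo == 0x7766554433221100 and Hi == 0xBBAA9988, i.e. the
// low 64 bits of the 96-bit encoding come first in memory (little-endian).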

// The disassembler is greedy, so we need to check the FI operand value to
// avoid parsing a dpp8 instruction when the required literal is not set. For
// dpp16, the autogenerated decoder checks the dpp literal.
static bool isValidDPP8(const MCInst &MI) {
  using namespace llvm::AMDGPU::DPP;
  int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
  assert(FiIdx != -1);
  if ((unsigned)FiIdx >= MI.getNumOperands())
    return false;
  unsigned Fi = MI.getOperand(FiIdx).getImm();
  return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
}

DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                ArrayRef<uint8_t> Bytes_,
                                                uint64_t Address,
                                                raw_ostream &CS) const {
  bool IsSDWA = false;

  unsigned MaxInstBytesNum =
      std::min((size_t)TargetMaxInstBytes, Bytes_.size());
  Bytes = Bytes_.slice(0, MaxInstBytesNum);

  DecodeStatus Res = MCDisassembler::Fail;
  do {
    // ToDo: it would be better to switch encoding length using some bit
    // predicate, but it is unknown yet, so try everything we can.

    // Try to decode DPP and SDWA first to solve conflicts with the VOP1 and
    // VOP2 encodings.
    if (isGFX11Plus() && Bytes.size() >= 12) {
      DecoderUInt128 DecW = eat12Bytes(Bytes);
      Res =
          tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
                        MI, DecW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear
      Res =
          tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696,
                        MI, DecW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      const auto convertVOPDPP = [&]() {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) {
          convertVOP3PDPPInst(MI);
        } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) {
          convertVOPCDPPInst(MI); // Special VOP3 case
        } else {
          assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
          convertVOP3DPPInst(MI); // Regular VOP3 case
        }
      };
      Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
                          MI, DecW, Address, CS);
      if (Res) {
        convertVOPDPP();
        break;
      }
      Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696,
                          MI, DecW, Address, CS);
      if (Res) {
        convertVOPDPP();
        break;
      }
      Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
      if (Res)
        break;

      Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
      if (Res)
        break;
    }
    // Reinitialize Bytes.
    Bytes = Bytes_.slice(0, MaxInstBytesNum);

    if (Bytes.size() >= 8) {
      const uint64_t QW = eatBytes<uint64_t>(Bytes);

      if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
        Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
        if (Res) {
          if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
              == -1)
            break;
          if (convertDPP8Inst(MI) == MCDisassembler::Success)
            break;
          MI = MCInst(); // clear
        }
      }

      Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP8GFX1164,
                          DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP8GFX1264,
                          DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS);
      if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
        break;
      MI = MCInst(); // clear

      Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
      if (Res) break;

      Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
                          MI, QW, Address, CS);
      if (Res) {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
          convertVOPCDPPInst(MI);
        break;
      }

      Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664,
                          MI, QW, Address, CS);
      if (Res) {
        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
          convertVOPCDPPInst(MI);
        break;
      }

      Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address, CS);
      if (Res) { IsSDWA = true; break; }

      Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address, CS);
      if (Res) { IsSDWA = true; break; }

      Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address, CS);
      if (Res) { IsSDWA = true; break; }

      if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
        Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
        if (Res)
          break;
      }

      // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
      // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
      // table first so we print the correct name.
      if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
        Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
        if (Res)
          break;
      }
    }

    // Reinitialize Bytes, as DPP64 could have eaten too much.
    Bytes = Bytes_.slice(0, MaxInstBytesNum);

    // Try to decode a 32-bit instruction.
    if (Bytes.size() < 4) break;
    const uint32_t DW = eatBytes<uint32_t>(Bytes);
    Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
    if (Res) break;

    if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
      Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
      if (Res)
        break;
    }

    if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
      Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
      if (Res) break;
    }

    Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
                        Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
                        Address, CS);
    if (Res)
      break;

    // Otherwise, try a 64-bit encoding; the dword just read is the low half.
    if (Bytes.size() < 4) break;
    const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;

    if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
      Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
      if (Res)
        break;
    }

    if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
      Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
      if (Res)
        break;
    }

    Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
    if (Res) break;

    Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
                        Address, CS);
    if (Res)
      break;

    Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
                        Address, CS);
    if (Res)
      break;

    Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
  } while (false);

  if (Res && AMDGPU::isMAC(MI.getOpcode())) {
    // Insert dummy unused src2_modifiers.
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src2_modifiers);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
          (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
    int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::cpol);
    if (CPolPos != -1) {
      unsigned CPol =
          (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
              AMDGPU::CPol::GLC : 0;
      if (MI.getNumOperands() <= (unsigned)CPolPos) {
        insertNamedMCOperand(MI, MCOperand::createImm(CPol),
                             AMDGPU::OpName::cpol);
      } else if (CPol) {
        MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
      }
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
             (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
    // GFX90A lost TFE; its place is occupied by ACC.
    int TFEOpIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
    if (TFEOpIdx != -1) {
      auto TFEIter = MI.begin();
      std::advance(TFEIter, TFEOpIdx);
      MI.insert(TFEIter, MCOperand::createImm(0));
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
    int SWZOpIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (SWZOpIdx != -1) {
      auto SWZIter = MI.begin();
      std::advance(SWZIter, SWZOpIdx);
      MI.insert(SWZIter, MCOperand::createImm(0));
    }
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
    int RsrcIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
    unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
    if (VAddr0Idx >= 0 && NSAArgs > 0) {
      unsigned NSAWords = (NSAArgs + 3) / 4;
      if (Bytes.size() < 4 * NSAWords) {
        Res = MCDisassembler::Fail;
      } else {
        for (unsigned i = 0; i < NSAArgs; ++i) {
          const unsigned VAddrIdx = VAddr0Idx + 1 + i;
          auto VAddrRCID =
              MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass;
          MI.insert(MI.begin() + VAddrIdx,
                    createRegOperand(VAddrRCID, Bytes[i]));
        }
        Bytes = Bytes.slice(4 * NSAWords);
      }
    }

    if (Res)
      Res = convertMIMGInst(MI);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags &
              (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)))
    Res = convertMIMGInst(MI);

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
    Res = convertEXPInst(MI);

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP))
    Res = convertVINTERPInst(MI);

  if (Res && IsSDWA)
    Res = convertSDWAInst(MI);

  int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::vdst_in);
  if (VDstIn_Idx != -1) {
    int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
                           MCOI::OperandConstraint::TIED_TO);
    if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
         !MI.getOperand(VDstIn_Idx).isReg() ||
         MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
      if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
        MI.erase(&MI.getOperand(VDstIn_Idx));
      insertNamedMCOperand(MI,
        MCOperand::createReg(MI.getOperand(Tied).getReg()),
        AMDGPU::OpName::vdst_in);
    }
  }

  int ImmLitIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
  bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK;
  if (Res && ImmLitIdx != -1 && !IsSOPK)
    Res = convertFMAanyK(MI, ImmLitIdx);

  // If the opcode was not recognized, we'll assume a Size of 4 bytes (unless
  // there are fewer bytes left).
  Size = Res ? (MaxInstBytesNum - Bytes.size())
             : std::min((size_t)4, Bytes_.size());
  return Res;
}

DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
  if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
    // The MCInst still has these fields even though they are no longer encoded
    // in the GFX11 instruction.
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr);
  }
  return MCDisassembler::Success;
}

DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
  if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 ||
      MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) {
    // The MCInst has this field that is not directly encoded in the
    // instruction.
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
  }
  return MCDisassembler::Success;
}

DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
  if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
      STI.hasFeature(AMDGPU::FeatureGFX10)) {
    if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
      // VOPC - insert clamp
      insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
  } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
    int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
    if (SDst != -1) {
      // VOPC - insert VCC register as sdst
      insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC),
                           AMDGPU::OpName::sdst);
    } else {
      // VOP1/2 - insert omod if present in instruction
      insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
    }
  }
  return MCDisassembler::Success;
}

struct VOPModifiers {
  unsigned OpSel = 0;
  unsigned OpSelHi = 0;
  unsigned NegLo = 0;
  unsigned NegHi = 0;
};

// Reconstruct values of VOP3/VOP3P operands such as op_sel.
// Note that these values do not affect disassembler output,
// so this is only necessary for consistency with src_modifiers.
static VOPModifiers collectVOPModifiers(const MCInst &MI,
                                        bool IsVOP3P = false) {
  VOPModifiers Modifiers;
  unsigned Opc = MI.getOpcode();
  const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
                        AMDGPU::OpName::src1_modifiers,
                        AMDGPU::OpName::src2_modifiers};
  for (int J = 0; J < 3; ++J) {
    int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
    if (OpIdx == -1)
      continue;

    unsigned Val = MI.getOperand(OpIdx).getImm();

    Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
    if (IsVOP3P) {
      Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
      Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
      Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
    } else if (J == 0) {
      Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
    }
  }

  return Modifiers;
}
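
// Illustrative: for a VOP3P instruction whose src1_modifiers immediate has
// OP_SEL_0 and NEG set (and no other sources modified), the result is
// OpSel == 0b010 and NegLo == 0b010, since J == 1 for the second source.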

// MAC opcodes have special old and src2 operands.
// src2 is tied to dst, while old is not tied (but assumed to be).
bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
  constexpr int DST_IDX = 0;
  auto Opcode = MI.getOpcode();
  const auto &Desc = MCII->get(Opcode);
  auto OldIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::old);

  if (OldIdx != -1 && Desc.getOperandConstraint(
                          OldIdx, MCOI::OperandConstraint::TIED_TO) == -1) {
    assert(AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2));
    assert(Desc.getOperandConstraint(
               AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2),
               MCOI::OperandConstraint::TIED_TO) == DST_IDX);
    (void)DST_IDX;
    return true;
  }

  return false;
}

// Create a dummy old operand and insert a dummy unused src2_modifiers.
void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
  assert(MI.getNumOperands() + 1 < MCII->get(MI.getOpcode()).getNumOperands());
  insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);
  insertNamedMCOperand(MI, MCOperand::createImm(0),
                       AMDGPU::OpName::src2_modifiers);
}

// We must check that FI == literal to reject instructions that are not genuine
// dpp8, and we must first add the optional MI operands to check FI.
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
    convertVOP3PDPPInst(MI);
  } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
             AMDGPU::isVOPC64DPP(Opc)) {
    convertVOPCDPPInst(MI);
  } else {
    if (isMacDPP(MI))
      convertMacDPPInst(MI);

    unsigned DescNumOps = MCII->get(Opc).getNumOperands();
    if (MI.getNumOperands() < DescNumOps &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
      auto Mods = collectVOPModifiers(MI);
      insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                           AMDGPU::OpName::op_sel);
    } else {
      // Insert dummy unused src modifiers.
      if (MI.getNumOperands() < DescNumOps &&
          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
        insertNamedMCOperand(MI, MCOperand::createImm(0),
                             AMDGPU::OpName::src0_modifiers);

      if (MI.getNumOperands() < DescNumOps &&
          AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
        insertNamedMCOperand(MI, MCOperand::createImm(0),
                             AMDGPU::OpName::src1_modifiers);
    }
  }
  return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}

DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
  if (isMacDPP(MI))
    convertMacDPPInst(MI);

  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
    auto Mods = collectVOPModifiers(MI);
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                         AMDGPU::OpName::op_sel);
  }
  return MCDisassembler::Success;
}

// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show the address as if
// it had 1 dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
  auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;

  int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::vdst);

  int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdata);
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  int RsrcOpName = TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
                                                : AMDGPU::OpName::rsrc;
  int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName);
  int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::dmask);

  int TFEIdx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::tfe);
  int D16Idx   = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::d16);

  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);

  assert(VDataIdx != -1);
  if (BaseOpcode->BVH) {
    // Add the A16 operand for intersect_ray instructions.
    addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
    return MCDisassembler::Success;
  }

  bool IsAtomic = (VDstIdx != -1);
  bool IsGather4 = TSFlags & SIInstrFlags::Gather4;
  bool IsVSample = TSFlags & SIInstrFlags::VSAMPLE;
  bool IsNSA = false;
  bool IsPartialNSA = false;
  unsigned AddrSize = Info->VAddrDwords;

  if (isGFX10Plus()) {
    unsigned DimIdx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
    int A16Idx =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
    const AMDGPU::MIMGDimInfo *Dim =
        AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
    const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());

    AddrSize =
        AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));

    // VSAMPLE insts that do not use vaddr3 behave the same as NSA forms.
    // VIMAGE insts other than BVH never use vaddr4.
    IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
            Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA ||
            Info->MIMGEncoding == AMDGPU::MIMGEncGfx12;
    if (!IsNSA) {
      if (!IsVSample && AddrSize > 12)
        AddrSize = 16;
    } else {
      if (AddrSize > Info->VAddrDwords) {
        if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
          // The NSA encoding does not contain enough operands for the
          // combination of base opcode / dimension. Should this be an error?
          return MCDisassembler::Success;
        }
        IsPartialNSA = true;
      }
    }
  }

  unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
  unsigned DstSize = IsGather4 ? 4 : std::max(llvm::popcount(DMask), 1);

  bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
  if (D16 && AMDGPU::hasPackedD16(STI)) {
    DstSize = (DstSize + 1) / 2;
  }

  if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
    DstSize += 1;

  if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
    return MCDisassembler::Success;

  int NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize,
                            AddrSize);
  if (NewOpcode == -1)
    return MCDisassembler::Success;

  // Widen the register to the correct number of enabled channels.
  unsigned NewVdata = AMDGPU::NoRegister;
  if (DstSize != Info->VDataDwords) {
    auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass;

    // Get the first subregister of VData.
    unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
    unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
    Vdata0 = (VdataSub0 != 0) ? VdataSub0 : Vdata0;

    NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
                                       &MRI.getRegClass(DataRCID));
    if (NewVdata == AMDGPU::NoRegister) {
      // It's possible to encode this such that the low register + enabled
      // components exceeds the register count.
      return MCDisassembler::Success;
    }
  }

  // If not using NSA on GFX10+, widen the vaddr0 address register to the
  // correct size. If using partial NSA on GFX11+, widen the last address
  // register.
  int VAddrSAIdx = IsPartialNSA ? (RsrcIdx - 1) : VAddr0Idx;
  unsigned NewVAddrSA = AMDGPU::NoRegister;
  if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) &&
      AddrSize != Info->VAddrDwords) {
    unsigned VAddrSA = MI.getOperand(VAddrSAIdx).getReg();
    unsigned VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0);
    VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;

    auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
    NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
                                         &MRI.getRegClass(AddrRCID));
    if (!NewVAddrSA)
      return MCDisassembler::Success;
  }

  MI.setOpcode(NewOpcode);

  if (NewVdata != AMDGPU::NoRegister) {
    MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);

    if (IsAtomic) {
      // Atomic operations have an additional operand (a copy of data).
      MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
    }
  }

  if (NewVAddrSA) {
    MI.getOperand(VAddrSAIdx) = MCOperand::createReg(NewVAddrSA);
  } else if (IsNSA) {
    assert(AddrSize <= Info->VAddrDwords);
    MI.erase(MI.begin() + VAddr0Idx + AddrSize,
             MI.begin() + VAddr0Idx + Info->VAddrDwords);
  }

  return MCDisassembler::Success;
}
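
// Worked example (illustrative): a load with DMask == 0b1011 has three enabled
// channels, so DstSize == 3; with D16 on a packed-D16 subtarget it becomes
// (3 + 1) / 2 == 2, and getMIMGOpcode then selects the variant whose
// VDataDwords matches, after which vdata is rewritten to the widened register.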

// Opsel and neg bits are used in src_modifiers and standalone operands. The
// autogenerated decoder only adds them to src_modifiers, so manually add the
// bits to the other operands.
DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
  auto Mods = collectVOPModifiers(MI, true);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
                         AMDGPU::OpName::op_sel);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel_hi))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
                         AMDGPU::OpName::op_sel_hi);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_lo))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
                         AMDGPU::OpName::neg_lo);
  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi))
    insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
                         AMDGPU::OpName::neg_hi);

  return MCDisassembler::Success;
}

// Create a dummy old operand and insert optional operands.
DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const {
  unsigned Opc = MI.getOpcode();
  unsigned DescNumOps = MCII->get(Opc).getNumOperands();

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::old))
    insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers))
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src0_modifiers);

  if (MI.getNumOperands() < DescNumOps &&
      AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers))
    insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src1_modifiers);
  return MCDisassembler::Success;
}

DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
                                                int ImmLitIdx) const {
  assert(HasLiteral && "Should have decoded a literal");
  const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
  unsigned DescNumOps = Desc.getNumOperands();
  insertNamedMCOperand(MI, MCOperand::createImm(Literal),
                       AMDGPU::OpName::immDeferred);
  assert(DescNumOps == MI.getNumOperands());
  for (unsigned I = 0; I < DescNumOps; ++I) {
    auto &Op = MI.getOperand(I);
    auto OpType = Desc.operands()[I].OperandType;
    bool IsDeferredOp = (OpType == AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED ||
                         OpType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED);
    if (Op.isImm() && Op.getImm() == AMDGPU::EncValues::LITERAL_CONST &&
        IsDeferredOp)
      Op.setImm(Literal);
  }
  return MCDisassembler::Success;
}

const char *AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
  return getContext().getRegisterInfo()->
    getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
}

inline
MCOperand AMDGPUDisassembler::errOperand(unsigned V,
                                         const Twine &ErrMsg) const {
  *CommentStream << "Error: " + ErrMsg;

  // ToDo: add support for error operands to MCInst.h
  // return MCOperand::createError(V);
  return MCOperand();
}

inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
  return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI));
}

inline
MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
                                               unsigned Val) const {
  const auto &RegCl = AMDGPUMCRegisterClasses[RegClassID];
  if (Val >= RegCl.getNumRegs())
    return errOperand(Val, Twine(getRegClassName(RegClassID)) +
                           ": unknown register " + Twine(Val));
  return createRegOperand(RegCl.getRegister(Val));
}

inline
MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
                                                unsigned Val) const {
  // ToDo: SI/CI have 104 SGPRs, VI has 102.
  // Here we accept as much as we can and let the assembler sort it out.
  int shift = 0;
  switch (SRegClassID) {
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::TTMP_32RegClassID:
    break;
  case AMDGPU::SGPR_64RegClassID:
  case AMDGPU::TTMP_64RegClassID:
    shift = 1;
    break;
  case AMDGPU::SGPR_96RegClassID:
  case AMDGPU::TTMP_96RegClassID:
  case AMDGPU::SGPR_128RegClassID:
  case AMDGPU::TTMP_128RegClassID:
  // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  case AMDGPU::SGPR_256RegClassID:
  case AMDGPU::TTMP_256RegClassID:
  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  case AMDGPU::SGPR_288RegClassID:
  case AMDGPU::TTMP_288RegClassID:
  case AMDGPU::SGPR_320RegClassID:
  case AMDGPU::TTMP_320RegClassID:
  case AMDGPU::SGPR_352RegClassID:
  case AMDGPU::TTMP_352RegClassID:
  case AMDGPU::SGPR_384RegClassID:
  case AMDGPU::TTMP_384RegClassID:
  case AMDGPU::SGPR_512RegClassID:
  case AMDGPU::TTMP_512RegClassID:
    shift = 2;
    break;
  // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
  // this bundle?
  default:
    llvm_unreachable("unhandled register class");
  }

  if (Val % (1 << shift)) {
    *CommentStream << "Warning: " << getRegClassName(SRegClassID)
                   << ": scalar reg isn't aligned " << Val;
  }

  return createRegOperand(SRegClassID, Val >> shift);
}

MCOperand AMDGPUDisassembler::createVGPR16Operand(unsigned RegIdx,
                                                  bool IsHi) const {
  unsigned RCID =
      IsHi ? AMDGPU::VGPR_HI16RegClassID : AMDGPU::VGPR_LO16RegClassID;
  return createRegOperand(RCID, RegIdx);
}

// Decode literals for insts which always have a literal in the encoding.
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
  if (HasLiteral) {
    assert(
        AMDGPU::hasVOPD(STI) &&
        "Should only decode multiple kimm with VOPD, check VSrc operand types");
    if (Literal != Val)
      return errOperand(Val, "More than one unique literal is illegal");
  }
  HasLiteral = true;
  Literal = Val;
  return MCOperand::createImm(Literal);
}
1301 
1302 MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
1303   // For now all literal constants are supposed to be unsigned integer
1304   // ToDo: deal with signed/unsigned 64-bit integer constants
1305   // ToDo: deal with float/double constants
1306   if (!HasLiteral) {
1307     if (Bytes.size() < 4) {
1308       return errOperand(0, "cannot read literal, inst bytes left " +
1309                         Twine(Bytes.size()));
1310     }
1311     HasLiteral = true;
1312     Literal = Literal64 = eatBytes<uint32_t>(Bytes);
1313     if (ExtendFP64)
1314       Literal64 <<= 32;
1315   }
1316   return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
1317 }
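
// Worked example for ExtendFP64 (a sketch, not normative): a 64-bit FP
// instruction encodes the literal 1.0 as the single dword 0x3FF00000.
// Shifting it into the high half yields 0x3FF0000000000000, the IEEE-754
// bit pattern of the double 1.0.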
1318 
1319 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
1320   using namespace AMDGPU::EncValues;
1321 
1322   assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
  // The cast to int64_t prevents negative overflow.
  return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
    (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
    (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
1327 }
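
// Worked examples, assuming the usual encoding constants
// (INLINE_INTEGER_C_MIN == 128, INLINE_INTEGER_C_POSITIVE_MAX == 192):
//   Imm == 128 -> 128 - 128 ==  0
//   Imm == 192 -> 192 - 128 ==  64
//   Imm == 193 -> 192 - 193 == -1  (the negative range counts down to -16)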
1328 
1329 static int64_t getInlineImmVal32(unsigned Imm) {
1330   switch (Imm) {
1331   case 240:
1332     return llvm::bit_cast<uint32_t>(0.5f);
1333   case 241:
1334     return llvm::bit_cast<uint32_t>(-0.5f);
1335   case 242:
1336     return llvm::bit_cast<uint32_t>(1.0f);
1337   case 243:
1338     return llvm::bit_cast<uint32_t>(-1.0f);
1339   case 244:
1340     return llvm::bit_cast<uint32_t>(2.0f);
1341   case 245:
1342     return llvm::bit_cast<uint32_t>(-2.0f);
1343   case 246:
1344     return llvm::bit_cast<uint32_t>(4.0f);
1345   case 247:
1346     return llvm::bit_cast<uint32_t>(-4.0f);
1347   case 248: // 1 / (2 * PI)
1348     return 0x3e22f983;
1349   default:
1350     llvm_unreachable("invalid fp inline imm");
1351   }
1352 }
1353 
1354 static int64_t getInlineImmVal64(unsigned Imm) {
1355   switch (Imm) {
1356   case 240:
1357     return llvm::bit_cast<uint64_t>(0.5);
1358   case 241:
1359     return llvm::bit_cast<uint64_t>(-0.5);
1360   case 242:
1361     return llvm::bit_cast<uint64_t>(1.0);
1362   case 243:
1363     return llvm::bit_cast<uint64_t>(-1.0);
1364   case 244:
1365     return llvm::bit_cast<uint64_t>(2.0);
1366   case 245:
1367     return llvm::bit_cast<uint64_t>(-2.0);
1368   case 246:
1369     return llvm::bit_cast<uint64_t>(4.0);
1370   case 247:
1371     return llvm::bit_cast<uint64_t>(-4.0);
1372   case 248: // 1 / (2 * PI)
1373     return 0x3fc45f306dc9c882;
1374   default:
1375     llvm_unreachable("invalid fp inline imm");
1376   }
1377 }
1378 
1379 static int64_t getInlineImmVal16(unsigned Imm) {
1380   switch (Imm) {
1381   case 240:
1382     return 0x3800;
1383   case 241:
1384     return 0xB800;
1385   case 242:
1386     return 0x3C00;
1387   case 243:
1388     return 0xBC00;
1389   case 244:
1390     return 0x4000;
1391   case 245:
1392     return 0xC000;
1393   case 246:
1394     return 0x4400;
1395   case 247:
1396     return 0xC400;
1397   case 248: // 1 / (2 * PI)
1398     return 0x3118;
1399   default:
1400     llvm_unreachable("invalid fp inline imm");
1401   }
1402 }
1403 
1404 MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm) {
1405   assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
1406       && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
1407 
  // ToDo: case 248: 1/(2*PI) is allowed only on VI.
  // ImmWidth 0 is the default case where the operand should not allow
  // immediates. The Imm value is still decoded into a 32-bit immediate
  // operand; the instruction printer uses it to print a verbose error message.
1412   switch (ImmWidth) {
1413   case 0:
1414   case 32:
1415     return MCOperand::createImm(getInlineImmVal32(Imm));
1416   case 64:
1417     return MCOperand::createImm(getInlineImmVal64(Imm));
1418   case 16:
1419     return MCOperand::createImm(getInlineImmVal16(Imm));
1420   default:
1421     llvm_unreachable("implement me");
1422   }
1423 }
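
// Illustrative only: decodeFPImmed(16, 242) yields 0x3C00, the IEEE-754
// half-precision bit pattern for 1.0, while decodeFPImmed(32, 242) yields
// bit_cast<uint32_t>(1.0f) == 0x3F800000.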
1424 
1425 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
1426   using namespace AMDGPU;
1427 
1428   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1429   switch (Width) {
  default: // fallthrough
1431   case OPW32:
1432   case OPW16:
1433   case OPWV216:
1434     return VGPR_32RegClassID;
1435   case OPW64:
1436   case OPWV232: return VReg_64RegClassID;
1437   case OPW96: return VReg_96RegClassID;
1438   case OPW128: return VReg_128RegClassID;
1439   case OPW160: return VReg_160RegClassID;
1440   case OPW256: return VReg_256RegClassID;
1441   case OPW288: return VReg_288RegClassID;
1442   case OPW320: return VReg_320RegClassID;
1443   case OPW352: return VReg_352RegClassID;
1444   case OPW384: return VReg_384RegClassID;
1445   case OPW512: return VReg_512RegClassID;
1446   case OPW1024: return VReg_1024RegClassID;
1447   }
1448 }
1449 
1450 unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
1451   using namespace AMDGPU;
1452 
1453   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1454   switch (Width) {
  default: // fallthrough
1456   case OPW32:
1457   case OPW16:
1458   case OPWV216:
1459     return AGPR_32RegClassID;
1460   case OPW64:
1461   case OPWV232: return AReg_64RegClassID;
1462   case OPW96: return AReg_96RegClassID;
1463   case OPW128: return AReg_128RegClassID;
1464   case OPW160: return AReg_160RegClassID;
1465   case OPW256: return AReg_256RegClassID;
1466   case OPW288: return AReg_288RegClassID;
1467   case OPW320: return AReg_320RegClassID;
1468   case OPW352: return AReg_352RegClassID;
1469   case OPW384: return AReg_384RegClassID;
1470   case OPW512: return AReg_512RegClassID;
1471   case OPW1024: return AReg_1024RegClassID;
1472   }
1473 }
1474 
1476 unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
1477   using namespace AMDGPU;
1478 
1479   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1480   switch (Width) {
  default: // fallthrough
1482   case OPW32:
1483   case OPW16:
1484   case OPWV216:
1485     return SGPR_32RegClassID;
1486   case OPW64:
1487   case OPWV232: return SGPR_64RegClassID;
1488   case OPW96: return SGPR_96RegClassID;
1489   case OPW128: return SGPR_128RegClassID;
1490   case OPW160: return SGPR_160RegClassID;
1491   case OPW256: return SGPR_256RegClassID;
1492   case OPW288: return SGPR_288RegClassID;
1493   case OPW320: return SGPR_320RegClassID;
1494   case OPW352: return SGPR_352RegClassID;
1495   case OPW384: return SGPR_384RegClassID;
1496   case OPW512: return SGPR_512RegClassID;
1497   }
1498 }
1499 
1500 unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
1501   using namespace AMDGPU;
1502 
1503   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
1504   switch (Width) {
  default: // fallthrough
1506   case OPW32:
1507   case OPW16:
1508   case OPWV216:
1509     return TTMP_32RegClassID;
1510   case OPW64:
1511   case OPWV232: return TTMP_64RegClassID;
1512   case OPW128: return TTMP_128RegClassID;
1513   case OPW256: return TTMP_256RegClassID;
1514   case OPW288: return TTMP_288RegClassID;
1515   case OPW320: return TTMP_320RegClassID;
1516   case OPW352: return TTMP_352RegClassID;
1517   case OPW384: return TTMP_384RegClassID;
1518   case OPW512: return TTMP_512RegClassID;
1519   }
1520 }
1521 
1522 int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
1523   using namespace AMDGPU::EncValues;
1524 
1525   unsigned TTmpMin = isGFX9Plus() ? TTMP_GFX9PLUS_MIN : TTMP_VI_MIN;
1526   unsigned TTmpMax = isGFX9Plus() ? TTMP_GFX9PLUS_MAX : TTMP_VI_MAX;
1527 
  return (TTmpMin <= Val && Val <= TTmpMax) ? Val - TTmpMin : -1;
1529 }
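
// Example, assuming the GFX9+ encoding range (TTMP_GFX9PLUS_MIN == 108):
// getTTmpIdx(110) == 2, i.e. the operand names ttmp2. Values outside the
// range return -1 and are handled as non-TTMP operands by the callers.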
1530 
1531 MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
1532                                           bool MandatoryLiteral,
1533                                           unsigned ImmWidth, bool IsFP) const {
1534   using namespace AMDGPU::EncValues;
1535 
1536   assert(Val < 1024); // enum10
1537 
1538   bool IsAGPR = Val & 512;
1539   Val &= 511;
1540 
1541   if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
1542     return createRegOperand(IsAGPR ? getAgprClassId(Width)
1543                                    : getVgprClassId(Width), Val - VGPR_MIN);
1544   }
1545   return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
1546                             IsFP);
1547 }
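
// Sketch of the enum10 layout handled above: bit 9 selects the AGPR file and
// bits 8:0 hold the 9-bit source encoding, where 256..511 are vector
// registers. For example, Val == 261 (256 + 5) decodes as v5 and
// Val == 773 (512 + 256 + 5) decodes as a5; encodings below 256 fall through
// to decodeNonVGPRSrcOp.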
1548 
1549 MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
1550                                                  unsigned Val,
1551                                                  bool MandatoryLiteral,
1552                                                  unsigned ImmWidth,
1553                                                  bool IsFP) const {
  // Cases where Val{8} is 1 (VGPR, AGPR, or true16 VGPR) should have been
  // decoded earlier.
1556   assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
1557   using namespace AMDGPU::EncValues;
1558 
1559   if (Val <= SGPR_MAX) {
1560     // "SGPR_MIN <= Val" is always true and causes compilation warning.
1561     static_assert(SGPR_MIN == 0);
1562     return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
1563   }
1564 
1565   int TTmpIdx = getTTmpIdx(Val);
1566   if (TTmpIdx >= 0) {
1567     return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
1568   }
1569 
1570   if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
1571     return decodeIntImmed(Val);
1572 
1573   if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
1574     return decodeFPImmed(ImmWidth, Val);
1575 
  if (Val == LITERAL_CONST) {
    if (MandatoryLiteral)
      // Keep a sentinel value for deferred setting.
      return MCOperand::createImm(LITERAL_CONST);
    return decodeLiteralConstant(IsFP && ImmWidth == 64);
  }
1583 
1584   switch (Width) {
1585   case OPW32:
1586   case OPW16:
1587   case OPWV216:
1588     return decodeSpecialReg32(Val);
1589   case OPW64:
1590   case OPWV232:
1591     return decodeSpecialReg64(Val);
1592   default:
1593     llvm_unreachable("unexpected immediate type");
1594   }
1595 }
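
// Decode-priority sketch for the remaining 8-bit space (informal): SGPRs
// first, then TTMPs, inline integer constants (128..208), inline FP
// constants (240..248), the literal marker 255, and finally special
// registers such as VCC or EXEC.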
1596 
1597 // Bit 0 of DstY isn't stored in the instruction, because it's always the
1598 // opposite of bit 0 of DstX.
1599 MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
1600                                                unsigned Val) const {
1601   int VDstXInd =
1602       AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
1603   assert(VDstXInd != -1);
1604   assert(Inst.getOperand(VDstXInd).isReg());
1605   unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
1606   Val |= ~XDstReg & 1;
1607   auto Width = llvm::AMDGPUDisassembler::OPW32;
1608   return createRegOperand(getVgprClassId(Width), Val);
1609 }
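
// Informal example: the encoded field carries every bit of vdstY except
// bit 0. If vdstX decoded to an even VGPR such as v4, bit 0 is forced to 1,
// so an encoded Val of 6 produces v7; with an odd vdstX, Val is used as-is.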
1610 
1611 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
1612   using namespace AMDGPU;
1613 
1614   switch (Val) {
1615   // clang-format off
1616   case 102: return createRegOperand(FLAT_SCR_LO);
1617   case 103: return createRegOperand(FLAT_SCR_HI);
1618   case 104: return createRegOperand(XNACK_MASK_LO);
1619   case 105: return createRegOperand(XNACK_MASK_HI);
1620   case 106: return createRegOperand(VCC_LO);
1621   case 107: return createRegOperand(VCC_HI);
1622   case 108: return createRegOperand(TBA_LO);
1623   case 109: return createRegOperand(TBA_HI);
1624   case 110: return createRegOperand(TMA_LO);
1625   case 111: return createRegOperand(TMA_HI);
1626   case 124:
1627     return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
1628   case 125:
1629     return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
1630   case 126: return createRegOperand(EXEC_LO);
1631   case 127: return createRegOperand(EXEC_HI);
1632   case 235: return createRegOperand(SRC_SHARED_BASE_LO);
1633   case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
1634   case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
1635   case 238: return createRegOperand(SRC_PRIVATE_LIMIT_LO);
1636   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1637   case 251: return createRegOperand(SRC_VCCZ);
1638   case 252: return createRegOperand(SRC_EXECZ);
1639   case 253: return createRegOperand(SRC_SCC);
1640   case 254: return createRegOperand(LDS_DIRECT);
1641   default: break;
1642     // clang-format on
1643   }
1644   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1645 }
1646 
1647 MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
1648   using namespace AMDGPU;
1649 
1650   switch (Val) {
1651   case 102: return createRegOperand(FLAT_SCR);
1652   case 104: return createRegOperand(XNACK_MASK);
1653   case 106: return createRegOperand(VCC);
1654   case 108: return createRegOperand(TBA);
1655   case 110: return createRegOperand(TMA);
1656   case 124:
1657     if (isGFX11Plus())
1658       return createRegOperand(SGPR_NULL);
1659     break;
1660   case 125:
1661     if (!isGFX11Plus())
1662       return createRegOperand(SGPR_NULL);
1663     break;
1664   case 126: return createRegOperand(EXEC);
1665   case 235: return createRegOperand(SRC_SHARED_BASE);
1666   case 236: return createRegOperand(SRC_SHARED_LIMIT);
1667   case 237: return createRegOperand(SRC_PRIVATE_BASE);
1668   case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
1669   case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
1670   case 251: return createRegOperand(SRC_VCCZ);
1671   case 252: return createRegOperand(SRC_EXECZ);
1672   case 253: return createRegOperand(SRC_SCC);
1673   default: break;
1674   }
1675   return errOperand(Val, "unknown operand encoding " + Twine(Val));
1676 }
1677 
1678 MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
1679                                             const unsigned Val,
1680                                             unsigned ImmWidth) const {
1681   using namespace AMDGPU::SDWA;
1682   using namespace AMDGPU::EncValues;
1683 
1684   if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
1685       STI.hasFeature(AMDGPU::FeatureGFX10)) {
    // XXX: The cast to int avoids a tautological-compare warning:
    // comparison with unsigned is always true.
1688     if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
1689         Val <= SDWA9EncValues::SRC_VGPR_MAX) {
1690       return createRegOperand(getVgprClassId(Width),
1691                               Val - SDWA9EncValues::SRC_VGPR_MIN);
1692     }
1693     if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
1694         Val <= (isGFX10Plus() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
1695                               : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
1696       return createSRegOperand(getSgprClassId(Width),
1697                                Val - SDWA9EncValues::SRC_SGPR_MIN);
1698     }
1699     if (SDWA9EncValues::SRC_TTMP_MIN <= Val &&
1700         Val <= SDWA9EncValues::SRC_TTMP_MAX) {
1701       return createSRegOperand(getTtmpClassId(Width),
1702                                Val - SDWA9EncValues::SRC_TTMP_MIN);
1703     }
1704 
1705     const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN;
1706 
1707     if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX)
1708       return decodeIntImmed(SVal);
1709 
1710     if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
1711       return decodeFPImmed(ImmWidth, SVal);
1712 
1713     return decodeSpecialReg32(SVal);
1714   } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
1715     return createRegOperand(getVgprClassId(Width), Val);
1716   }
1717   llvm_unreachable("unsupported target");
1718 }
1719 
1720 MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
1721   return decodeSDWASrc(OPW16, Val, 16);
1722 }
1723 
1724 MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
1725   return decodeSDWASrc(OPW32, Val, 32);
1726 }
1727 
1728 MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
1729   using namespace AMDGPU::SDWA;
1730 
1731   assert((STI.hasFeature(AMDGPU::FeatureGFX9) ||
1732           STI.hasFeature(AMDGPU::FeatureGFX10)) &&
1733          "SDWAVopcDst should be present only on GFX9+");
1734 
1735   bool IsWave64 = STI.hasFeature(AMDGPU::FeatureWavefrontSize64);
1736 
1737   if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
1738     Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
1739 
1740     int TTmpIdx = getTTmpIdx(Val);
1741     if (TTmpIdx >= 0) {
1742       auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
1743       return createSRegOperand(TTmpClsId, TTmpIdx);
1744     } else if (Val > SGPR_MAX) {
1745       return IsWave64 ? decodeSpecialReg64(Val)
1746                       : decodeSpecialReg32(Val);
1747     } else {
1748       return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
1749     }
1750   } else {
1751     return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
1752   }
1753 }
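
// Informal example: with VOPC_DST_VCC_MASK clear the destination is the
// implicit VCC (VCC_LO in wave32). With it set, the low bits pick an SGPR:
// an encoding of 0x80 decodes to s0 in wave32 or s[0:1] in wave64.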
1754 
1755 MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
1756   return STI.hasFeature(AMDGPU::FeatureWavefrontSize64)
1757              ? decodeSrcOp(OPW64, Val)
1758              : decodeSrcOp(OPW32, Val);
1759 }
1760 
1761 MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
1762   return decodeSrcOp(OPW32, Val);
1763 }
1764 
1765 bool AMDGPUDisassembler::isVI() const {
1766   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
1767 }
1768 
1769 bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
1770 
1771 bool AMDGPUDisassembler::isGFX90A() const {
1772   return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
1773 }
1774 
1775 bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
1776 
1777 bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
1778 
1779 bool AMDGPUDisassembler::isGFX10Plus() const {
1780   return AMDGPU::isGFX10Plus(STI);
1781 }
1782 
1783 bool AMDGPUDisassembler::isGFX11() const {
1784   return STI.hasFeature(AMDGPU::FeatureGFX11);
1785 }
1786 
1787 bool AMDGPUDisassembler::isGFX11Plus() const {
1788   return AMDGPU::isGFX11Plus(STI);
1789 }
1790 
1791 bool AMDGPUDisassembler::isGFX12Plus() const {
1792   return AMDGPU::isGFX12Plus(STI);
1793 }
1794 
1795 bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
1796   return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
1797 }
1798 
1799 bool AMDGPUDisassembler::hasKernargPreload() const {
1800   return AMDGPU::hasKernargPreload(STI);
1801 }
1802 
1803 //===----------------------------------------------------------------------===//
1804 // AMDGPU specific symbol handling
1805 //===----------------------------------------------------------------------===//
1806 #define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK))
1807 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
1808   do {                                                                         \
1809     KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n';            \
1810   } while (0)
1811 #define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK)                        \
1812   do {                                                                         \
1813     KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " "       \
1814              << GET_FIELD(MASK) << '\n';                                       \
1815   } while (0)
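
// Illustrative expansion (not normative):
// PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
//                 COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32)
// extracts the masked field from FourByteBuffer and appends a line such as
// "\t.amdhsa_float_round_mode_32 0" to KdStream.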
1816 
1817 // NOLINTNEXTLINE(readability-identifier-naming)
1818 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
1819     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1820   using namespace amdhsa;
1821   StringRef Indent = "\t";
1822 
1823   // We cannot accurately backward compute #VGPRs used from
1824   // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same
1825   // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we
1826   // simply calculate the inverse of what the assembler does.
1827 
1828   uint32_t GranulatedWorkitemVGPRCount =
1829       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT);
1830 
1831   uint32_t NextFreeVGPR =
1832       (GranulatedWorkitemVGPRCount + 1) *
1833       AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32);
1834 
1835   KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
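
  // Worked example (assuming a VGPR encoding granule of 4, e.g. wave64):
  // GRANULATED_WORKITEM_VGPR_COUNT == 3 reassembles as
  // ".amdhsa_next_free_vgpr 16", since (3 + 1) * 4 == 16.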
1836 
  // We cannot backward compute the values used to calculate
  // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for the
  // following directives cannot be computed:
1840   // .amdhsa_reserve_vcc
1841   // .amdhsa_reserve_flat_scratch
1842   // .amdhsa_reserve_xnack_mask
1843   // They take their respective default values if not specified in the assembly.
1844   //
1845   // GRANULATED_WAVEFRONT_SGPR_COUNT
1846   //    = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
1847   //
1848   // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
1849   // are set to 0. So while disassembling we consider that:
1850   //
1851   // GRANULATED_WAVEFRONT_SGPR_COUNT
1852   //    = f(NEXT_FREE_SGPR + 0 + 0 + 0)
1853   //
1854   // The disassembler cannot recover the original values of those 3 directives.
1855 
1856   uint32_t GranulatedWavefrontSGPRCount =
1857       GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT);
1858 
1859   if (isGFX10Plus() && GranulatedWavefrontSGPRCount)
1860     return MCDisassembler::Fail;
1861 
1862   uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
1863                           AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
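
  // Worked example (assuming an SGPR encoding granule of 8, as on pre-GFX10
  // targets): GRANULATED_WAVEFRONT_SGPR_COUNT == 1 reassembles as
  // ".amdhsa_next_free_sgpr 16", since (1 + 1) * 8 == 16.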
1864 
1865   KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
1866   if (!hasArchitectedFlatScratch())
1867     KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
1868   KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
  KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << '\n';
1870 
1871   if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY)
1872     return MCDisassembler::Fail;
1873 
1874   PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
1875                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
1876   PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
1877                   COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
1878   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
1879                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
1880   PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
1881                   COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
1882 
1883   if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
1884     return MCDisassembler::Fail;
1885 
1886   if (!isGFX12Plus())
1887     PRINT_DIRECTIVE(".amdhsa_dx10_clamp",
1888                     COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
1889 
1890   if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
1891     return MCDisassembler::Fail;
1892 
1893   if (!isGFX12Plus())
1894     PRINT_DIRECTIVE(".amdhsa_ieee_mode",
1895                     COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
1896 
1897   if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
1898     return MCDisassembler::Fail;
1899 
1900   if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
1901     return MCDisassembler::Fail;
1902 
1903   if (isGFX9Plus())
    PRINT_DIRECTIVE(".amdhsa_fp16_overflow",
                    COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL);
1905 
1906   if (!isGFX9Plus())
1907     if (FourByteBuffer & COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0)
1908       return MCDisassembler::Fail;
1909   if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED1)
1910     return MCDisassembler::Fail;
1911   if (!isGFX10Plus())
1912     if (FourByteBuffer & COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2)
1913       return MCDisassembler::Fail;
1914 
1915   if (isGFX10Plus()) {
1916     PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
1917                     COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
    PRINT_DIRECTIVE(".amdhsa_memory_ordered",
                    COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
    PRINT_DIRECTIVE(".amdhsa_forward_progress",
                    COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
1920   }
1921 
1922   if (isGFX12Plus())
1923     PRINT_DIRECTIVE(".amdhsa_round_robin_scheduling",
1924                     COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
1925 
1926   return MCDisassembler::Success;
1927 }
1928 
1929 // NOLINTNEXTLINE(readability-identifier-naming)
1930 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
1931     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1932   using namespace amdhsa;
1933   StringRef Indent = "\t";
1934   if (hasArchitectedFlatScratch())
1935     PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
1936                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
1937   else
1938     PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
1939                     COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
1940   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
1941                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
1942   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
1943                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
1944   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
1945                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
1946   PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
1947                   COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
1948   PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
1949                   COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
1950 
1951   if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH)
1952     return MCDisassembler::Fail;
1953 
1954   if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY)
1955     return MCDisassembler::Fail;
1956 
1957   if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE)
1958     return MCDisassembler::Fail;
1959 
1960   PRINT_DIRECTIVE(
1961       ".amdhsa_exception_fp_ieee_invalid_op",
1962       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
1963   PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
1964                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
1965   PRINT_DIRECTIVE(
1966       ".amdhsa_exception_fp_ieee_div_zero",
1967       COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
1968   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
1969                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
1970   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
1971                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
1972   PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
1973                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
1974   PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
1975                   COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
1976 
1977   if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0)
1978     return MCDisassembler::Fail;
1979 
1980   return MCDisassembler::Success;
1981 }
1982 
1983 // NOLINTNEXTLINE(readability-identifier-naming)
1984 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
1985     uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
1986   using namespace amdhsa;
1987   StringRef Indent = "\t";
1988   if (isGFX90A()) {
1989     KdStream << Indent << ".amdhsa_accum_offset "
1990              << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
1991              << '\n';
1992     if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED0)
1993       return MCDisassembler::Fail;
1994     PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
1995     if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1)
1996       return MCDisassembler::Fail;
1997   } else if (isGFX10Plus()) {
1998     if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
1999       PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
2000                       COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
2001     } else {
2002       PRINT_PSEUDO_DIRECTIVE_COMMENT(
2003           "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
2004     }
2005 
2006     if (isGFX11Plus()) {
2007       PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
2008                                      COMPUTE_PGM_RSRC3_GFX11_PLUS_INST_PREF_SIZE);
2009       PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
2010                                      COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
2011       PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
2012                                      COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_END);
2013     } else {
2014       if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED0)
2015         return MCDisassembler::Fail;
2016     }
2017 
2018     if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED1)
2019       return MCDisassembler::Fail;
2020 
2021     if (isGFX11Plus()) {
2022       PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
                                     COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP);
2024     } else {
2025       if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED2)
2026         return MCDisassembler::Fail;
2027     }
2028   } else if (FourByteBuffer) {
2029     return MCDisassembler::Fail;
2030   }
2031   return MCDisassembler::Success;
2032 }
2033 #undef PRINT_PSEUDO_DIRECTIVE_COMMENT
2034 #undef PRINT_DIRECTIVE
2035 #undef GET_FIELD
2036 
2037 MCDisassembler::DecodeStatus
2038 AMDGPUDisassembler::decodeKernelDescriptorDirective(
2039     DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
2040     raw_string_ostream &KdStream) const {
2041 #define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
2042   do {                                                                         \
2043     KdStream << Indent << DIRECTIVE " "                                        \
2044              << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
2045   } while (0)
2046 
2047   uint16_t TwoByteBuffer = 0;
2048   uint32_t FourByteBuffer = 0;
2049 
2050   StringRef ReservedBytes;
2051   StringRef Indent = "\t";
2052 
2053   assert(Bytes.size() == 64);
2054   DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
2055 
2056   switch (Cursor.tell()) {
2057   case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
2058     FourByteBuffer = DE.getU32(Cursor);
2059     KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
2060              << '\n';
2061     return MCDisassembler::Success;
2062 
2063   case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
2064     FourByteBuffer = DE.getU32(Cursor);
2065     KdStream << Indent << ".amdhsa_private_segment_fixed_size "
2066              << FourByteBuffer << '\n';
2067     return MCDisassembler::Success;
2068 
2069   case amdhsa::KERNARG_SIZE_OFFSET:
2070     FourByteBuffer = DE.getU32(Cursor);
    KdStream << Indent << ".amdhsa_kernarg_size " << FourByteBuffer << '\n';
2073     return MCDisassembler::Success;
2074 
2075   case amdhsa::RESERVED0_OFFSET:
2076     // 4 reserved bytes, must be 0.
2077     ReservedBytes = DE.getBytes(Cursor, 4);
2078     for (int I = 0; I < 4; ++I) {
2079       if (ReservedBytes[I] != 0) {
2080         return MCDisassembler::Fail;
2081       }
2082     }
2083     return MCDisassembler::Success;
2084 
2085   case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
2086     // KERNEL_CODE_ENTRY_BYTE_OFFSET
2087     // So far no directive controls this for Code Object V3, so simply skip for
2088     // disassembly.
2089     DE.skip(Cursor, 8);
2090     return MCDisassembler::Success;
2091 
2092   case amdhsa::RESERVED1_OFFSET:
2093     // 20 reserved bytes, must be 0.
2094     ReservedBytes = DE.getBytes(Cursor, 20);
2095     for (int I = 0; I < 20; ++I) {
2096       if (ReservedBytes[I] != 0) {
2097         return MCDisassembler::Fail;
2098       }
2099     }
2100     return MCDisassembler::Success;
2101 
2102   case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
2103     FourByteBuffer = DE.getU32(Cursor);
2104     return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream);
2105 
2106   case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
2107     FourByteBuffer = DE.getU32(Cursor);
2108     return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream);
2109 
2110   case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
2111     FourByteBuffer = DE.getU32(Cursor);
2112     return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream);
2113 
2114   case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
2115     using namespace amdhsa;
2116     TwoByteBuffer = DE.getU16(Cursor);
2117 
2118     if (!hasArchitectedFlatScratch())
2119       PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
2120                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
2121     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
2122                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
2123     PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
2124                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
2125     PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
2126                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
2127     PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
2128                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
2129     if (!hasArchitectedFlatScratch())
2130       PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
2131                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
2132     PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
2133                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
2134 
2135     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
2136       return MCDisassembler::Fail;
2137 
2138     // Reserved for GFX9
2139     if (isGFX9() &&
2140         (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
2141       return MCDisassembler::Fail;
2142     } else if (isGFX10Plus()) {
2143       PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
2144                       KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2145     }
2146 
2147     if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5)
2148       PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
2149                       KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
2150 
2151     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
2152       return MCDisassembler::Fail;
2153 
2154     return MCDisassembler::Success;
2155 
2156   case amdhsa::KERNARG_PRELOAD_OFFSET:
2157     using namespace amdhsa;
2158     TwoByteBuffer = DE.getU16(Cursor);
2159     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) {
2160       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length",
2161                       KERNARG_PRELOAD_SPEC_LENGTH);
2162     }
2163 
2164     if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) {
2165       PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset",
2166                       KERNARG_PRELOAD_SPEC_OFFSET);
2167     }
2168     return MCDisassembler::Success;
2169 
2170   case amdhsa::RESERVED3_OFFSET:
2171     // 4 bytes from here are reserved, must be 0.
2172     ReservedBytes = DE.getBytes(Cursor, 4);
2173     for (int I = 0; I < 4; ++I) {
2174       if (ReservedBytes[I] != 0)
2175         return MCDisassembler::Fail;
2176     }
2177     return MCDisassembler::Success;
2178 
2179   default:
2180     llvm_unreachable("Unhandled index. Case statements cover everything.");
2181     return MCDisassembler::Fail;
2182   }
2183 #undef PRINT_DIRECTIVE
2184 }
2185 
2186 MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
2187     StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
  // CP microcode requires the kernel descriptor to be 64-byte aligned.
2189   if (Bytes.size() != 64 || KdAddress % 64 != 0)
2190     return MCDisassembler::Fail;
2191 
2192   // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10
2193   // requires us to know the setting of .amdhsa_wavefront_size32 in order to
2194   // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong
  // order. Work around this by first looking up .amdhsa_wavefront_size32 here
  // when required.
2197   if (isGFX10Plus()) {
2198     uint16_t KernelCodeProperties =
2199         support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET],
2200                                 llvm::endianness::little);
2201     EnableWavefrontSize32 =
2202         AMDHSA_BITS_GET(KernelCodeProperties,
2203                         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
2204   }
2205 
2206   std::string Kd;
2207   raw_string_ostream KdStream(Kd);
2208   KdStream << ".amdhsa_kernel " << KdName << '\n';
2209 
2210   DataExtractor::Cursor C(0);
2211   while (C && C.tell() < Bytes.size()) {
2212     MCDisassembler::DecodeStatus Status =
2213         decodeKernelDescriptorDirective(C, Bytes, KdStream);
2214 
2215     cantFail(C.takeError());
2216 
2217     if (Status == MCDisassembler::Fail)
2218       return MCDisassembler::Fail;
2219   }
2220   KdStream << ".end_amdhsa_kernel\n";
2221   outs() << KdStream.str();
2222   return MCDisassembler::Success;
2223 }
2224 
2225 std::optional<MCDisassembler::DecodeStatus>
2226 AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
2227                                   ArrayRef<uint8_t> Bytes, uint64_t Address,
2228                                   raw_ostream &CStream) const {
  // Right now only the kernel descriptor needs to be handled; we ignore all
  // other symbols for target-specific handling.
2231   // TODO:
2232   // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
2233   // Object V2 and V3 when symbols are marked protected.
2234 
2235   // amd_kernel_code_t for Code Object V2.
2236   if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
2237     Size = 256;
2238     return MCDisassembler::Fail;
2239   }
2240 
2241   // Code Object V3 kernel descriptors.
2242   StringRef Name = Symbol.Name;
2243   if (Symbol.Type == ELF::STT_OBJECT && Name.ends_with(StringRef(".kd"))) {
2244     Size = 64; // Size = 64 regardless of success or failure.
2245     return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
2246   }
2247   return std::nullopt;
2248 }
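
// Informal example: for an STT_OBJECT symbol named "my_kernel.kd", the
// 64-byte blob at its address is decoded and printed as an
// ".amdhsa_kernel my_kernel" directive block; all other symbols return
// std::nullopt so the generic disassembly path handles them.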
2249 
2250 //===----------------------------------------------------------------------===//
2251 // AMDGPUSymbolizer
2252 //===----------------------------------------------------------------------===//
2253 
// Try to find a symbol name for the specified label.
2255 bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
2256     MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
2257     uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
2258     uint64_t /*OpSize*/, uint64_t /*InstSize*/) {
2259 
2260   if (!IsBranch) {
2261     return false;
2262   }
2263 
2264   auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
2265   if (!Symbols)
2266     return false;
2267 
2268   auto Result = llvm::find_if(*Symbols, [Value](const SymbolInfoTy &Val) {
2269     return Val.Addr == static_cast<uint64_t>(Value) &&
2270            Val.Type == ELF::STT_NOTYPE;
2271   });
2272   if (Result != Symbols->end()) {
2273     auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
2274     const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
2275     Inst.addOperand(MCOperand::createExpr(Add));
2276     return true;
2277   }
2278   // Add to list of referenced addresses, so caller can synthesize a label.
2279   ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
2280   return false;
2281 }
2282 
2283 void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
2284                                                        int64_t Value,
2285                                                        uint64_t Address) {
2286   llvm_unreachable("unimplemented");
2287 }
2288 
2289 //===----------------------------------------------------------------------===//
2290 // Initialization
2291 //===----------------------------------------------------------------------===//
2292 
2293 static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
2294                               LLVMOpInfoCallback /*GetOpInfo*/,
2295                               LLVMSymbolLookupCallback /*SymbolLookUp*/,
2296                               void *DisInfo,
2297                               MCContext *Ctx,
2298                               std::unique_ptr<MCRelocationInfo> &&RelInfo) {
2299   return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
2300 }
2301 
2302 static MCDisassembler *createAMDGPUDisassembler(const Target &T,
2303                                                 const MCSubtargetInfo &STI,
2304                                                 MCContext &Ctx) {
2305   return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
2306 }
2307 
2308 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
2309   TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
2310                                          createAMDGPUDisassembler);
2311   TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
2312                                        createAMDGPUSymbolizer);
2313 }
2314