//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

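// Only the s_waitcnt family needs post-processing (its operands must be
// preserved for later analysis; see processWaitCnt below); every other
// instruction is left untouched.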
void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

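// Returns how many cycles the instruction behind IR needs to stall because of
// a target-specific hazard; 0 means no custom hazard. Only the s_waitcnt
// family is modelled here.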
unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
  // pseudo instructions here. However, there are plans for the future to make
  // it possible to use mca within backend passes. As such, I have left the
  // pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

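// Determines how many more cycles this s_waitcnt has to stall: compare the
// counter limits encoded by the instruction against the in-flight
// instructions that affect each counter, and return the number of cycles
// until the hazard should be re-checked (0 if nothing left to wait for).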
unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.
  // Start each counter at its maximum value; a counter left at its maximum is
  // treated as "no wait requested" for that counter.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

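  // Stall only if some outstanding count still exceeds the limit this
  // s_waitcnt asked for, and then only until the soonest-finishing offending
  // instruction retires; the hazard is re-evaluated after that.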
  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

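// Decode the counter thresholds encoded by the s_waitcnt instruction in IR
// into the output parameters. Counters the instruction does not mention keep
// whatever values the caller passed in.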
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the instructions
  // that are being looked at are in the MachineInstr format, whereas we have
  // access to the MCInst format. The side effects of this are that we can't use
  // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
  // functions. Therefore, we conservatively assume that these functions will
  // return true. This may cause a few instructions to be incorrectly tagged
  // with an extra CNT. However, these are instructions that do interact with at
  // least one CNT so giving them an extra CNT shouldn't cause issues in most
  // scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

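// Factory functions registered with the TargetRegistry below; llvm-mca calls
// them to construct the target-specific CustomBehaviour and InstrPostProcess
// objects.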
static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}