//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
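  // 5 is the deepest wait-state window any of the check* routines below
  // request (e.g. VmemSgprWaitStates and DppExecWaitStates are both 5).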
  MaxLookAhead = 5;
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return NoopHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return NoopHazard;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;

  if (checkAnyInstHazards(MI) > 0)
    return NoopHazard;

  return NoHazard;
}

static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(0);
}

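// Advance past a BUNDLE header: run the hazard checks on each instruction
// inside the bundle, insert s_nops into the bundle where wait states are
// required, and record what was emitted so later queries can see it.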
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
  IsHazardRecognizerMode = false;
  return PreEmitNoopsCommon(SU->getInstr());
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = std::max(0, checkAnyInstHazards(MI));

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr)
    return;

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first.  Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

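// Callback used to bound the backwards hazard search. It receives the
// instruction currently being inspected (nullptr at basic block boundaries)
// and the wait states accumulated so far, and returns true once scanning
// further back can no longer change the answer.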
typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

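// Returns the number of wait states that have elapsed since the most recent
// instruction for which \p IsHazard returns true, or INT_MAX if none is found
// within \p Limit wait states. In hazard recognizer mode this walks backwards
// through the CFG; otherwise it replays the EmittedInstrs history maintained
// by AdvanceCycle().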
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions.  The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading the
    // descriptor need some number of nops in between. We don't know how many
    // are needed, but let's use 4. This probably wasn't discovered before
    // because the only case in which this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in the closed-source
    // land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

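  // If no VALU write to vcc is found within the window, getWaitStatesSinceDef
  // returns INT_MAX and the subtraction below goes negative; callers treat
  // non-positive results as "no hazard".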
  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates =
      ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

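// If \p MI is a store whose data operand could be overwritten by a following
// VALU instruction (the 12-dword-store hazard checked below), return the
// index of that data operand; otherwise return -1.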
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                                const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  unsigned Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  unsigned LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
  if (MI->isDebugInstr())
    return 0;

  if (!ST.hasSMovFedHazard())
    return 0;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Check for any instruction reading an SGPR after a write from
  // s_mov_fed_b32.
  int MovFedWaitStates = 1;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Use : MI->uses()) {
    if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    auto IsHazardFn = [] (MachineInstr *MI) {
      return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
    };
    int WaitStatesNeededForUse =
        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
                                                 MovFedWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

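// Instructions that read M0 implicitly (v_interp_*, s_movrel*, s_sendmsg and
// GDS accesses) need one wait state after an SALU write to M0.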
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

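// The fix* routines below resolve hazards by inserting new instructions
// rather than by counting s_nop wait states. They only run in hazard
// recognizer mode (see PreEmitNoops(MachineInstr *) and processBundle()).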
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op || (Op->isImplicit() && Op->getReg() == AMDGPU::EXEC))
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const unsigned SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  // This assumes that there will be s_waitcnt lgkmcnt(0) or equivalent
  // between any at-risk SMEM and any SALU dependent on the SMEM results.
  auto IsExpiredFn = [TII] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        if (TII->isSOPP(*MI))
          return false;
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
        case AMDGPU::S_WAITCNT_LGKMCNT:
          return false;
        default:
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

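  // Break the hazard with an SALU write that IsExpiredFn above does not
  // exempt; s_mov_b32 to the null register has no other architectural effect.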
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

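  // Emit an s_waitcnt_depctr that satisfies the expiry condition above; the
  // 0xfffe immediate is understood to encode sa_sdst(0), i.e. wait for all
  // VALU writes to SGPRs to drain before proceeding.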
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

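  // Classify instructions for this hazard: 1 for LDS (DS) accesses, 2 for
  // VMEM or segment-specific FLAT accesses, 0 otherwise. The hazard arises
  // when a branch separates accesses of the two kinds without an intervening
  // "s_waitcnt_vscnt null, 0".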
  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}