1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/ScheduleDAG.h"
19 #include "llvm/Support/TargetParser.h"
20 
21 using namespace llvm;
22 
23 namespace {
24 
25 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27 
28   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29     if (Arg.getAsInteger(0, Value))
30       return O.error("'" + Arg + "' value invalid for uint argument!");
31 
32     if (Value > 100)
33       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34 
35     return false;
36   }
37 };
38 
39 } // end anonymous namespace
40 
41 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43                      cl::desc("Fill a percentage of the latency between "
44                               "neighboring MFMA with s_nops."));
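// For example (a usage sketch, not an exhaustive description):
// -amdgpu-mfma-padding-ratio=50 requests that roughly half of a neighboring
// MFMA's latency be covered with s_nop padding; see checkMFMAPadding() below.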
45 
46 //===----------------------------------------------------------------------===//
47 // Hazard Recognizer Implementation
48 //===----------------------------------------------------------------------===//
49 
50 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51                                                  const GCNSubtarget &ST);
52 
53 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54   IsHazardRecognizerMode(false),
55   CurrCycleInstr(nullptr),
56   MF(MF),
57   ST(MF.getSubtarget<GCNSubtarget>()),
58   TII(*ST.getInstrInfo()),
59   TRI(TII.getRegisterInfo()),
60   ClauseUses(TRI.getNumRegUnits()),
61   ClauseDefs(TRI.getNumRegUnits()) {
62   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63   TSchedModel.init(&ST);
64   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65 }
66 
67 void GCNHazardRecognizer::Reset() {
68   EmittedInstrs.clear();
69 }
70 
71 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72   EmitInstruction(SU->getInstr());
73 }
74 
75 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76   CurrCycleInstr = MI;
77 }
78 
79 static bool isDivFMas(unsigned Opcode) {
80   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81 }
82 
83 static bool isSGetReg(unsigned Opcode) {
84   return Opcode == AMDGPU::S_GETREG_B32;
85 }
86 
87 static bool isSSetReg(unsigned Opcode) {
88   switch (Opcode) {
89   case AMDGPU::S_SETREG_B32:
90   case AMDGPU::S_SETREG_B32_mode:
91   case AMDGPU::S_SETREG_IMM32_B32:
92   case AMDGPU::S_SETREG_IMM32_B32_mode:
93     return true;
94   }
95   return false;
96 }
97 
98 static bool isRWLane(unsigned Opcode) {
99   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100 }
101 
102 static bool isRFE(unsigned Opcode) {
103   return Opcode == AMDGPU::S_RFE_B64;
104 }
105 
106 static bool isSMovRel(unsigned Opcode) {
107   switch (Opcode) {
108   case AMDGPU::S_MOVRELS_B32:
109   case AMDGPU::S_MOVRELS_B64:
110   case AMDGPU::S_MOVRELD_B32:
111   case AMDGPU::S_MOVRELD_B64:
112     return true;
113   default:
114     return false;
115   }
116 }
117 
118 static bool isDGEMM(unsigned Opcode) {
119   return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
120          Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
121          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
122          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 ||
123          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 ||
124          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64;
125 }
126 
127 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
128   unsigned Opcode = MI.getOpcode();
129 
130   if (!SIInstrInfo::isMAI(MI) ||
131       isDGEMM(Opcode) ||
132       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
133       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
134     return false;
135 
136   return true;
137 }
138 
139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140                                     const MachineInstr &MI) {
141   if (TII.isAlwaysGDS(MI.getOpcode()))
142     return true;
143 
144   switch (MI.getOpcode()) {
145   case AMDGPU::S_SENDMSG:
146   case AMDGPU::S_SENDMSGHALT:
147   case AMDGPU::S_TTRACEDATA:
148     return true;
149   // These DS opcodes don't support GDS.
150   case AMDGPU::DS_NOP:
151   case AMDGPU::DS_PERMUTE_B32:
152   case AMDGPU::DS_BPERMUTE_B32:
153     return false;
154   default:
155     if (TII.isDS(MI.getOpcode())) {
156       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157                                            AMDGPU::OpName::gds);
158       if (MI.getOperand(GDS).getImm())
159         return true;
160     }
161     return false;
162   }
163 }
164 
165 static bool isPermlane(const MachineInstr &MI) {
166   unsigned Opcode = MI.getOpcode();
167   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
169 }
170 
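// Returns the hardware register ID encoded in the simm16 operand of an
// s_getreg/s_setreg instruction. The operand also packs offset and size
// fields, but only the ID field is compared for hazard purposes.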
171 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
172   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
173                                                      AMDGPU::OpName::simm16);
174   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
175 }
176 
177 ScheduleHazardRecognizer::HazardType
178 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
179   MachineInstr *MI = SU->getInstr();
180   // If we are not in "HazardRecognizerMode", and are therefore being run from
181   // the scheduler, track possible stalls from hazards but don't insert noops.
182   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
183 
184   if (MI->isBundle())
185     return NoHazard;
186 
187   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
188     return HazardType;
189 
190   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
191     return HazardType;
192 
193   if (checkFPAtomicToDenormModeHazard(MI) > 0)
194     return HazardType;
195 
196   if (ST.hasNoDataDepHazard())
197     return NoHazard;
198 
199   // FIXME: Should flat be considered vmem?
200   if ((SIInstrInfo::isVMEM(*MI) ||
201        SIInstrInfo::isFLAT(*MI))
202       && checkVMEMHazards(MI) > 0)
203     return HazardType;
204 
205   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
206     return HazardType;
207 
208   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
209     return HazardType;
210 
211   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
212     return HazardType;
213 
214   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
215     return HazardType;
216 
217   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
218        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
219        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
220     return HazardType;
221 
222   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
223     return HazardType;
224 
225   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
226     return HazardType;
227 
228   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
229     return HazardType;
230 
231   if (ST.hasReadM0MovRelInterpHazard() &&
232       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
233       checkReadM0Hazards(MI) > 0)
234     return HazardType;
235 
236   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
237       checkReadM0Hazards(MI) > 0)
238     return HazardType;
239 
240   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
241     return HazardType;
242 
243   if ((SIInstrInfo::isVMEM(*MI) ||
244        SIInstrInfo::isFLAT(*MI) ||
245        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
246     return HazardType;
247 
248   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
249     return HazardType;
250 
251   return NoHazard;
252 }
253 
254 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
255                                 unsigned Quantity) {
256   while (Quantity > 0) {
257     unsigned Arg = std::min(Quantity, 8u);
258     Quantity -= Arg;
259     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
260         .addImm(Arg - 1);
261   }
262 }
263 
264 unsigned
265 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
266   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
267   assert(TSchedModel.getWriteProcResBegin(SC) !=
268          TSchedModel.getWriteProcResEnd(SC));
269   return TSchedModel.getWriteProcResBegin(SC)->Cycles;
270 }
271 
272 void GCNHazardRecognizer::processBundle() {
273   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
274   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
275   // Check bundled MachineInstrs for hazards.
276   for (; MI != E && MI->isInsideBundle(); ++MI) {
277     CurrCycleInstr = &*MI;
278     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
279 
280     if (IsHazardRecognizerMode) {
281       fixHazards(CurrCycleInstr);
282 
283       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
284     }
285 
286     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
287     // include the bundled MI directly after, only add a maximum of
288     // (MaxLookAhead - 1) noops to EmittedInstrs.
289     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
290       EmittedInstrs.push_front(nullptr);
291 
292     EmittedInstrs.push_front(CurrCycleInstr);
293     EmittedInstrs.resize(MaxLookAhead);
294   }
295   CurrCycleInstr = nullptr;
296 }
297 
298 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
299   IsHazardRecognizerMode = true;
300   CurrCycleInstr = MI;
301   unsigned W = PreEmitNoopsCommon(MI);
302   fixHazards(MI);
303   CurrCycleInstr = nullptr;
304   return W;
305 }
306 
307 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
308   if (MI->isBundle())
309     return 0;
310 
311   int WaitStates = 0;
312 
313   if (SIInstrInfo::isSMRD(*MI))
314     return std::max(WaitStates, checkSMRDHazards(MI));
315 
316   if (ST.hasNSAtoVMEMBug())
317     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
318 
319   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
320 
321   if (ST.hasNoDataDepHazard())
322     return WaitStates;
323 
324   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
325     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
326 
327   if (SIInstrInfo::isVALU(*MI))
328     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
329 
330   if (SIInstrInfo::isDPP(*MI))
331     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
332 
333   if (isDivFMas(MI->getOpcode()))
334     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
335 
336   if (isRWLane(MI->getOpcode()))
337     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
338 
339   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
340        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
341        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
342     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
343 
344   if (MI->isInlineAsm())
345     return std::max(WaitStates, checkInlineAsmHazards(MI));
346 
347   if (isSGetReg(MI->getOpcode()))
348     return std::max(WaitStates, checkGetRegHazards(MI));
349 
350   if (isSSetReg(MI->getOpcode()))
351     return std::max(WaitStates, checkSetRegHazards(MI));
352 
353   if (isRFE(MI->getOpcode()))
354     return std::max(WaitStates, checkRFEHazards(MI));
355 
356   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
357                                            isSMovRel(MI->getOpcode())))
358     return std::max(WaitStates, checkReadM0Hazards(MI));
359 
360   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
361     return std::max(WaitStates, checkReadM0Hazards(MI));
362 
363   if (SIInstrInfo::isMAI(*MI))
364     return std::max(WaitStates, checkMAIHazards(MI));
365 
366   if (SIInstrInfo::isVMEM(*MI) ||
367       SIInstrInfo::isFLAT(*MI) ||
368       SIInstrInfo::isDS(*MI))
369     return std::max(WaitStates, checkMAILdStHazards(MI));
370 
371   return WaitStates;
372 }
373 
374 void GCNHazardRecognizer::EmitNoop() {
375   EmittedInstrs.push_front(nullptr);
376 }
377 
378 void GCNHazardRecognizer::AdvanceCycle() {
379   // When the scheduler detects a stall, it will call AdvanceCycle() without
380   // emitting any instructions.
381   if (!CurrCycleInstr) {
382     EmittedInstrs.push_front(nullptr);
383     return;
384   }
385 
386   if (CurrCycleInstr->isBundle()) {
387     processBundle();
388     return;
389   }
390 
391   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
392   if (!NumWaitStates) {
393     CurrCycleInstr = nullptr;
394     return;
395   }
396 
397   // Keep track of emitted instructions
398   EmittedInstrs.push_front(CurrCycleInstr);
399 
400   // Add a nullptr for each additional wait state after the first.  Make sure
401   // not to add more than getMaxLookAhead() items to the list, since we
402   // truncate the list to that size right after this loop.
403   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
404        i < e; ++i) {
405     EmittedInstrs.push_front(nullptr);
406   }
407 
408   // getMaxLookAhead() is the largest number of wait states we will ever need
409   // to insert, so there is no point in keeping track of more than that many
410   // wait states.
411   EmittedInstrs.resize(getMaxLookAhead());
412 
413   CurrCycleInstr = nullptr;
414 }
415 
416 void GCNHazardRecognizer::RecedeCycle() {
417   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
418 }
419 
420 //===----------------------------------------------------------------------===//
421 // Helper Functions
422 //===----------------------------------------------------------------------===//
423 
424 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
425 
426 // Returns the minimum number of wait states since \p I, walking all
427 // predecessors. Only scans until \p IsExpired returns true.
428 // Can only be run in a hazard recognizer mode.
429 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
430                               const MachineBasicBlock *MBB,
431                               MachineBasicBlock::const_reverse_instr_iterator I,
432                               int WaitStates, IsExpiredFn IsExpired,
433                               DenseSet<const MachineBasicBlock *> &Visited) {
434   for (auto E = MBB->instr_rend(); I != E; ++I) {
435     // Don't add WaitStates for parent BUNDLE instructions.
436     if (I->isBundle())
437       continue;
438 
439     if (IsHazard(*I))
440       return WaitStates;
441 
442     if (I->isInlineAsm())
443       continue;
444 
445     WaitStates += SIInstrInfo::getNumWaitStates(*I);
446 
447     if (IsExpired(*I, WaitStates))
448       return std::numeric_limits<int>::max();
449   }
450 
451   int MinWaitStates = std::numeric_limits<int>::max();
452   for (MachineBasicBlock *Pred : MBB->predecessors()) {
453     if (!Visited.insert(Pred).second)
454       continue;
455 
456     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
457                                WaitStates, IsExpired, Visited);
458 
459     MinWaitStates = std::min(MinWaitStates, W);
460   }
461 
462   return MinWaitStates;
463 }
464 
465 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
466                               const MachineInstr *MI, IsExpiredFn IsExpired) {
467   DenseSet<const MachineBasicBlock *> Visited;
468   return getWaitStatesSince(IsHazard, MI->getParent(),
469                             std::next(MI->getReverseIterator()),
470                             0, IsExpired, Visited);
471 }
472 
473 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
474   if (IsHazardRecognizerMode) {
475     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
476       return WaitStates >= Limit;
477     };
478     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
479   }
480 
481   int WaitStates = 0;
482   for (MachineInstr *MI : EmittedInstrs) {
483     if (MI) {
484       if (IsHazard(*MI))
485         return WaitStates;
486 
487       if (MI->isInlineAsm())
488         continue;
489     }
490     ++WaitStates;
491 
492     if (WaitStates >= Limit)
493       break;
494   }
495   return std::numeric_limits<int>::max();
496 }
497 
498 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
499                                                IsHazardFn IsHazardDef,
500                                                int Limit) {
501   const SIRegisterInfo *TRI = ST.getRegisterInfo();
502 
503   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
504     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
505   };
506 
507   return getWaitStatesSince(IsHazardFn, Limit);
508 }
509 
510 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
511                                                   int Limit) {
512   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
513     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
514   };
515 
516   return getWaitStatesSince(IsHazardFn, Limit);
517 }
518 
519 //===----------------------------------------------------------------------===//
520 // No-op Hazard Detection
521 //===----------------------------------------------------------------------===//
522 
523 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
524                         MCRegister Reg) {
525   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
526     BV.set(*RUI);
527 }
528 
529 static void addRegsToSet(const SIRegisterInfo &TRI,
530                          iterator_range<MachineInstr::const_mop_iterator> Ops,
531                          BitVector &Set) {
532   for (const MachineOperand &Op : Ops) {
533     if (Op.isReg())
534       addRegUnits(TRI, Set, Op.getReg().asMCReg());
535   }
536 }
537 
538 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
539   // XXX: Do we need to worry about implicit operands?
540   addRegsToSet(TRI, MI.defs(), ClauseDefs);
541   addRegsToSet(TRI, MI.uses(), ClauseUses);
542 }
543 
544 static bool breaksSMEMSoftClause(MachineInstr *MI) {
545   return !SIInstrInfo::isSMRD(*MI);
546 }
547 
548 static bool breaksVMEMSoftClause(MachineInstr *MI) {
549   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
550 }
551 
552 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
553   // SMEM soft clauses are only present on VI+, and only matter if xnack is
554   // enabled.
555   if (!ST.isXNACKEnabled())
556     return 0;
557 
558   bool IsSMRD = TII.isSMRD(*MEM);
559 
560   resetClause();
561 
562   // A soft-clause is any group of consecutive SMEM instructions.  The
563   // instructions in this group may return out of order and/or may be
564   // replayed (i.e. the same instruction issued more than once).
565   //
566   // In order to handle these situations correctly we need to make sure that
567   // when a clause has more than one instruction, no instruction in the clause
568   // writes to a register that is read by another instruction in the clause
569   // (including itself). If we encounter this situation, we need to break the
570   // clause by inserting a non-SMEM instruction.
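  //
  // For example (an illustrative sketch, not taken from real output):
  //   s_load_dwordx2 s[0:1], s[4:5], 0x0
  //   s_load_dword   s2, s[0:1], 0x0     ; reads s[0:1] written in the clause
  // would require a non-SMEM instruction between the two loads.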
571 
572   for (MachineInstr *MI : EmittedInstrs) {
573     // When we hit a non-SMEM instruction then we have passed the start of the
574     // clause and we can stop.
575     if (!MI)
576       break;
577 
578     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
579       break;
580 
581     addClauseInst(*MI);
582   }
583 
584   if (ClauseDefs.none())
585     return 0;
586 
587   // We need to make sure not to put loads and stores in the same clause if they
588   // use the same address. For now, just start a new clause whenever we see a
589   // store.
590   if (MEM->mayStore())
591     return 1;
592 
593   addClauseInst(*MEM);
594 
595   // If the set of defs and uses intersect then we cannot add this instruction
596   // to the clause, so we have a hazard.
597   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
598 }
599 
600 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
601   int WaitStatesNeeded = 0;
602 
603   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
604 
605   // This SMRD hazard only affects SI.
606   if (!ST.hasSMRDReadVALUDefHazard())
607     return WaitStatesNeeded;
608 
609   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
610   // SGPR was written by a VALU instruction.
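  //
  // Illustrative sequence (a sketch only, 4 wait states covered by s_nop 3):
  //   v_readfirstlane_b32 s0, v2         ; VALU write of an SGPR
  //   s_nop 3
  //   s_load_dword s1, s[4:5], s0        ; SMRD read of that SGPR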
611   int SmrdSgprWaitStates = 4;
612   auto IsHazardDefFn = [this](const MachineInstr &MI) {
613     return TII.isVALU(MI);
614   };
615   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
616     return TII.isSALU(MI);
617   };
618 
619   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
620 
621   for (const MachineOperand &Use : SMRD->uses()) {
622     if (!Use.isReg())
623       continue;
624     int WaitStatesNeededForUse =
625         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
626                                                    SmrdSgprWaitStates);
627     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
628 
629     // This fixes what appears to be undocumented hardware behavior in SI where
630     // an s_mov writing a descriptor and an s_buffer_load_dword reading the
631     // descriptor need some number of nops in between. We don't know how many we
632     // need, but let's use 4. This wasn't discovered before probably because the
633     // only case when this happens is when we expand a 64-bit pointer into a full
634     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
635     // probably never encountered in closed-source land.
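    //
    // Illustrative sequence (a sketch of the pattern being guarded against):
    //   s_mov_b32 s7, 0                  ; writes part of the descriptor
    //   s_nop 3
    //   s_buffer_load_dword s0, s[4:7], 0x0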
636     if (IsBufferSMRD) {
637       int WaitStatesNeededForUse =
638         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
639                                                    IsBufferHazardDefFn,
640                                                    SmrdSgprWaitStates);
641       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
642     }
643   }
644 
645   return WaitStatesNeeded;
646 }
647 
648 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
649   if (!ST.hasVMEMReadSGPRVALUDefHazard())
650     return 0;
651 
652   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
653 
654   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
655   // SGPR was written by a VALU Instruction.
656   const int VmemSgprWaitStates = 5;
657   auto IsHazardDefFn = [this](const MachineInstr &MI) {
658     return TII.isVALU(MI);
659   };
660   for (const MachineOperand &Use : VMEM->uses()) {
661     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
662       continue;
663 
664     int WaitStatesNeededForUse =
665         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
666                                                    VmemSgprWaitStates);
667     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
668   }
669   return WaitStatesNeeded;
670 }
671 
672 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
673   const SIRegisterInfo *TRI = ST.getRegisterInfo();
674   const SIInstrInfo *TII = ST.getInstrInfo();
675 
676   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
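  // Illustrative VGPR case (a sketch only, 2 wait states covered by s_nop 1):
  //   v_add_f32_e32 v1, v2, v3
  //   s_nop 1
  //   v_mov_b32_dpp v4, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf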
677   int DppVgprWaitStates = 2;
678   int DppExecWaitStates = 5;
679   int WaitStatesNeeded = 0;
680   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
681     return TII->isVALU(MI);
682   };
683 
684   for (const MachineOperand &Use : DPP->uses()) {
685     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
686       continue;
687     int WaitStatesNeededForUse =
688         DppVgprWaitStates - getWaitStatesSinceDef(
689                                 Use.getReg(),
690                                 [](const MachineInstr &) { return true; },
691                                 DppVgprWaitStates);
692     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
693   }
694 
695   WaitStatesNeeded = std::max(
696       WaitStatesNeeded,
697       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
698                                                 DppExecWaitStates));
699 
700   return WaitStatesNeeded;
701 }
702 
703 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
704   const SIInstrInfo *TII = ST.getInstrInfo();
705 
706   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
707   // instruction.
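  //
  // Illustrative sequence (a sketch only, 4 wait states covered by s_nop 3):
  //   v_cmp_eq_u32_e32 vcc, v0, v1       ; VALU write to VCC
  //   s_nop 3
  //   v_div_fmas_f32 v2, v3, v4, v5      ; implicitly reads VCC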
708   const int DivFMasWaitStates = 4;
709   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
710     return TII->isVALU(MI);
711   };
712   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
713                                                DivFMasWaitStates);
714 
715   return DivFMasWaitStates - WaitStatesNeeded;
716 }
717 
718 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
719   const SIInstrInfo *TII = ST.getInstrInfo();
720   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
721 
722   const int GetRegWaitStates = 2;
723   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
724     return GetRegHWReg == getHWReg(TII, MI);
725   };
726   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
727 
728   return GetRegWaitStates - WaitStatesNeeded;
729 }
730 
731 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
732   const SIInstrInfo *TII = ST.getInstrInfo();
733   unsigned HWReg = getHWReg(TII, *SetRegInstr);
734 
735   const int SetRegWaitStates = ST.getSetRegWaitStates();
736   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
737     return HWReg == getHWReg(TII, MI);
738   };
739   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
740   return SetRegWaitStates - WaitStatesNeeded;
741 }
742 
743 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
744   if (!MI.mayStore())
745     return -1;
746 
747   const SIInstrInfo *TII = ST.getInstrInfo();
748   unsigned Opcode = MI.getOpcode();
749   const MCInstrDesc &Desc = MI.getDesc();
750 
751   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
752   int VDataRCID = -1;
753   if (VDataIdx != -1)
754     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
755 
756   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
757     // There is no hazard if the instruction does not use vector regs
758     // (like wbinvl1)
759     if (VDataIdx == -1)
760       return -1;
761     // For MUBUF/MTBUF instructions this hazard only exists if the
762     // instruction is not using a register in the soffset field.
763     const MachineOperand *SOffset =
764         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
765     // If we have no soffset operand, then assume this field has been
766     // hardcoded to zero.
767     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
768         (!SOffset || !SOffset->isReg()))
769       return VDataIdx;
770   }
771 
772   // MIMG instructions create a hazard if they don't use a 256-bit T# and
773   // the store size is greater than 8 bytes and they have more than two bits
774   // of their dmask set.
775   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
776   if (TII->isMIMG(MI)) {
777     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
778     assert(SRsrcIdx != -1 &&
779            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
780     (void)SRsrcIdx;
781   }
782 
783   if (TII->isFLAT(MI)) {
784     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
785     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
786       return DataIdx;
787   }
788 
789   return -1;
790 }
791 
792 int
793 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
794                                             const MachineRegisterInfo &MRI) {
795   // Helper to check for the hazard where VMEM instructions that store more than
796   // 8 bytes can have their store data overwritten by the next instruction.
797   const SIRegisterInfo *TRI = ST.getRegisterInfo();
798 
799   const int VALUWaitStates = 1;
800   int WaitStatesNeeded = 0;
801 
802   if (!TRI->isVectorRegister(MRI, Def.getReg()))
803     return WaitStatesNeeded;
804   Register Reg = Def.getReg();
805   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
806     int DataIdx = createsVALUHazard(MI);
807     return DataIdx >= 0 &&
808            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
809   };
810   int WaitStatesNeededForDef =
811     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
812   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
813 
814   return WaitStatesNeeded;
815 }
816 
817 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
818   // This checks for the hazard where VMEM instructions that store more than
819   // 8 bytes can have their store data overwritten by the next instruction.
820   if (!ST.has12DWordStoreHazard())
821     return 0;
822 
823   const MachineRegisterInfo &MRI = MF.getRegInfo();
824   int WaitStatesNeeded = 0;
825 
826   for (const MachineOperand &Def : VALU->defs()) {
827     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
828   }
829 
830   return WaitStatesNeeded;
831 }
832 
833 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
834   // This checks for hazards associated with inline asm statements.
835   // Since inline asms can contain just about anything, we use this
836   // to call/leverage other check*Hazard routines. Note that
837   // this function doesn't attempt to address all possible inline asm
838   // hazards (good luck), but is a collection of what has been
839   // problematic thus far.
840 
841   // see checkVALUHazards()
842   if (!ST.has12DWordStoreHazard())
843     return 0;
844 
845   const MachineRegisterInfo &MRI = MF.getRegInfo();
846   int WaitStatesNeeded = 0;
847 
848   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
849        I != E; ++I) {
850     const MachineOperand &Op = IA->getOperand(I);
851     if (Op.isReg() && Op.isDef()) {
852       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
853     }
854   }
855 
856   return WaitStatesNeeded;
857 }
858 
859 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
860   const SIInstrInfo *TII = ST.getInstrInfo();
861   const SIRegisterInfo *TRI = ST.getRegisterInfo();
862   const MachineRegisterInfo &MRI = MF.getRegInfo();
863 
864   const MachineOperand *LaneSelectOp =
865       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
866 
867   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
868     return 0;
869 
870   Register LaneSelectReg = LaneSelectOp->getReg();
871   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
872 
873   const int RWLaneWaitStates = 4;
874   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
875                                               RWLaneWaitStates);
876   return RWLaneWaitStates - WaitStatesSince;
877 }
878 
879 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
880   if (!ST.hasRFEHazards())
881     return 0;
882 
883   const SIInstrInfo *TII = ST.getInstrInfo();
884 
885   const int RFEWaitStates = 1;
886 
887   auto IsHazardFn = [TII](const MachineInstr &MI) {
888     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
889   };
890   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
891   return RFEWaitStates - WaitStatesNeeded;
892 }
893 
894 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
895   const SIInstrInfo *TII = ST.getInstrInfo();
896   const int SMovRelWaitStates = 1;
897   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
898   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
899                                                    SMovRelWaitStates);
900 }
901 
902 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
903   fixVMEMtoScalarWriteHazards(MI);
904   fixVcmpxPermlaneHazards(MI);
905   fixSMEMtoVectorWriteHazards(MI);
906   fixVcmpxExecWARHazard(MI);
907   fixLdsBranchVmemWARHazard(MI);
908 }
909 
910 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
911   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
912     return false;
913 
914   const SIInstrInfo *TII = ST.getInstrInfo();
915   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
916 
917   auto IsExpiredFn = [](const MachineInstr &MI, int) {
918     unsigned Opc = MI.getOpcode();
919     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
920            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
921   };
922 
923   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
924       std::numeric_limits<int>::max())
925     return false;
926 
927   // V_NOP will be discarded by SQ.
928   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
929   // which is always a VGPR and available.
930   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
931   Register Reg = Src0->getReg();
932   bool IsUndef = Src0->isUndef();
933   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
934           TII->get(AMDGPU::V_MOV_B32_e32))
935     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
936     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
937 
938   return true;
939 }
940 
941 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
942   if (!ST.hasVMEMtoScalarWriteHazard())
943     return false;
944 
945   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
946     return false;
947 
948   if (MI->getNumDefs() == 0)
949     return false;
950 
951   const SIRegisterInfo *TRI = ST.getRegisterInfo();
952 
953   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
954     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
955         !SIInstrInfo::isFLAT(I))
956       return false;
957 
958     for (const MachineOperand &Def : MI->defs()) {
959       const MachineOperand *Op =
960           I.findRegisterUseOperand(Def.getReg(), false, TRI);
961       if (!Op)
962         continue;
963       return true;
964     }
965     return false;
966   };
967 
968   auto IsExpiredFn = [](const MachineInstr &MI, int) {
969     return SIInstrInfo::isVALU(MI) ||
970            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
971             !MI.getOperand(0).getImm()) ||
972            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
973             MI.getOperand(0).getImm() == 0xffe3);
974   };
975 
976   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
977       std::numeric_limits<int>::max())
978     return false;
979 
980   const SIInstrInfo *TII = ST.getInstrInfo();
981   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
982           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
983       .addImm(0xffe3);
984   return true;
985 }
986 
987 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
988   if (!ST.hasSMEMtoVectorWriteHazard())
989     return false;
990 
991   if (!SIInstrInfo::isVALU(*MI))
992     return false;
993 
994   unsigned SDSTName;
995   switch (MI->getOpcode()) {
996   case AMDGPU::V_READLANE_B32:
997   case AMDGPU::V_READFIRSTLANE_B32:
998     SDSTName = AMDGPU::OpName::vdst;
999     break;
1000   default:
1001     SDSTName = AMDGPU::OpName::sdst;
1002     break;
1003   }
1004 
1005   const SIInstrInfo *TII = ST.getInstrInfo();
1006   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1007   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1008   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1009   if (!SDST) {
1010     for (const auto &MO : MI->implicit_operands()) {
1011       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
1012         SDST = &MO;
1013         break;
1014       }
1015     }
1016   }
1017 
1018   if (!SDST)
1019     return false;
1020 
1021   const Register SDSTReg = SDST->getReg();
1022   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1023     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1024   };
1025 
1026   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1027     if (TII->isSALU(MI)) {
1028       switch (MI.getOpcode()) {
1029       case AMDGPU::S_SETVSKIP:
1030       case AMDGPU::S_VERSION:
1031       case AMDGPU::S_WAITCNT_VSCNT:
1032       case AMDGPU::S_WAITCNT_VMCNT:
1033       case AMDGPU::S_WAITCNT_EXPCNT:
1034         // These instructions cannot mitigate the hazard.
1035         return false;
1036       case AMDGPU::S_WAITCNT_LGKMCNT:
1037         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1038         return (MI.getOperand(1).getImm() == 0) &&
1039                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1040       case AMDGPU::S_WAITCNT: {
1041         const int64_t Imm = MI.getOperand(0).getImm();
1042         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1043         return (Decoded.LgkmCnt == 0);
1044       }
1045       default:
1046         // SOPP instructions cannot mitigate the hazard.
1047         if (TII->isSOPP(MI))
1048           return false;
1049         // At this point the SALU can be assumed to mitigate the hazard
1050         // because either:
1051         // (a) it is independent of the at-risk SMEM (breaking the chain),
1052         // or
1053         // (b) it is dependent on the SMEM, in which case an appropriate
1054         //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
1055         //     SMEM instruction.
1056         return true;
1057       }
1058     }
1059     return false;
1060   };
1061 
1062   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1063       std::numeric_limits<int>::max())
1064     return false;
1065 
1066   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1067           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1068       .addImm(0);
1069   return true;
1070 }
1071 
1072 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1073   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1074     return false;
1075 
1076   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1077   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1078     return false;
1079 
1080   auto IsHazardFn = [TRI](const MachineInstr &I) {
1081     if (SIInstrInfo::isVALU(I))
1082       return false;
1083     return I.readsRegister(AMDGPU::EXEC, TRI);
1084   };
1085 
1086   const SIInstrInfo *TII = ST.getInstrInfo();
1087   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1088     if (SIInstrInfo::isVALU(MI)) {
1089       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1090         return true;
1091       for (auto MO : MI.implicit_operands())
1092         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1093           return true;
1094     }
1095     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1096         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1097       return true;
1098     return false;
1099   };
1100 
1101   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1102       std::numeric_limits<int>::max())
1103     return false;
1104 
1105   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1106           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1107     .addImm(0xfffe);
1108   return true;
1109 }
1110 
1111 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1112                                                  const GCNSubtarget &ST) {
1113   if (!ST.hasLdsBranchVmemWARHazard())
1114     return false;
1115 
1116   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1117   // instructions need to appear in the same function.
1118   bool HasLds = false;
1119   bool HasVmem = false;
1120   for (auto &MBB : MF) {
1121     for (auto &MI : MBB) {
1122       HasLds |= SIInstrInfo::isDS(MI);
1123       HasVmem |=
1124           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1125       if (HasLds && HasVmem)
1126         return true;
1127     }
1128   }
1129   return false;
1130 }
1131 
1132 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1133   if (!RunLdsBranchVmemWARHazardFixup)
1134     return false;
1135 
1136   assert(ST.hasLdsBranchVmemWARHazard());
1137 
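  // The pattern being fixed (an illustrative sketch): an LDS access and a VMEM
  // access separated by a branch, with no "s_waitcnt_vscnt null, 0" in between:
  //   ds_write_b32 v0, v1
  //   s_cbranch_scc1 BB1
  //   ...
  //   buffer_store_dword v2, off, s[0:3], 0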
1138   auto IsHazardInst = [](const MachineInstr &MI) {
1139     if (SIInstrInfo::isDS(MI))
1140       return 1;
1141     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1142       return 2;
1143     return 0;
1144   };
1145 
1146   auto InstType = IsHazardInst(*MI);
1147   if (!InstType)
1148     return false;
1149 
1150   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1151     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1152                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1153                                !I.getOperand(1).getImm());
1154   };
1155 
1156   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1157     if (!I.isBranch())
1158       return false;
1159 
1160     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1161       auto InstType2 = IsHazardInst(I);
1162       return InstType2 && InstType != InstType2;
1163     };
1164 
1165     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1166       auto InstType2 = IsHazardInst(I);
1167       if (InstType == InstType2)
1168         return true;
1169 
1170       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1171              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1172              !I.getOperand(1).getImm();
1173     };
1174 
1175     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1176            std::numeric_limits<int>::max();
1177   };
1178 
1179   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1180       std::numeric_limits<int>::max())
1181     return false;
1182 
1183   const SIInstrInfo *TII = ST.getInstrInfo();
1184   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1185           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1186     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1187     .addImm(0);
1188 
1189   return true;
1190 }
1191 
1192 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1193   int NSAtoVMEMWaitStates = 1;
1194 
1195   if (!ST.hasNSAtoVMEMBug())
1196     return 0;
1197 
1198   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1199     return 0;
1200 
1201   const SIInstrInfo *TII = ST.getInstrInfo();
1202   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1203   if (!Offset || (Offset->getImm() & 6) == 0)
1204     return 0;
1205 
1206   auto IsHazardFn = [TII](const MachineInstr &I) {
1207     if (!SIInstrInfo::isMIMG(I))
1208       return false;
1209     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1210     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1211            TII->getInstSizeInBytes(I) >= 16;
1212   };
1213 
1214   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1215 }
1216 
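// An FP atomic VMEM/FLAT access followed too closely by s_denorm_mode needs up
// to 3 intervening wait states (a VALU or s_waitcnt also resolves it); an
// illustrative sketch:
//   global_atomic_fcmpswap v0, v[1:2], v[3:4], off glc
//   s_nop 2
//   s_denorm_mode 3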
1217 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1218   int FPAtomicToDenormModeWaitStates = 3;
1219 
1220   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1221     return 0;
1222 
1223   auto IsHazardFn = [](const MachineInstr &I) {
1224     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1225       return false;
1226     return SIInstrInfo::isFPAtomic(I);
1227   };
1228 
1229   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1230     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1231       return true;
1232 
1233     switch (MI.getOpcode()) {
1234     case AMDGPU::S_WAITCNT:
1235     case AMDGPU::S_WAITCNT_VSCNT:
1236     case AMDGPU::S_WAITCNT_VMCNT:
1237     case AMDGPU::S_WAITCNT_EXPCNT:
1238     case AMDGPU::S_WAITCNT_LGKMCNT:
1239     case AMDGPU::S_WAIT_IDLE:
1240       return true;
1241     default:
1242       break;
1243     }
1244 
1245     return false;
1246   };
1247 
1248   return FPAtomicToDenormModeWaitStates -
1249          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1250 }
1251 
1252 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1253   assert(SIInstrInfo::isMAI(*MI));
1254 
1255   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1256 }
1257 
1258 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1259   // Early exit if no padding is requested.
1260   if (MFMAPaddingRatio == 0)
1261     return 0;
1262 
1263   auto IsMFMAFn = [](const MachineInstr &MI) {
1264     return SIInstrInfo::isMAI(MI) &&
1265            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1266            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1267   };
1268 
1269   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1270   if (!IsMFMAFn(*MI) || MFI->getOccupancy() < 2)
1271     return 0;
1272 
1273   int NeighborMFMALatency = 0;
1274   auto IsNeighboringMFMA = [&IsMFMAFn, &NeighborMFMALatency,
1275                             this](const MachineInstr &MI) {
1276     if (!IsMFMAFn(MI))
1277       return false;
1278 
1279     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1280     return true;
1281   };
1282 
1283   const int MaxMFMAPipelineWaitStates = 16;
1284   int WaitStatesSinceNeighborMFMA =
1285       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1286 
1287   int NeighborMFMAPaddingNeeded =
1288       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1289       WaitStatesSinceNeighborMFMA;
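  // e.g. (illustrative arithmetic): a neighbor latency of 16, a ratio of 50,
  // and 3 wait states already elapsed gives 16 * 50 / 100 - 3 = 5 wait states
  // of padding.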
1290 
1291   return std::max(0, NeighborMFMAPaddingNeeded);
1292 }
1293 
1294 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1295   int WaitStatesNeeded = 0;
1296   unsigned Opc = MI->getOpcode();
1297 
1298   auto IsVALUFn = [](const MachineInstr &MI) {
1299     return SIInstrInfo::isVALU(MI);
1300   };
1301 
1302   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1303     const int LegacyVALUWritesVGPRWaitStates = 2;
1304     const int VALUWritesExecWaitStates = 4;
1305     const int MaxWaitStates = 4;
1306 
1307     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1308       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1309     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1310 
1311     if (WaitStatesNeeded < MaxWaitStates) {
1312       for (const MachineOperand &Use : MI->explicit_uses()) {
1313         const int MaxWaitStates = 2;
1314 
1315         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1316           continue;
1317 
1318         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1319           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1320         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1321 
1322         if (WaitStatesNeeded == MaxWaitStates)
1323           break;
1324       }
1325     }
1326   }
1327 
1328   auto IsMFMAFn = [](const MachineInstr &MI) {
1329     return SIInstrInfo::isMAI(MI) &&
1330            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1331            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1332   };
1333 
1334   for (const MachineOperand &Op : MI->explicit_operands()) {
1335     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1336       continue;
1337 
1338     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1339       continue;
1340 
1341     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1342     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1343     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1344     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1345     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1346     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1347     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1348     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1349     const int MaxWaitStates = 18;
1350     Register Reg = Op.getReg();
1351     unsigned HazardDefLatency = 0;
1352 
1353     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1354                                this](const MachineInstr &MI) {
1355       if (!IsMFMAFn(MI))
1356         return false;
1357       Register DstReg = MI.getOperand(0).getReg();
1358       if (DstReg == Reg)
1359         return false;
1360       HazardDefLatency =
1361           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1362       return TRI.regsOverlap(DstReg, Reg);
1363     };
1364 
1365     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1366                                                    MaxWaitStates);
1367     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1368     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1369     int OpNo = MI->getOperandNo(&Op);
1370     if (OpNo == SrcCIdx) {
1371       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1372     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1373       switch (HazardDefLatency) {
1374       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1375                break;
1376       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1377                break;
1378       case 16: LLVM_FALLTHROUGH;
1379       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1380                break;
1381       }
1382     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1383       switch (HazardDefLatency) {
1384       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1385                break;
1386       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1387                break;
1388       case 16: LLVM_FALLTHROUGH;
1389       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1390                break;
1391       }
1392     }
1393 
1394     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1395     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1396 
1397     if (WaitStatesNeeded == MaxWaitStates)
1398       return WaitStatesNeeded; // Early exit.
1399 
1400     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1401       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1402         return false;
1403       Register DstReg = MI.getOperand(0).getReg();
1404       return TRI.regsOverlap(Reg, DstReg);
1405     };
1406 
1407     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1408     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1409     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1410     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1411     if (OpNo == SrcCIdx)
1412       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1413     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1414       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1415 
1416     WaitStatesNeededForUse = NeedWaitStates -
1417       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1418     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1419 
1420     if (WaitStatesNeeded == MaxWaitStates)
1421       return WaitStatesNeeded; // Early exit.
1422   }
1423 
1424   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1425     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1426     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1427     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1428     const int MaxWaitStates = 13;
1429     Register DstReg = MI->getOperand(0).getReg();
1430     unsigned HazardDefLatency = 0;
1431 
1432     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
1433                          this](const MachineInstr &MI) {
1434       if (!IsMFMAFn(MI))
1435         return false;
1436       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1437       HazardDefLatency =
1438           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1439       return TRI.regsOverlap(Reg, DstReg);
1440     };
1441 
1442     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1443     int NeedWaitStates;
1444     switch (HazardDefLatency) {
1445     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1446              break;
1447     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1448              break;
1449     case 16: LLVM_FALLTHROUGH;
1450     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1451              break;
1452     }
1453 
1454     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1455     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1456   }
1457 
1458   // Pad neighboring MFMA with noops for better inter-wave performance.
1459   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
1460 
1461   return WaitStatesNeeded;
1462 }
1463 
1464 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1465   int WaitStatesNeeded = 0;
1466   unsigned Opc = MI->getOpcode();
1467 
1468   auto IsMFMAFn = [](const MachineInstr &MI) {
1469     return SIInstrInfo::isMAI(MI) &&
1470            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1471            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1472   };
1473 
1474   auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
1475     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
1476   };
1477 
1478   auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
1479     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
1480   };
1481 
1482   if (!IsMFMAFn(*MI))
1483     return WaitStatesNeeded;
1484 
1485   const int VALUWritesExecWaitStates = 4;
1486   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1487     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
1488                           VALUWritesExecWaitStates);
1489   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1490 
1491   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1492 
1493   // Loop for both DGEMM and S/HGEMM 2nd instruction.
1494   for (const MachineOperand &Use : MI->explicit_uses()) {
1495     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
1496     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
1497     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
1498     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
1499     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
1500     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
1501     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
1502     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
1503     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
1504     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
1505     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
1506     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
1507     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
1508     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
1509     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
1510     const int MaxWaitStates = 19;
1511 
1512     if (!Use.isReg())
1513       continue;
1514     Register Reg = Use.getReg();
1515     bool FullReg;
1516     const MachineInstr *MI1;
1517 
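         // Remember the matching MFMA producer in MI1, and whether its destination
         // is exactly this register (FullReg) rather than merely overlapping it.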
1518     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
1519                                this](const MachineInstr &MI) {
1520       if (!IsMFMAFn(MI))
1521         return false;
1522       Register DstReg = MI.getOperand(0).getReg();
1523       FullReg = (DstReg == Reg);
1524       MI1 = &MI;
1525       return TRI.regsOverlap(DstReg, Reg);
1526     };
1527 
1528     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
1529       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
1530     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1531 
1532     int NumWaitStates =
1533         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
1534     if (NumWaitStates == std::numeric_limits<int>::max())
1535       continue;
1536 
1537     int OpNo = MI->getOperandNo(&Use);
1538     unsigned Opc1 = MI1->getOpcode();
1539     int NeedWaitStates = 0;
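         // src2 (accumulator) reads tolerate shorter gaps from the producing MFMA
         // than src0/src1 reads, and a DGEMM consumer needs one more wait state
         // than an SMFMA consumer for the same producer; see the constants above.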
1540     if (OpNo == SrcCIdx) {
1541       if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
1542         NeedWaitStates = 0;
1543       } else if (FullReg) {
1544         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1545              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
1546             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1547              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
1548           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
1549       } else {
1550         switch (Opc1) {
1551         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1552         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1553         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
1554         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
1555           if (!isXDL(ST, *MI))
1556             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
1557           break;
1558         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1559         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1560           if (!isXDL(ST, *MI))
1561             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
1562           break;
1563         default:
1564           switch (TSchedModel.computeInstrLatency(MI1)) {
1565           case 2:
1566             NeedWaitStates = isDGEMM(Opc)
1567               ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
1568               : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
1569             break;
1570           case 8:
1571             NeedWaitStates = isDGEMM(Opc)
1572               ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
1573               : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
1574             break;
1575           case 16: LLVM_FALLTHROUGH;
1576           default:
1577             NeedWaitStates = isDGEMM(Opc)
1578               ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
1579               : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
1580           }
1581         }
1582       }
1583     } else {
1584       switch (Opc1) {
1585       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1586       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1587       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
1588       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
1589         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
1590         break;
1591       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1592       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1593         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
1594         break;
1595       default:
1596         switch (TSchedModel.computeInstrLatency(MI1)) {
1597         case 2:
1598           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
1599           break;
1600         case 8:
1601           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
1602           break;
1603         case 16: LLVM_FALLTHROUGH;
1604         default:
1605           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
1606         }
1607       }
1608     }
1609     if (WaitStatesNeeded >= NeedWaitStates)
1610       continue;
1611 
1612     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
1613     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1614 
1615     if (WaitStatesNeeded == MaxWaitStates)
1616       break;
1617   }
1618 
1619   return WaitStatesNeeded;
1620 }
1621 
1622 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1623   // On gfx90a+, the relevant hazards are checked in checkMAIVALUHazards().
1624   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
1625     return 0;
1626 
1627   int WaitStatesNeeded = 0;
1628 
1629   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1630     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1631   };
1632 
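       // Each VGPR source of the load/store must be at least 2 wait states from a
       // v_accvgpr_read that wrote it, and at least 1 wait state from an accvgpr
       // read/write that itself closely follows a VALU write of the same register.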
1633   for (const MachineOperand &Op : MI->explicit_uses()) {
1634     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1635       continue;
1636 
1637     Register Reg = Op.getReg();
1638 
1639     const int AccVgprReadLdStWaitStates = 2;
1640     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1641     const int MaxWaitStates = 2;
1642 
1643     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1644       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1645     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1646 
1647     if (WaitStatesNeeded == MaxWaitStates)
1648       return WaitStatesNeeded; // Early exit.
1649 
1650     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1651       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1652           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1653         return false;
1654       auto IsVALUFn = [](const MachineInstr &MI) {
1655         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
1656       };
1657       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1658              std::numeric_limits<int>::max();
1659     };
1660 
1661     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1662       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1663     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1664   }
1665 
1666   return WaitStatesNeeded;
1667 }
1668 
1669 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
1670   if (!ST.hasGFX90AInsts())
1671     return 0;
1672 
1673   auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
1674     return SIInstrInfo::isMAI(MI) &&
1675            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1676            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1677   };
1678 
1679   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
1680     return isDGEMM(MI.getOpcode());
1681   };
1682 
1683   // Hazards for MFMA instructions themselves are checked in checkMAIHazards90A().
1684   if (IsMFMAFn(*MI))
1685     return 0;
1686 
1687   int WaitStatesNeeded = 0;
1688 
1689   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
1690                        SIInstrInfo::isFLAT(*MI) ||
1691                        SIInstrInfo::isDS(*MI) ||
1692                        SIInstrInfo::isEXP(*MI);
1693   bool IsVALU = SIInstrInfo::isVALU(*MI);
1694 
1695   const MachineInstr *MFMA = nullptr;
1696   unsigned Reg;
1697   auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) {
1698     if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1699       return false;
1700     MFMA = &MI;
1701     return true;
1702   };
1703 
1704   const MachineInstr *DOT = nullptr;
1705   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
1706     if (!SIInstrInfo::isDOT(MI) ||
1707         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1708       return false;
1709     DOT = &MI;
1710     return true;
1711   };
1712 
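       // Reg is set per operand in the loops below; IsMFMAWriteFn and IsDotWriteFn
       // record the most recent MFMA or DOT instruction that wrote a register
       // overlapping it, for the latency-based checks that follow.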
1713   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1714                                            AMDGPU::OpName::src2);
1715 
1716   if (IsMemOrExport || IsVALU) {
1717     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
1718     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
1719     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
1720     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
1721     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
1722     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
1723     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
1724     const int DotWriteSameDotReadSrcAB = 3;
1725     const int DotWriteDifferentVALURead = 3;
1726     const int MaxWaitStates = 19;
1727 
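         // RAW checks: each register this instruction reads must be far enough from
         // the DOT or MFMA instruction that wrote it; for MFMA producers the
         // required distance depends on the producer's latency class.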
1728     for (const MachineOperand &Use : MI->explicit_uses()) {
1729       if (!Use.isReg())
1730         continue;
1731       Reg = Use.getReg();
1732 
1733       DOT = nullptr;
1734       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1735                                                      MaxWaitStates);
1736       if (DOT) {
1737         int NeedWaitStates = 0;
1738         if (DOT->getOpcode() == MI->getOpcode()) {
1739           if (&Use - &MI->getOperand(0) != SrcCIdx)
1740             NeedWaitStates = DotWriteSameDotReadSrcAB;
1741         } else {
1742           NeedWaitStates = DotWriteDifferentVALURead;
1743         }
1744 
1745         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1746         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1747       }
1748 
1749       MFMA = nullptr;
1750       WaitStatesSinceDef =
1751           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
1752       if (!MFMA)
1753         continue;
1754 
1755       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1756       int NeedWaitStates = MaxWaitStates;
1757       switch (HazardDefLatency) {
1758       case 2:
1759         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
1760         break;
1761       case 4:
1762         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
1763         NeedWaitStates =
1764             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
1765                           : DMFMA4x4WriteVgprVALUReadWaitStates;
1766         break;
1767       case 8:
1768         NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
1769         break;
1770       case 16: LLVM_FALLTHROUGH;
1771       default:
1772         NeedWaitStates =
1773           isDGEMM(MFMA->getOpcode())
1774             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
1775                             : DMFMA16x16WriteVgprVALUReadWaitStates
1776             : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
1777         break;
1778       }
1779 
1780       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1781       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1782 
1783       if (WaitStatesNeeded == MaxWaitStates)
1784         break;
1785     }
1786   }
1787 
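       // Regardless of operands, a double-precision FMA needs at least 2 wait states
       // after any DGEMM.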
1788   unsigned Opc = MI->getOpcode();
1789   const int DMFMAToFMA64WaitStates = 2;
1790   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
1791        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
1792        Opc == AMDGPU::V_FMAC_F64_dpp) &&
1793       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
1794     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
1795       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
1796     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1797   }
1798 
1799   if (!IsVALU && !IsMemOrExport)
1800     return WaitStatesNeeded;
1801 
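       // WAW and WAR checks for each register this instruction defines.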
1802   for (const MachineOperand &Def : MI->defs()) {
1803     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
1804     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
1805     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
1806     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
1807     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
1808     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
1809     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
1810     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
1811     const int DotWriteDifferentVALUWrite = 3;
1812     const int MaxWaitStates = 19;
1813     const int MaxWarWaitStates = 15;
1814 
1815     Reg = Def.getReg();
1816 
1817     DOT = nullptr;
1818     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1819                                                    MaxWaitStates);
1820     if (DOT && DOT->getOpcode() != MI->getOpcode())
1821       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
1822                                                     WaitStatesSinceDef);
1823 
1824     MFMA = nullptr;
1825     WaitStatesSinceDef =
1826         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
1827     if (MFMA) {
1828       int NeedWaitStates = MaxWaitStates;
1829       switch (TSchedModel.computeInstrLatency(MFMA)) {
1830       case 2:
1831         NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
1832         break;
1833       case 4:
1834         assert(isDGEMM(MFMA->getOpcode()));
1835         NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
1836         break;
1837       case 8:
1838         NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
1839         break;
1840       case 16: LLVM_FALLTHROUGH;
1841       default:
1842         NeedWaitStates = isDGEMM(MFMA->getOpcode())
1843                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
1844                    : SMFMA32x32WriteVgprVALUWawWaitStates;
1845         break;
1846       }
1847 
1848       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1849       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1850 
1851       if (WaitStatesNeeded == MaxWaitStates)
1852         break;
1853     }
1854 
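         // WAR: look for an earlier non-DGEMM MFMA whose src2 accumulator overlaps
         // the register this instruction is about to write.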
1855     auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
1856                              this](const MachineInstr &MI) {
1857       if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
1858           !MI.readsRegister(Reg, &TRI))
1859         return false;
1860 
1861       const MachineOperand *SrcC =
1862           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
1863       assert(SrcC);
1864       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
1865         return false;
1866 
1867       MFMA = &MI;
1868       return true;
1869     };
1870 
1871     MFMA = nullptr;
1872     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
1873                                                 MaxWarWaitStates);
1874     if (!MFMA)
1875       continue;
1876 
1877     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1878     int NeedWaitStates = MaxWaitStates;
1879     switch (HazardDefLatency) {
1880     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
1881              break;
1882     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
1883              break;
1884     case 16: LLVM_FALLTHROUGH;
1885     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
1886              break;
1887     }
1888 
1889     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
1890     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1891   }
1892 
1893   return WaitStatesNeeded;
1894 }
1895 
1896 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1897   if (!SU->isInstr())
1898     return false;
1899 
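       // Deprioritize an MFMA candidate that would issue within the latency window
       // of the most recently emitted MFMA.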
1900   const MachineInstr *MAI = nullptr;
1901   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
1902     MAI = nullptr;
1903     if (SIInstrInfo::isMAI(MI) &&
1904         MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1905         MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1906       MAI = &MI;
1907     return MAI != nullptr;
1908   };
1909 
1910   MachineInstr *MI = SU->getInstr();
1911   if (IsMFMAFn(*MI)) {
1912     int W = getWaitStatesSince(IsMFMAFn, 16);
1913     if (MAI)
1914       return W < (int)TSchedModel.computeInstrLatency(MAI);
1915   }
1916 
1917   return false;
1918 }
1919