xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 0eae32dcef82f6f06de6419a0d623d7def0cc8f6)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/CodeGen/MachineFunction.h"
17 #include "llvm/CodeGen/ScheduleDAG.h"
18 #include "llvm/Support/TargetParser.h"
19 
20 using namespace llvm;
21 
22 //===----------------------------------------------------------------------===//
23 // Hazard Recognizer Implementation
24 //===----------------------------------------------------------------------===//
25 
26 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
27                                                  const GCNSubtarget &ST);
28 
29 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
30   IsHazardRecognizerMode(false),
31   CurrCycleInstr(nullptr),
32   MF(MF),
33   ST(MF.getSubtarget<GCNSubtarget>()),
34   TII(*ST.getInstrInfo()),
35   TRI(TII.getRegisterInfo()),
36   ClauseUses(TRI.getNumRegUnits()),
37   ClauseDefs(TRI.getNumRegUnits()) {
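  // MFMA-related hazards (see checkMAIHazards*) can require up to 18-19 wait
  // states, which is likely why a deeper lookahead is used when AGPRs are in
  // use.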
38   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
39   TSchedModel.init(&ST);
40   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
41 }
42 
43 void GCNHazardRecognizer::Reset() {
44   EmittedInstrs.clear();
45 }
46 
47 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
48   EmitInstruction(SU->getInstr());
49 }
50 
51 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
52   CurrCycleInstr = MI;
53 }
54 
55 static bool isDivFMas(unsigned Opcode) {
56   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
57 }
58 
59 static bool isSGetReg(unsigned Opcode) {
60   return Opcode == AMDGPU::S_GETREG_B32;
61 }
62 
63 static bool isSSetReg(unsigned Opcode) {
64   switch (Opcode) {
65   case AMDGPU::S_SETREG_B32:
66   case AMDGPU::S_SETREG_B32_mode:
67   case AMDGPU::S_SETREG_IMM32_B32:
68   case AMDGPU::S_SETREG_IMM32_B32_mode:
69     return true;
70   }
71   return false;
72 }
73 
74 static bool isRWLane(unsigned Opcode) {
75   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
76 }
77 
78 static bool isRFE(unsigned Opcode) {
79   return Opcode == AMDGPU::S_RFE_B64;
80 }
81 
82 static bool isSMovRel(unsigned Opcode) {
83   switch (Opcode) {
84   case AMDGPU::S_MOVRELS_B32:
85   case AMDGPU::S_MOVRELS_B64:
86   case AMDGPU::S_MOVRELD_B32:
87   case AMDGPU::S_MOVRELD_B64:
88     return true;
89   default:
90     return false;
91   }
92 }
93 
94 static bool isDGEMM(unsigned Opcode) {
95   return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
96          Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
97          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
98          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
99 }
100 
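// An instruction is treated as XDL if it is an MAI instruction other than a
// DGEMM or an accvgpr read/write.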
101 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
102   unsigned Opcode = MI.getOpcode();
103 
104   if (!SIInstrInfo::isMAI(MI) ||
105       isDGEMM(Opcode) ||
106       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
107       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
108     return false;
109 
110   return true;
111 }
112 
113 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
114                                     const MachineInstr &MI) {
115   if (TII.isAlwaysGDS(MI.getOpcode()))
116     return true;
117 
118   switch (MI.getOpcode()) {
119   case AMDGPU::S_SENDMSG:
120   case AMDGPU::S_SENDMSGHALT:
121   case AMDGPU::S_TTRACEDATA:
122     return true;
123   // These DS opcodes don't support GDS.
124   case AMDGPU::DS_NOP:
125   case AMDGPU::DS_PERMUTE_B32:
126   case AMDGPU::DS_BPERMUTE_B32:
127     return false;
128   default:
129     if (TII.isDS(MI.getOpcode())) {
130       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
131                                            AMDGPU::OpName::gds);
132       if (MI.getOperand(GDS).getImm())
133         return true;
134     }
135     return false;
136   }
137 }
138 
139 static bool isPermlane(const MachineInstr &MI) {
140   unsigned Opcode = MI.getOpcode();
141   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
142          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
143 }
144 
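// Extract the hardware register ID from the simm16 operand of an
// s_getreg/s_setreg instruction.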
145 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
146   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
147                                                      AMDGPU::OpName::simm16);
148   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
149 }
150 
151 ScheduleHazardRecognizer::HazardType
152 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
153   MachineInstr *MI = SU->getInstr();
154   // If we are not in "HazardRecognizerMode" and therefore not being run from
155   // the scheduler, track possible stalls from hazards but don't insert noops.
156   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
157 
158   if (MI->isBundle())
159    return NoHazard;
160 
161   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
162     return HazardType;
163 
164   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
165     return HazardType;
166 
167   if (checkFPAtomicToDenormModeHazard(MI) > 0)
168     return HazardType;
169 
170   if (ST.hasNoDataDepHazard())
171     return NoHazard;
172 
173   // FIXME: Should flat be considered vmem?
174   if ((SIInstrInfo::isVMEM(*MI) ||
175        SIInstrInfo::isFLAT(*MI))
176       && checkVMEMHazards(MI) > 0)
177     return HazardType;
178 
179   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
180     return HazardType;
181 
182   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
183     return HazardType;
184 
185   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
186     return HazardType;
187 
188   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
189     return HazardType;
190 
191   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
192        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
193        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
194     return HazardType;
195 
196   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
197     return HazardType;
198 
199   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
200     return HazardType;
201 
202   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
203     return HazardType;
204 
205   if (ST.hasReadM0MovRelInterpHazard() &&
206       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
207       checkReadM0Hazards(MI) > 0)
208     return HazardType;
209 
210   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
211       checkReadM0Hazards(MI) > 0)
212     return HazardType;
213 
214   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
215     return HazardType;
216 
217   if ((SIInstrInfo::isVMEM(*MI) ||
218        SIInstrInfo::isFLAT(*MI) ||
219        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
220     return HazardType;
221 
222   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
223     return HazardType;
224 
225   return NoHazard;
226 }
227 
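// Emit s_nop instructions covering \p Quantity wait states before \p MI.
// Each s_nop encodes up to 8 wait states (its immediate is the count minus
// one).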
228 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
229                                 unsigned Quantity) {
230   while (Quantity > 0) {
231     unsigned Arg = std::min(Quantity, 8u);
232     Quantity -= Arg;
233     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
234         .addImm(Arg - 1);
235   }
236 }
237 
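// Check each instruction inside the current bundle for hazards and, when in
// hazard recognizer mode, apply fixups and insert the required s_nops. The
// bundled instructions are recorded in EmittedInstrs as they are processed.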
238 void GCNHazardRecognizer::processBundle() {
239   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
240   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
241   // Check bundled MachineInstr's for hazards.
242   for (; MI != E && MI->isInsideBundle(); ++MI) {
243     CurrCycleInstr = &*MI;
244     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
245 
246     if (IsHazardRecognizerMode) {
247       fixHazards(CurrCycleInstr);
248 
249       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
250     }
251 
252     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
253     // include the bundled MI directly after, only add a maximum of
254     // (MaxLookAhead - 1) noops to EmittedInstrs.
255     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
256       EmittedInstrs.push_front(nullptr);
257 
258     EmittedInstrs.push_front(CurrCycleInstr);
259     EmittedInstrs.resize(MaxLookAhead);
260   }
261   CurrCycleInstr = nullptr;
262 }
263 
264 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
265   IsHazardRecognizerMode = true;
266   CurrCycleInstr = MI;
267   unsigned W = PreEmitNoopsCommon(MI);
268   fixHazards(MI);
269   CurrCycleInstr = nullptr;
270   return W;
271 }
272 
273 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
274   if (MI->isBundle())
275     return 0;
276 
277   int WaitStates = 0;
278 
279   if (SIInstrInfo::isSMRD(*MI))
280     return std::max(WaitStates, checkSMRDHazards(MI));
281 
282   if (ST.hasNSAtoVMEMBug())
283     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
284 
285   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
286 
287   if (ST.hasNoDataDepHazard())
288     return WaitStates;
289 
290   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
291     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
292 
293   if (SIInstrInfo::isVALU(*MI))
294     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
295 
296   if (SIInstrInfo::isDPP(*MI))
297     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
298 
299   if (isDivFMas(MI->getOpcode()))
300     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
301 
302   if (isRWLane(MI->getOpcode()))
303     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
304 
305   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
306        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
307        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
308     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
309 
310   if (MI->isInlineAsm())
311     return std::max(WaitStates, checkInlineAsmHazards(MI));
312 
313   if (isSGetReg(MI->getOpcode()))
314     return std::max(WaitStates, checkGetRegHazards(MI));
315 
316   if (isSSetReg(MI->getOpcode()))
317     return std::max(WaitStates, checkSetRegHazards(MI));
318 
319   if (isRFE(MI->getOpcode()))
320     return std::max(WaitStates, checkRFEHazards(MI));
321 
322   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
323                                            isSMovRel(MI->getOpcode())))
324     return std::max(WaitStates, checkReadM0Hazards(MI));
325 
326   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
327     return std::max(WaitStates, checkReadM0Hazards(MI));
328 
329   if (SIInstrInfo::isMAI(*MI))
330     return std::max(WaitStates, checkMAIHazards(MI));
331 
332   if (SIInstrInfo::isVMEM(*MI) ||
333       SIInstrInfo::isFLAT(*MI) ||
334       SIInstrInfo::isDS(*MI))
335     return std::max(WaitStates, checkMAILdStHazards(MI));
336 
337   return WaitStates;
338 }
339 
340 void GCNHazardRecognizer::EmitNoop() {
341   EmittedInstrs.push_front(nullptr);
342 }
343 
344 void GCNHazardRecognizer::AdvanceCycle() {
345   // When the scheduler detects a stall, it will call AdvanceCycle() without
346   // emitting any instructions.
347   if (!CurrCycleInstr) {
348     EmittedInstrs.push_front(nullptr);
349     return;
350   }
351 
352   if (CurrCycleInstr->isBundle()) {
353     processBundle();
354     return;
355   }
356 
357   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
358   if (!NumWaitStates) {
359     CurrCycleInstr = nullptr;
360     return;
361   }
362 
363   // Keep track of emitted instructions
364   EmittedInstrs.push_front(CurrCycleInstr);
365 
366   // Add a nullptr for each additional wait state after the first.  Make sure
367   // not to add more than getMaxLookAhead() items to the list, since we
368   // truncate the list to that size right after this loop.
369   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
370        i < e; ++i) {
371     EmittedInstrs.push_front(nullptr);
372   }
373 
374   // getMaxLookAhead() is the largest number of wait states we will ever need
375   // to insert, so there is no point in keeping track of more than that many
376   // wait states.
377   EmittedInstrs.resize(getMaxLookAhead());
378 
379   CurrCycleInstr = nullptr;
380 }
381 
382 void GCNHazardRecognizer::RecedeCycle() {
383   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
384 }
385 
386 //===----------------------------------------------------------------------===//
387 // Helper Functions
388 //===----------------------------------------------------------------------===//
389 
390 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
391 
392 // Returns the minimum number of wait states since \p I, walking all predecessors.
393 // Scanning stops once \p IsExpired returns true.
394 // Can only be run in hazard recognizer mode.
395 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
396                               const MachineBasicBlock *MBB,
397                               MachineBasicBlock::const_reverse_instr_iterator I,
398                               int WaitStates, IsExpiredFn IsExpired,
399                               DenseSet<const MachineBasicBlock *> &Visited) {
400   for (auto E = MBB->instr_rend(); I != E; ++I) {
401     // Don't add WaitStates for parent BUNDLE instructions.
402     if (I->isBundle())
403       continue;
404 
405     if (IsHazard(*I))
406       return WaitStates;
407 
408     if (I->isInlineAsm())
409       continue;
410 
411     WaitStates += SIInstrInfo::getNumWaitStates(*I);
412 
413     if (IsExpired(*I, WaitStates))
414       return std::numeric_limits<int>::max();
415   }
416 
417   int MinWaitStates = std::numeric_limits<int>::max();
418   for (MachineBasicBlock *Pred : MBB->predecessors()) {
419     if (!Visited.insert(Pred).second)
420       continue;
421 
422     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
423                                WaitStates, IsExpired, Visited);
424 
425     MinWaitStates = std::min(MinWaitStates, W);
426   }
427 
428   return MinWaitStates;
429 }
430 
431 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
432                               const MachineInstr *MI, IsExpiredFn IsExpired) {
433   DenseSet<const MachineBasicBlock *> Visited;
434   return getWaitStatesSince(IsHazard, MI->getParent(),
435                             std::next(MI->getReverseIterator()),
436                             0, IsExpired, Visited);
437 }
438 
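// In hazard recognizer mode walk the machine code backwards from
// CurrCycleInstr; otherwise scan the EmittedInstrs queue maintained during
// scheduling.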
439 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
440   if (IsHazardRecognizerMode) {
441     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
442       return WaitStates >= Limit;
443     };
444     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
445   }
446 
447   int WaitStates = 0;
448   for (MachineInstr *MI : EmittedInstrs) {
449     if (MI) {
450       if (IsHazard(*MI))
451         return WaitStates;
452 
453       if (MI->isInlineAsm())
454         continue;
455     }
456     ++WaitStates;
457 
458     if (WaitStates >= Limit)
459       break;
460   }
461   return std::numeric_limits<int>::max();
462 }
463 
464 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
465                                                IsHazardFn IsHazardDef,
466                                                int Limit) {
467   const SIRegisterInfo *TRI = ST.getRegisterInfo();
468 
469   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
470     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
471   };
472 
473   return getWaitStatesSince(IsHazardFn, Limit);
474 }
475 
476 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
477                                                   int Limit) {
478   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
479     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
480   };
481 
482   return getWaitStatesSince(IsHazardFn, Limit);
483 }
484 
485 //===----------------------------------------------------------------------===//
486 // No-op Hazard Detection
487 //===----------------------------------------------------------------------===//
488 
489 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
490                         MCRegister Reg) {
491   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
492     BV.set(*RUI);
493 }
494 
495 static void addRegsToSet(const SIRegisterInfo &TRI,
496                          iterator_range<MachineInstr::const_mop_iterator> Ops,
497                          BitVector &Set) {
498   for (const MachineOperand &Op : Ops) {
499     if (Op.isReg())
500       addRegUnits(TRI, Set, Op.getReg().asMCReg());
501   }
502 }
503 
504 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
505   // XXX: Do we need to worry about implicit operands?
506   addRegsToSet(TRI, MI.defs(), ClauseDefs);
507   addRegsToSet(TRI, MI.uses(), ClauseUses);
508 }
509 
510 static bool breaksSMEMSoftClause(MachineInstr *MI) {
511   return !SIInstrInfo::isSMRD(*MI);
512 }
513 
514 static bool breaksVMEMSoftClause(MachineInstr *MI) {
515   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
516 }
517 
518 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
519   // SMEM soft clauses are only present on VI+, and only matter if XNACK is
520   // enabled.
521   if (!ST.isXNACKEnabled())
522     return 0;
523 
524   bool IsSMRD = TII.isSMRD(*MEM);
525 
526   resetClause();
527 
528   // A soft-clause is any group of consecutive SMEM instructions.  The
529   // instructions in this group may return out of order and/or may be
530   // replayed (i.e. the same instruction issued more than once).
531   //
532   // In order to handle these situations correctly we need to make sure that
533   // when a clause has more than one instruction, no instruction in the clause
534   // writes to a register that is read by another instruction in the clause
535   // (including itself). If we encounter this situation, we need to break the
536   // clause by inserting a non-SMEM instruction.
537 
538   for (MachineInstr *MI : EmittedInstrs) {
539     // When we hit a non-SMEM instruction then we have passed the start of the
540     // clause and we can stop.
541     if (!MI)
542       break;
543 
544     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
545       break;
546 
547     addClauseInst(*MI);
548   }
549 
550   if (ClauseDefs.none())
551     return 0;
552 
553   // We need to make sure not to put loads and stores in the same clause if they
554   // use the same address. For now, just start a new clause whenever we see a
555   // store.
556   if (MEM->mayStore())
557     return 1;
558 
559   addClauseInst(*MEM);
560 
561   // If the set of defs and uses intersect then we cannot add this instruction
562   // to the clause, so we have a hazard.
563   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
564 }
565 
566 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
567   int WaitStatesNeeded = 0;
568 
569   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
570 
571   // This SMRD hazard only affects SI.
572   if (!ST.hasSMRDReadVALUDefHazard())
573     return WaitStatesNeeded;
574 
575   // A read of an SGPR by SMRD instruction requires 4 wait states when the
576   // SGPR was written by a VALU instruction.
577   int SmrdSgprWaitStates = 4;
578   auto IsHazardDefFn = [this](const MachineInstr &MI) {
579     return TII.isVALU(MI);
580   };
581   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
582     return TII.isSALU(MI);
583   };
584 
585   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
586 
587   for (const MachineOperand &Use : SMRD->uses()) {
588     if (!Use.isReg())
589       continue;
590     int WaitStatesNeededForUse =
591         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
592                                                    SmrdSgprWaitStates);
593     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
594 
595     // This fixes what appears to be undocumented hardware behavior in SI where
596     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
597     // needs some number of nops in between. We don't know how many we need, but
598     // let's use 4. This wasn't discovered before probably because the only
599     // case when this happens is when we expand a 64-bit pointer into a full
600     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
601     // probably never encountered in the closed-source land.
602     if (IsBufferSMRD) {
603       int WaitStatesNeededForUse =
604         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
605                                                    IsBufferHazardDefFn,
606                                                    SmrdSgprWaitStates);
607       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
608     }
609   }
610 
611   return WaitStatesNeeded;
612 }
613 
614 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
615   if (!ST.hasVMEMReadSGPRVALUDefHazard())
616     return 0;
617 
618   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
619 
620   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
621   // SGPR was written by a VALU Instruction.
622   const int VmemSgprWaitStates = 5;
623   auto IsHazardDefFn = [this](const MachineInstr &MI) {
624     return TII.isVALU(MI);
625   };
626   for (const MachineOperand &Use : VMEM->uses()) {
627     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
628       continue;
629 
630     int WaitStatesNeededForUse =
631         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
632                                                    VmemSgprWaitStates);
633     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
634   }
635   return WaitStatesNeeded;
636 }
637 
638 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
639   const SIRegisterInfo *TRI = ST.getRegisterInfo();
640   const SIInstrInfo *TII = ST.getInstrInfo();
641 
642   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
643   int DppVgprWaitStates = 2;
644   int DppExecWaitStates = 5;
645   int WaitStatesNeeded = 0;
646   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
647     return TII->isVALU(MI);
648   };
649 
650   for (const MachineOperand &Use : DPP->uses()) {
651     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
652       continue;
653     int WaitStatesNeededForUse =
654         DppVgprWaitStates - getWaitStatesSinceDef(
655                                 Use.getReg(),
656                                 [](const MachineInstr &) { return true; },
657                                 DppVgprWaitStates);
658     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
659   }
660 
661   WaitStatesNeeded = std::max(
662       WaitStatesNeeded,
663       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
664                                                 DppExecWaitStates));
665 
666   return WaitStatesNeeded;
667 }
668 
669 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
670   const SIInstrInfo *TII = ST.getInstrInfo();
671 
672   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
673   // instruction.
674   const int DivFMasWaitStates = 4;
675   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
676     return TII->isVALU(MI);
677   };
678   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
679                                                DivFMasWaitStates);
680 
681   return DivFMasWaitStates - WaitStatesNeeded;
682 }
683 
684 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
685   const SIInstrInfo *TII = ST.getInstrInfo();
686   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
687 
688   const int GetRegWaitStates = 2;
689   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
690     return GetRegHWReg == getHWReg(TII, MI);
691   };
692   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
693 
694   return GetRegWaitStates - WaitStatesNeeded;
695 }
696 
697 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
698   const SIInstrInfo *TII = ST.getInstrInfo();
699   unsigned HWReg = getHWReg(TII, *SetRegInstr);
700 
701   const int SetRegWaitStates = ST.getSetRegWaitStates();
702   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
703     return HWReg == getHWReg(TII, MI);
704   };
705   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
706   return SetRegWaitStates - WaitStatesNeeded;
707 }
708 
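// If \p MI is a store whose data could be clobbered by a following VALU
// write (i.e. it stores more than 64 bits of vector data), return the index
// of the vulnerable data operand; otherwise return -1.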
709 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
710   if (!MI.mayStore())
711     return -1;
712 
713   const SIInstrInfo *TII = ST.getInstrInfo();
714   unsigned Opcode = MI.getOpcode();
715   const MCInstrDesc &Desc = MI.getDesc();
716 
717   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
718   int VDataRCID = -1;
719   if (VDataIdx != -1)
720     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
721 
722   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
723     // There is no hazard if the instruction does not use vector regs
724     // (like wbinvl1)
725     if (VDataIdx == -1)
726       return -1;
727     // For MUBUF/MTBUF instructions this hazard only exists if the
728     // instruction is not using a register in the soffset field.
729     const MachineOperand *SOffset =
730         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
731     // If we have no soffset operand, then assume this field has been
732     // hardcoded to zero.
733     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
734         (!SOffset || !SOffset->isReg()))
735       return VDataIdx;
736   }
737 
738   // MIMG instructions create a hazard if they don't use a 256-bit T# and
739   // the store size is greater than 8 bytes and they have more than two bits
740   // of their dmask set.
741   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
742   if (TII->isMIMG(MI)) {
743     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
744     assert(SRsrcIdx != -1 &&
745            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
746     (void)SRsrcIdx;
747   }
748 
749   if (TII->isFLAT(MI)) {
750     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
751     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
752       return DataIdx;
753   }
754 
755   return -1;
756 }
757 
758 int
759 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
760                                             const MachineRegisterInfo &MRI) {
761   // Helper to check for the hazard where VMEM instructions that store more than
762   // 8 bytes can have their store data overwritten by the next instruction.
763   const SIRegisterInfo *TRI = ST.getRegisterInfo();
764 
765   const int VALUWaitStates = 1;
766   int WaitStatesNeeded = 0;
767 
768   if (!TRI->isVectorRegister(MRI, Def.getReg()))
769     return WaitStatesNeeded;
770   Register Reg = Def.getReg();
771   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
772     int DataIdx = createsVALUHazard(MI);
773     return DataIdx >= 0 &&
774            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
775   };
776   int WaitStatesNeededForDef =
777     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
778   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
779 
780   return WaitStatesNeeded;
781 }
782 
783 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
784   // This checks for the hazard where VMEM instructions that store more than
785   // 8 bytes can have their store data overwritten by the next instruction.
786   if (!ST.has12DWordStoreHazard())
787     return 0;
788 
789   const MachineRegisterInfo &MRI = MF.getRegInfo();
790   int WaitStatesNeeded = 0;
791 
792   for (const MachineOperand &Def : VALU->defs()) {
793     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
794   }
795 
796   return WaitStatesNeeded;
797 }
798 
799 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
800   // This checks for hazards associated with inline asm statements.
801   // Since inline asms can contain just about anything, we use this
802   // to call/leverage other check*Hazard routines. Note that
803   // this function doesn't attempt to address all possible inline asm
804   // hazards (good luck), but is a collection of what has been
805   // problematic thus far.
806 
807   // see checkVALUHazards()
808   if (!ST.has12DWordStoreHazard())
809     return 0;
810 
811   const MachineRegisterInfo &MRI = MF.getRegInfo();
812   int WaitStatesNeeded = 0;
813 
814   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
815        I != E; ++I) {
816     const MachineOperand &Op = IA->getOperand(I);
817     if (Op.isReg() && Op.isDef()) {
818       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
819     }
820   }
821 
822   return WaitStatesNeeded;
823 }
824 
825 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
826   const SIInstrInfo *TII = ST.getInstrInfo();
827   const SIRegisterInfo *TRI = ST.getRegisterInfo();
828   const MachineRegisterInfo &MRI = MF.getRegInfo();
829 
830   const MachineOperand *LaneSelectOp =
831       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
832 
833   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
834     return 0;
835 
836   Register LaneSelectReg = LaneSelectOp->getReg();
837   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
838 
839   const int RWLaneWaitStates = 4;
840   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
841                                               RWLaneWaitStates);
842   return RWLaneWaitStates - WaitStatesSince;
843 }
844 
845 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
846   if (!ST.hasRFEHazards())
847     return 0;
848 
849   const SIInstrInfo *TII = ST.getInstrInfo();
850 
851   const int RFEWaitStates = 1;
852 
853   auto IsHazardFn = [TII](const MachineInstr &MI) {
854     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
855   };
856   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
857   return RFEWaitStates - WaitStatesNeeded;
858 }
859 
860 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
861   const SIInstrInfo *TII = ST.getInstrInfo();
862   const int SMovRelWaitStates = 1;
863   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
864   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
865                                                    SMovRelWaitStates);
866 }
867 
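// Pre-emit fixups that mitigate hazards by inserting real instructions
// (waitcnts, moves) rather than plain s_nops.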
868 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
869   fixVMEMtoScalarWriteHazards(MI);
870   fixVcmpxPermlaneHazards(MI);
871   fixSMEMtoVectorWriteHazards(MI);
872   fixVcmpxExecWARHazard(MI);
873   fixLdsBranchVmemWARHazard(MI);
874 }
875 
876 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
877   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
878     return false;
879 
880   const SIInstrInfo *TII = ST.getInstrInfo();
881   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
882 
883   auto IsExpiredFn = [](const MachineInstr &MI, int) {
884     unsigned Opc = MI.getOpcode();
885     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
886            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
887   };
888 
889   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
890       std::numeric_limits<int>::max())
891     return false;
892 
893   // V_NOP will be discarded by SQ.
894   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
895   // which is always a VGPR and available.
896   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
897   Register Reg = Src0->getReg();
898   bool IsUndef = Src0->isUndef();
899   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
900           TII->get(AMDGPU::V_MOV_B32_e32))
901     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
902     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
903 
904   return true;
905 }
906 
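// A SALU/SMEM write to a register still read by an in-flight VMEM, DS or
// FLAT access (WAR) is mitigated by inserting s_waitcnt_depctr 0xffe3,
// unless an intervening VALU or zero s_waitcnt already resolves it.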
907 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
908   if (!ST.hasVMEMtoScalarWriteHazard())
909     return false;
910 
911   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
912     return false;
913 
914   if (MI->getNumDefs() == 0)
915     return false;
916 
917   const SIRegisterInfo *TRI = ST.getRegisterInfo();
918 
919   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
920     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
921         !SIInstrInfo::isFLAT(I))
922       return false;
923 
924     for (const MachineOperand &Def : MI->defs()) {
925       const MachineOperand *Op =
926           I.findRegisterUseOperand(Def.getReg(), false, TRI);
927       if (!Op)
928         continue;
929       return true;
930     }
931     return false;
932   };
933 
934   auto IsExpiredFn = [](const MachineInstr &MI, int) {
935     return SIInstrInfo::isVALU(MI) ||
936            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
937             !MI.getOperand(0).getImm()) ||
938            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
939             MI.getOperand(0).getImm() == 0xffe3);
940   };
941 
942   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
943       std::numeric_limits<int>::max())
944     return false;
945 
946   const SIInstrInfo *TII = ST.getInstrInfo();
947   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
948           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
949       .addImm(0xffe3);
950   return true;
951 }
952 
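// A VALU write to an SGPR that a preceding SMEM load still reads (WAR) is
// mitigated by inserting "s_mov_b32 null, 0", unless an intervening SALU or
// a wait that drains lgkmcnt already breaks the dependency.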
953 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
954   if (!ST.hasSMEMtoVectorWriteHazard())
955     return false;
956 
957   if (!SIInstrInfo::isVALU(*MI))
958     return false;
959 
960   unsigned SDSTName;
961   switch (MI->getOpcode()) {
962   case AMDGPU::V_READLANE_B32:
963   case AMDGPU::V_READFIRSTLANE_B32:
964     SDSTName = AMDGPU::OpName::vdst;
965     break;
966   default:
967     SDSTName = AMDGPU::OpName::sdst;
968     break;
969   }
970 
971   const SIInstrInfo *TII = ST.getInstrInfo();
972   const SIRegisterInfo *TRI = ST.getRegisterInfo();
973   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
974   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
975   if (!SDST) {
976     for (const auto &MO : MI->implicit_operands()) {
977       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
978         SDST = &MO;
979         break;
980       }
981     }
982   }
983 
984   if (!SDST)
985     return false;
986 
987   const Register SDSTReg = SDST->getReg();
988   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
989     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
990   };
991 
992   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
993     if (TII->isSALU(MI)) {
994       switch (MI.getOpcode()) {
995       case AMDGPU::S_SETVSKIP:
996       case AMDGPU::S_VERSION:
997       case AMDGPU::S_WAITCNT_VSCNT:
998       case AMDGPU::S_WAITCNT_VMCNT:
999       case AMDGPU::S_WAITCNT_EXPCNT:
1000         // These instructions cannot mitigate the hazard.
1001         return false;
1002       case AMDGPU::S_WAITCNT_LGKMCNT:
1003         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1004         return (MI.getOperand(1).getImm() == 0) &&
1005                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1006       case AMDGPU::S_WAITCNT: {
1007         const int64_t Imm = MI.getOperand(0).getImm();
1008         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1009         return (Decoded.LgkmCnt == 0);
1010       }
1011       default:
1012         // SOPP instructions cannot mitigate the hazard.
1013         if (TII->isSOPP(MI))
1014           return false;
1015         // At this point the SALU can be assumed to mitigate the hazard
1016         // because either:
1017         // (a) it is independent of the at risk SMEM (breaking chain),
1018         // or
1019         // (b) it is dependent on the SMEM, in which case an appropriate
1020         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1021         //     SMEM instruction.
1022         return true;
1023       }
1024     }
1025     return false;
1026   };
1027 
1028   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1029       std::numeric_limits<int>::max())
1030     return false;
1031 
1032   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1033           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1034       .addImm(0);
1035   return true;
1036 }
1037 
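// A VALU write to EXEC (e.g. v_cmpx) following a non-VALU read of EXEC is a
// WAR hazard; insert s_waitcnt_depctr 0xfffe unless an intervening VALU SGPR
// write or an equivalent waitcnt already clears it.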
1038 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1039   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1040     return false;
1041 
1042   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1043   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1044     return false;
1045 
1046   auto IsHazardFn = [TRI](const MachineInstr &I) {
1047     if (SIInstrInfo::isVALU(I))
1048       return false;
1049     return I.readsRegister(AMDGPU::EXEC, TRI);
1050   };
1051 
1052   const SIInstrInfo *TII = ST.getInstrInfo();
1053   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1054     if (SIInstrInfo::isVALU(MI)) {
1055       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1056         return true;
1057       for (auto MO : MI.implicit_operands())
1058         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1059           return true;
1060     }
1061     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1062         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1063       return true;
1064     return false;
1065   };
1066 
1067   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1068       std::numeric_limits<int>::max())
1069     return false;
1070 
1071   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1072           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1073     .addImm(0xfffe);
1074   return true;
1075 }
1076 
1077 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1078                                                  const GCNSubtarget &ST) {
1079   if (!ST.hasLdsBranchVmemWARHazard())
1080     return false;
1081 
1082   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1083   // instructions need to appear in the same function.
1084   bool HasLds = false;
1085   bool HasVmem = false;
1086   for (auto &MBB : MF) {
1087     for (auto &MI : MBB) {
1088       HasLds |= SIInstrInfo::isDS(MI);
1089       HasVmem |=
1090           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1091       if (HasLds && HasVmem)
1092         return true;
1093     }
1094   }
1095   return false;
1096 }
1097 
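// An LDS access and a VMEM access on opposite sides of a branch can form a
// WAR hazard; break it by inserting "s_waitcnt_vscnt null, 0" before this
// instruction.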
1098 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1099   if (!RunLdsBranchVmemWARHazardFixup)
1100     return false;
1101 
1102   assert(ST.hasLdsBranchVmemWARHazard());
1103 
1104   auto IsHazardInst = [](const MachineInstr &MI) {
1105     if (SIInstrInfo::isDS(MI))
1106       return 1;
1107     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1108       return 2;
1109     return 0;
1110   };
1111 
1112   auto InstType = IsHazardInst(*MI);
1113   if (!InstType)
1114     return false;
1115 
1116   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1117     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1118                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1119                                !I.getOperand(1).getImm());
1120   };
1121 
1122   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1123     if (!I.isBranch())
1124       return false;
1125 
1126     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1127       auto InstType2 = IsHazardInst(I);
1128       return InstType2 && InstType != InstType2;
1129     };
1130 
1131     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1132       auto InstType2 = IsHazardInst(I);
1133       if (InstType == InstType2)
1134         return true;
1135 
1136       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1137              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1138              !I.getOperand(1).getImm();
1139     };
1140 
1141     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1142            std::numeric_limits<int>::max();
1143   };
1144 
1145   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1146       std::numeric_limits<int>::max())
1147     return false;
1148 
1149   const SIInstrInfo *TII = ST.getInstrInfo();
1150   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1151           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1152     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1153     .addImm(0);
1154 
1155   return true;
1156 }
1157 
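// A MUBUF/MTBUF whose offset has bit 1 or 2 set, issued right after a gfx10
// NSA-encoded MIMG instruction of 16 bytes or more, requires one wait state.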
1158 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1159   int NSAtoVMEMWaitStates = 1;
1160 
1161   if (!ST.hasNSAtoVMEMBug())
1162     return 0;
1163 
1164   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1165     return 0;
1166 
1167   const SIInstrInfo *TII = ST.getInstrInfo();
1168   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1169   if (!Offset || (Offset->getImm() & 6) == 0)
1170     return 0;
1171 
1172   auto IsHazardFn = [TII](const MachineInstr &I) {
1173     if (!SIInstrInfo::isMIMG(I))
1174       return false;
1175     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1176     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1177            TII->getInstSizeInBytes(I) >= 16;
1178   };
1179 
1180   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1181 }
1182 
1183 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1184   int FPAtomicToDenormModeWaitStates = 3;
1185 
1186   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1187     return 0;
1188 
1189   auto IsHazardFn = [](const MachineInstr &I) {
1190     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1191       return false;
1192     return SIInstrInfo::isFPAtomic(I);
1193   };
1194 
1195   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1196     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1197       return true;
1198 
1199     switch (MI.getOpcode()) {
1200     case AMDGPU::S_WAITCNT:
1201     case AMDGPU::S_WAITCNT_VSCNT:
1202     case AMDGPU::S_WAITCNT_VMCNT:
1203     case AMDGPU::S_WAITCNT_EXPCNT:
1204     case AMDGPU::S_WAITCNT_LGKMCNT:
1205     case AMDGPU::S_WAIT_IDLE:
1206       return true;
1207     default:
1208       break;
1209     }
1210 
1211     return false;
1212   };
1213 
1214   return FPAtomicToDenormModeWaitStates -
1215          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1216 }
1217 
1218 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1219   assert(SIInstrInfo::isMAI(*MI));
1220 
1221   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1222 }
1223 
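// gfx908 MAI hazards: MFMA and v_accvgpr_read/write operands written by an
// earlier VALU, MFMA or v_accvgpr_write need the wait-state counts below;
// the MFMA-producer cases scale with its latency (4x4 / 16x16 / 32x32).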
1224 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1225   int WaitStatesNeeded = 0;
1226   unsigned Opc = MI->getOpcode();
1227 
1228   auto IsVALUFn = [](const MachineInstr &MI) {
1229     return SIInstrInfo::isVALU(MI);
1230   };
1231 
1232   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1233     const int LegacyVALUWritesVGPRWaitStates = 2;
1234     const int VALUWritesExecWaitStates = 4;
1235     const int MaxWaitStates = 4;
1236 
1237     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1238       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1239     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1240 
1241     if (WaitStatesNeeded < MaxWaitStates) {
1242       for (const MachineOperand &Use : MI->explicit_uses()) {
1243         const int MaxWaitStates = 2;
1244 
1245         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1246           continue;
1247 
1248         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1249           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1250         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1251 
1252         if (WaitStatesNeeded == MaxWaitStates)
1253           break;
1254       }
1255     }
1256   }
1257 
1258   auto IsMFMAFn = [](const MachineInstr &MI) {
1259     return SIInstrInfo::isMAI(MI) &&
1260            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1261            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1262   };
1263 
1264   for (const MachineOperand &Op : MI->explicit_operands()) {
1265     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1266       continue;
1267 
1268     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1269       continue;
1270 
1271     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1272     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1273     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1274     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1275     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1276     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1277     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1278     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1279     const int MaxWaitStates = 18;
1280     Register Reg = Op.getReg();
1281     unsigned HazardDefLatency = 0;
1282 
1283     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1284                                this](const MachineInstr &MI) {
1285       if (!IsMFMAFn(MI))
1286         return false;
1287       Register DstReg = MI.getOperand(0).getReg();
1288       if (DstReg == Reg)
1289         return false;
1290       HazardDefLatency =
1291           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1292       return TRI.regsOverlap(DstReg, Reg);
1293     };
1294 
1295     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1296                                                    MaxWaitStates);
1297     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1298     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1299     int OpNo = MI->getOperandNo(&Op);
1300     if (OpNo == SrcCIdx) {
1301       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1302     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1303       switch (HazardDefLatency) {
1304       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1305                break;
1306       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1307                break;
1308       case 16: LLVM_FALLTHROUGH;
1309       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1310                break;
1311       }
1312     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1313       switch (HazardDefLatency) {
1314       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1315                break;
1316       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1317                break;
1318       case 16: LLVM_FALLTHROUGH;
1319       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1320                break;
1321       }
1322     }
1323 
1324     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1325     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1326 
1327     if (WaitStatesNeeded == MaxWaitStates)
1328       return WaitStatesNeeded; // Early exit.
1329 
1330     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1331       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1332         return false;
1333       Register DstReg = MI.getOperand(0).getReg();
1334       return TRI.regsOverlap(Reg, DstReg);
1335     };
1336 
1337     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1338     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1339     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1340     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1341     if (OpNo == SrcCIdx)
1342       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1343     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1344       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1345 
1346     WaitStatesNeededForUse = NeedWaitStates -
1347       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1348     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1349 
1350     if (WaitStatesNeeded == MaxWaitStates)
1351       return WaitStatesNeeded; // Early exit.
1352   }
1353 
1354   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1355     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1356     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1357     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1358     const int MaxWaitStates = 13;
1359     Register DstReg = MI->getOperand(0).getReg();
1360     unsigned HazardDefLatency = 0;
1361 
1362     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
1363                          this](const MachineInstr &MI) {
1364       if (!IsMFMAFn(MI))
1365         return false;
1366       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1367       HazardDefLatency =
1368           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1369       return TRI.regsOverlap(Reg, DstReg);
1370     };
1371 
1372     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1373     int NeedWaitStates;
1374     switch (HazardDefLatency) {
1375     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1376              break;
1377     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1378              break;
1379     case 16: LLVM_FALLTHROUGH;
1380     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1381              break;
1382     }
1383 
1384     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1385     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1386   }
1387 
1388   return WaitStatesNeeded;
1389 }
1390 
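// gfx90a MAI hazards: an MFMA whose sources were written by an earlier MFMA
// or legacy VALU needs the wait-state counts below, which depend on whether
// either instruction is a DGEMM and on the producer's latency.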
1391 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1392   int WaitStatesNeeded = 0;
1393   unsigned Opc = MI->getOpcode();
1394 
1395   auto IsMFMAFn = [](const MachineInstr &MI) {
1396     return SIInstrInfo::isMAI(MI) &&
1397            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1398            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1399   };
1400 
1401   auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
1402     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
1403   };
1404 
1405   auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
1406     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
1407   };
1408 
1409   if (!IsMFMAFn(*MI))
1410     return WaitStatesNeeded;
1411 
1412   const int VALUWritesExecWaitStates = 4;
1413   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1414     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
1415                           VALUWritesExecWaitStates);
1416   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1417 
1418   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1419 
1420   // Loop for both DGEMM and S/HGEMM 2nd instruction.
1421   for (const MachineOperand &Use : MI->explicit_uses()) {
1422     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
1423     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
1424     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
1425     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
1426     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
1427     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
1428     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
1429     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
1430     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
1431     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
1432     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
1433     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
1434     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
1435     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
1436     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
1437     const int MaxWaitStates = 19;
1438 
1439     if (!Use.isReg())
1440       continue;
1441     unsigned Reg = Use.getReg();
1442     bool FullReg;
1443     const MachineInstr *MI1;
1444 
1445     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
1446                                this](const MachineInstr &MI) {
1447       if (!IsMFMAFn(MI))
1448         return false;
1449       Register DstReg = MI.getOperand(0).getReg();
1450       FullReg = (DstReg == Reg);
1451       MI1 = &MI;
1452       return TRI.regsOverlap(DstReg, Reg);
1453     };
1454 
1455     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
1456       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
1457     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1458 
1459     int NumWaitStates =
1460         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
1461     if (NumWaitStates == std::numeric_limits<int>::max())
1462       continue;
1463 
1464     int OpNo = MI->getOperandNo(&Use);
1465     unsigned Opc1 = MI1->getOpcode();
1466     int NeedWaitStates = 0;
1467     if (OpNo == SrcCIdx) {
1468       if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
1469         NeedWaitStates = 0;
1470       } else if (FullReg) {
1471         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1472              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
1473             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1474              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
1475           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
1476       } else {
1477         switch (Opc1) {
1478         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1479         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1480           if (!isXDL(ST, *MI))
1481             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
1482           break;
1483         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1484         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1485           if (!isXDL(ST, *MI))
1486             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
1487           break;
1488         default:
1489           switch (TSchedModel.computeInstrLatency(MI1)) {
1490           case 2:
1491             NeedWaitStates = isDGEMM(Opc)
1492               ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
1493               : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
1494             break;
1495           case 8:
1496             NeedWaitStates = isDGEMM(Opc)
1497               ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
1498               : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
1499             break;
1500           case 16: LLVM_FALLTHROUGH;
1501           default:
1502             NeedWaitStates = isDGEMM(Opc)
1503               ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
1504               : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
1505           }
1506         }
1507       }
1508     } else {
1509       switch (Opc1) {
1510       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1511       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1512         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
1513         break;
1514       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1515       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1516         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
1517         break;
1518       default:
1519         switch (TSchedModel.computeInstrLatency(MI1)) {
1520         case 2:
1521           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
1522           break;
1523         case 8:
1524           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
1525           break;
1526         case 16: LLVM_FALLTHROUGH;
1527         default:
1528           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
1529         }
1530       }
1531     }
1532     if (WaitStatesNeeded >= NeedWaitStates)
1533       continue;
1534 
1535     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
1536     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1537 
1538     if (WaitStatesNeeded == MaxWaitStates)
1539       break;
1540   }
1541 
1542   return WaitStatesNeeded;
1543 }
1544 
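     // Extra wait states required by a load/store instruction whose VGPR
     // operand was recently written by v_accvgpr_read, or by an accvgpr
     // read/write that itself closely follows a VALU write of the same
     // register. Only relevant on pre-gfx90a targets with MAI instructions.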
1545 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1546   // On gfx90a+, the relevant hazards are checked in checkMAIVALUHazards().
1547   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
1548     return 0;
1549 
1550   int WaitStatesNeeded = 0;
1551 
1552   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1553     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1554   };
1555 
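       // For each VGPR this instruction uses, check for a recent
       // v_accvgpr_read def, and for a recent accvgpr read/write that itself
       // depends on a plain (non-MAI) VALU write of the same register.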
1556   for (const MachineOperand &Op : MI->explicit_uses()) {
1557     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1558       continue;
1559 
1560     Register Reg = Op.getReg();
1561 
1562     const int AccVgprReadLdStWaitStates = 2;
1563     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1564     const int MaxWaitStates = 2;
1565 
1566     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1567       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1568     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1569 
1570     if (WaitStatesNeeded == MaxWaitStates)
1571       return WaitStatesNeeded; // Early exit.
1572 
1573     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1574       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1575           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1576         return false;
1577       auto IsVALUFn = [](const MachineInstr &MI) {
1578         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
1579       };
1580       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1581              std::numeric_limits<int>::max();
1582     };
1583 
1584     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1585       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1586     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1587   }
1588 
1589   return WaitStatesNeeded;
1590 }
1591 
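     // Check gfx90a+ hazards between MFMA/DOT instructions and other VALU,
     // memory, and export instructions touching the same VGPRs:
     //  - RAW: this instruction reads a VGPR recently written by an MFMA/DOT.
     //  - WAW: this instruction writes a VGPR recently written by an MFMA/DOT.
     //  - WAR: this instruction writes a VGPR recently read as SrcC by a
     //    single-precision MFMA.
     // In addition, V_FMA_F64/V_FMAC_F64 needs a small gap after any DGEMM,
     // independent of register overlap.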
1592 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
1593   if (!ST.hasGFX90AInsts())
1594     return 0;
1595 
1596   auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
1597     return SIInstrInfo::isMAI(MI) &&
1598            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1599            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1600   };
1601 
1602   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
1603     return isDGEMM(MI.getOpcode());
1604   };
1605 
1606   // Hazards when MI is itself an MFMA are checked in checkMAIHazards90A().
1607   if (IsMFMAFn(*MI))
1608     return 0;
1609 
1610   int WaitStatesNeeded = 0;
1611 
1612   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
1613                        SIInstrInfo::isFLAT(*MI) ||
1614                        SIInstrInfo::isDS(*MI) ||
1615                        SIInstrInfo::isEXP(*MI);
1616   bool IsVALU = SIInstrInfo::isVALU(*MI);
1617 
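       // The two predicates below record, via MFMA and DOT, the most recent
       // MFMA or DOT instruction whose destination overlaps Reg; Reg is set
       // per operand in the loops that follow.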
1618   const MachineInstr *MFMA = nullptr;
1619   unsigned Reg;
1620   auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) {
1621     if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1622       return false;
1623     MFMA = &MI;
1624     return true;
1625   };
1626 
1627   const MachineInstr *DOT = nullptr;
1628   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
1629     if (!SIInstrInfo::isDOT(MI) ||
1630         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1631       return false;
1632     DOT = &MI;
1633     return true;
1634   };
1635 
1636   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1637                                            AMDGPU::OpName::src2);
1638 
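       // RAW: a VGPR used here may have been written by a DOT instruction or
       // by an MFMA within the last few wait states; the required gap depends
       // on the producer's latency and on whether this consumer is a
       // memory/export instruction or a VALU.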
1639   if (IsMemOrExport || IsVALU) {
1640     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
1641     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
1642     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
1643     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
1644     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
1645     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
1646     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
1647     const int DotWriteSameDotReadSrcAB = 3;
1648     const int DotWriteDifferentVALURead = 3;
1649     const int MaxWaitStates = 19;
1650 
1651     for (const MachineOperand &Use : MI->explicit_uses()) {
1652       if (!Use.isReg())
1653         continue;
1654       Reg = Use.getReg();
1655 
1656       DOT = nullptr;
1657       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1658                                                      MaxWaitStates);
1659       if (DOT) {
1660         int NeedWaitStates = 0;
1661         if (DOT->getOpcode() == MI->getOpcode()) {
1662           if (&Use - &MI->getOperand(0) != SrcCIdx)
1663             NeedWaitStates = DotWriteSameDotReadSrcAB;
1664         } else {
1665           NeedWaitStates = DotWriteDifferentVALURead;
1666         }
1667 
1668         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1669         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1670       }
1671 
1672       MFMA = nullptr;
1673       WaitStatesSinceDef =
1674           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
1675       if (!MFMA)
1676         continue;
1677 
1678       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1679       int NeedWaitStates = MaxWaitStates;
1680       switch (HazardDefLatency) {
1681       case 2:
1682         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
1683         break;
1684       case 4:
1685         assert(isDGEMM(MFMA->getOpcode()));
1686         NeedWaitStates =
1687             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
1688                           : DMFMA4x4WriteVgprVALUReadWaitStates;
1689         break;
1690       case 8:
1691         NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
1692         break;
1693       case 16: LLVM_FALLTHROUGH;
1694       default:
1695         NeedWaitStates =
1696           isDGEMM(MFMA->getOpcode())
1697             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
1698                             : DMFMA16x16WriteVgprVALUReadWaitStates
1699             : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
1700         break;
1701       }
1702 
1703       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1704       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1705 
1706       if (WaitStatesNeeded == MaxWaitStates)
1707         break;
1708     }
1709   }
1710 
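       // A double-precision FMA issued shortly after any DGEMM needs a small
       // gap regardless of register overlap.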
1711   unsigned Opc = MI->getOpcode();
1712   const int DMFMAToFMA64WaitStates = 2;
1713   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
1714        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
1715        Opc == AMDGPU::V_FMAC_F64_dpp) &&
1716       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
1717     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
1718       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
1719     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1720   }
1721 
1722   if (!IsVALU && !IsMemOrExport)
1723     return WaitStatesNeeded;
1724 
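       // WAW/WAR: a VGPR defined here may have been written recently by an
       // MFMA or DOT instruction, or read as SrcC by a single-precision MFMA.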
1725   for (const MachineOperand &Def : MI->defs()) {
1726     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
1727     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
1728     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
1729     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
1730     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
1731     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
1732     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
1733     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
1734     const int DotWriteDifferentVALUWrite = 3;
1735     const int MaxWaitStates = 19;
1736     const int MaxWarWaitStates = 15;
1737 
1738     Reg = Def.getReg();
1739 
1740     DOT = nullptr;
1741     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1742                                                    MaxWaitStates);
1743     if (DOT && DOT->getOpcode() != MI->getOpcode())
1744       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
1745                                                     WaitStatesSinceDef);
1746 
1747     MFMA = nullptr;
1748     WaitStatesSinceDef =
1749         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
1750     if (MFMA) {
1751       int NeedWaitStates = MaxWaitStates;
1752       switch (TSchedModel.computeInstrLatency(MFMA)) {
1753       case 2:
1754         NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
1755         break;
1756       case 4:
1757         assert(isDGEMM(MFMA->getOpcode()));
1758         NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
1759         break;
1760       case 8:
1761         NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
1762         break;
1763       case 16: LLVM_FALLTHROUGH;
1764       default:
1765         NeedWaitStates = isDGEMM(MFMA->getOpcode())
1766                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
1767                    : SMFMA32x32WriteVgprVALUWawWaitStates;
1768         break;
1769       }
1770 
1771       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1772       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1773 
1774       if (WaitStatesNeeded == MaxWaitStates)
1775         break;
1776     }
1777 
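         // WAR: match a single-precision (non-DGEMM) MFMA that reads this
         // register through its SrcC (src2) operand.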
1778     auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
1779                              this](const MachineInstr &MI) {
1780       if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
1781           !MI.readsRegister(Reg, &TRI))
1782         return false;
1783 
1784       const MachineOperand *SrcC =
1785           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
1786       assert(SrcC);
1787       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
1788         return false;
1789 
1790       MFMA = &MI;
1791       return true;
1792     };
1793 
1794     MFMA = nullptr;
1795     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
1796                                                 MaxWarWaitStates);
1797     if (!MFMA)
1798       continue;
1799 
1800     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1801     int NeedWaitStates = MaxWaitStates;
1802     switch (HazardDefLatency) {
1803     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
1804              break;
1805     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
1806              break;
1807     case 16: LLVM_FALLTHROUGH;
1808     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
1809              break;
1810     }
1811 
1812     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
1813     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1814   }
1815 
1816   return WaitStatesNeeded;
1817 }
1818 
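     // Ask the scheduler to prefer a different candidate when this SUnit is
     // an MFMA and the most recently issued MFMA's latency has not yet
     // elapsed, so MFMAs are not placed back to back while one is in flight.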
1819 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1820   if (!SU->isInstr())
1821     return false;
1822 
1823   const MachineInstr *MAI = nullptr;
1824   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
1825     MAI = nullptr;
1826     if (SIInstrInfo::isMAI(MI) &&
1827         MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1828         MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1829       MAI = &MI;
1830     return MAI != nullptr;
1831   };
1832 
1833   MachineInstr *MI = SU->getInstr();
1834   if (IsMFMAFn(*MI)) {
1835     int W = getWaitStatesSince(IsMFMAFn, 16);
1836     if (MAI)
1837       return W < (int)TSchedModel.computeInstrLatency(MAI);
1838   }
1839 
1840   return false;
1841 }
1842