1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/ScheduleDAG.h"
20 #include "llvm/TargetParser/TargetParser.h"
21 
22 using namespace llvm;
23 
24 namespace {
25 
26 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28 
29   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30     if (Arg.getAsInteger(0, Value))
31       return O.error("'" + Arg + "' value invalid for uint argument!");
32 
33     if (Value > 100)
34       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35 
36     return false;
37   }
38 };
39 
40 } // end anonymous namespace
41 
42 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44                      cl::desc("Fill a percentage of the latency between "
45                               "neighboring MFMA with s_nops."));
46 
47 //===----------------------------------------------------------------------===//
48 // Hazard Recognizer Implementation
49 //===----------------------------------------------------------------------===//
50 
51 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52                                                  const GCNSubtarget &ST);
53 
54 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
55   IsHazardRecognizerMode(false),
56   CurrCycleInstr(nullptr),
57   MF(MF),
58   ST(MF.getSubtarget<GCNSubtarget>()),
59   TII(*ST.getInstrInfo()),
60   TRI(TII.getRegisterInfo()),
61   ClauseUses(TRI.getNumRegUnits()),
62   ClauseDefs(TRI.getNumRegUnits()) {
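  // Use a deeper lookahead window when AGPRs are in use, since the MAI/MFMA
  // related hazards span more wait states than any other hazard.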
63   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
64   TSchedModel.init(&ST);
65   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66 }
67 
68 void GCNHazardRecognizer::Reset() {
69   EmittedInstrs.clear();
70 }
71 
72 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73   EmitInstruction(SU->getInstr());
74 }
75 
76 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77   CurrCycleInstr = MI;
78 }
79 
80 static bool isDivFMas(unsigned Opcode) {
81   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82 }
83 
84 static bool isSGetReg(unsigned Opcode) {
85   return Opcode == AMDGPU::S_GETREG_B32;
86 }
87 
88 static bool isSSetReg(unsigned Opcode) {
89   switch (Opcode) {
90   case AMDGPU::S_SETREG_B32:
91   case AMDGPU::S_SETREG_B32_mode:
92   case AMDGPU::S_SETREG_IMM32_B32:
93   case AMDGPU::S_SETREG_IMM32_B32_mode:
94     return true;
95   }
96   return false;
97 }
98 
99 static bool isRWLane(unsigned Opcode) {
100   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101 }
102 
103 static bool isRFE(unsigned Opcode) {
104   return Opcode == AMDGPU::S_RFE_B64;
105 }
106 
107 static bool isSMovRel(unsigned Opcode) {
108   switch (Opcode) {
109   case AMDGPU::S_MOVRELS_B32:
110   case AMDGPU::S_MOVRELS_B64:
111   case AMDGPU::S_MOVRELD_B32:
112   case AMDGPU::S_MOVRELD_B64:
113     return true;
114   default:
115     return false;
116   }
117 }
118 
119 static bool isDGEMM(unsigned Opcode) {
120   return AMDGPU::getMAIIsDGEMM(Opcode);
121 }
122 
123 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
124   unsigned Opcode = MI.getOpcode();
125 
126   if (!SIInstrInfo::isMAI(MI) ||
127       isDGEMM(Opcode) ||
128       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
129       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
130     return false;
131 
132   if (!ST.hasGFX940Insts())
133     return true;
134 
135   return AMDGPU::getMAIIsGFX940XDL(Opcode);
136 }
137 
138 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
139                                     const MachineInstr &MI) {
140   if (TII.isAlwaysGDS(MI.getOpcode()))
141     return true;
142 
143   switch (MI.getOpcode()) {
144   case AMDGPU::S_SENDMSG:
145   case AMDGPU::S_SENDMSGHALT:
146   case AMDGPU::S_TTRACEDATA:
147     return true;
148   // These DS opcodes don't support GDS.
149   case AMDGPU::DS_NOP:
150   case AMDGPU::DS_PERMUTE_B32:
151   case AMDGPU::DS_BPERMUTE_B32:
152     return false;
153   default:
154     if (TII.isDS(MI.getOpcode())) {
155       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
156                                            AMDGPU::OpName::gds);
157       if (MI.getOperand(GDS).getImm())
158         return true;
159     }
160     return false;
161   }
162 }
163 
164 static bool isPermlane(const MachineInstr &MI) {
165   unsigned Opcode = MI.getOpcode();
166   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
167          Opcode == AMDGPU::V_PERMLANE64_B32 ||
168          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
169          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
170          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
171 }
172 
173 static bool isLdsDma(const MachineInstr &MI) {
174   return SIInstrInfo::isVALU(MI) &&
175          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
176 }
177 
178 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
179   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
180                                                      AMDGPU::OpName::simm16);
181   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
182 }
183 
184 ScheduleHazardRecognizer::HazardType
185 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
186   MachineInstr *MI = SU->getInstr();
187   // If we are not in "HazardRecognizerMode", we are being run from the
188   // scheduler; track possible stalls from hazards but don't insert noops.
189   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
190 
191   if (MI->isBundle())
192     return NoHazard;
193 
194   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
195     return HazardType;
196 
197   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
198     return HazardType;
199 
200   if (checkFPAtomicToDenormModeHazard(MI) > 0)
201     return HazardType;
202 
203   if (ST.hasNoDataDepHazard())
204     return NoHazard;
205 
206   // FIXME: Should flat be considered vmem?
207   if ((SIInstrInfo::isVMEM(*MI) ||
208        SIInstrInfo::isFLAT(*MI))
209       && checkVMEMHazards(MI) > 0)
210     return HazardType;
211 
212   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
213     return HazardType;
214 
215   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
216     return HazardType;
217 
218   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
219     return HazardType;
220 
221   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
222     return HazardType;
223 
224   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
225        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
226        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
227     return HazardType;
228 
229   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
230     return HazardType;
231 
232   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
233     return HazardType;
234 
235   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
236     return HazardType;
237 
238   if (((ST.hasReadM0MovRelInterpHazard() &&
239         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
240          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
241          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
242        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
243        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
244        (ST.hasReadM0LdsDirectHazard() &&
245         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
246       checkReadM0Hazards(MI) > 0)
247     return HazardType;
248 
249   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
250     return HazardType;
251 
252   if ((SIInstrInfo::isVMEM(*MI) ||
253        SIInstrInfo::isFLAT(*MI) ||
254        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
255     return HazardType;
256 
257   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
258     return HazardType;
259 
260   return NoHazard;
261 }
262 
263 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
264                                 unsigned Quantity) {
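  // A single S_NOP covers at most 8 wait states; its immediate encodes the
  // wait-state count minus one.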
265   while (Quantity > 0) {
266     unsigned Arg = std::min(Quantity, 8u);
267     Quantity -= Arg;
268     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
269         .addImm(Arg - 1);
270   }
271 }
272 
273 unsigned
274 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
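  // Take the MFMA pipeline depth from the scheduling model: the release cycle
  // of the instruction's first write processor resource.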
275   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
276   assert(TSchedModel.getWriteProcResBegin(SC) !=
277          TSchedModel.getWriteProcResEnd(SC));
278   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
279 }
280 
281 void GCNHazardRecognizer::processBundle() {
282   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
283   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
284   // Check bundled MachineInstr's for hazards.
285   for (; MI != E && MI->isInsideBundle(); ++MI) {
286     CurrCycleInstr = &*MI;
287     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
288 
289     if (IsHazardRecognizerMode) {
290       fixHazards(CurrCycleInstr);
291 
292       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
293     }
294 
295     // It's unnecessary to track more than MaxLookAhead instructions. Since we
296     // include the bundled MI directly after, only add a maximum of
297     // (MaxLookAhead - 1) noops to EmittedInstrs.
298     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
299       EmittedInstrs.push_front(nullptr);
300 
301     EmittedInstrs.push_front(CurrCycleInstr);
302     EmittedInstrs.resize(MaxLookAhead);
303   }
304   CurrCycleInstr = nullptr;
305 }
306 
307 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
308   assert(IsHazardRecognizerMode);
309 
310   unsigned NumPreNoops = PreEmitNoops(MI);
311   EmitNoops(NumPreNoops);
312   if (MI->isInsideBundle())
313     insertNoopsInBundle(MI, TII, NumPreNoops);
314   else
315     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
316                     NumPreNoops);
317   EmitInstruction(MI);
318   AdvanceCycle();
319 }
320 
321 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
322   IsHazardRecognizerMode = true;
323   CurrCycleInstr = MI;
324   unsigned W = PreEmitNoopsCommon(MI);
325   fixHazards(MI);
326   CurrCycleInstr = nullptr;
327   return W;
328 }
329 
330 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
331   if (MI->isBundle())
332     return 0;
333 
334   int WaitStates = 0;
335 
336   if (SIInstrInfo::isSMRD(*MI))
337     return std::max(WaitStates, checkSMRDHazards(MI));
338 
339   if (ST.hasNSAtoVMEMBug())
340     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
341 
342   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
343 
344   if (ST.hasNoDataDepHazard())
345     return WaitStates;
346 
347   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
348     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
349 
350   if (SIInstrInfo::isVALU(*MI))
351     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
352 
353   if (SIInstrInfo::isDPP(*MI))
354     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
355 
356   if (isDivFMas(MI->getOpcode()))
357     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
358 
359   if (isRWLane(MI->getOpcode()))
360     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
361 
362   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
363        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
364        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
365     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
366 
367   if (MI->isInlineAsm())
368     return std::max(WaitStates, checkInlineAsmHazards(MI));
369 
370   if (isSGetReg(MI->getOpcode()))
371     return std::max(WaitStates, checkGetRegHazards(MI));
372 
373   if (isSSetReg(MI->getOpcode()))
374     return std::max(WaitStates, checkSetRegHazards(MI));
375 
376   if (isRFE(MI->getOpcode()))
377     return std::max(WaitStates, checkRFEHazards(MI));
378 
379   if ((ST.hasReadM0MovRelInterpHazard() &&
380        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
381         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
382         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
383       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
384       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
385       (ST.hasReadM0LdsDirectHazard() &&
386        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
387     return std::max(WaitStates, checkReadM0Hazards(MI));
388 
389   if (SIInstrInfo::isMAI(*MI))
390     return std::max(WaitStates, checkMAIHazards(MI));
391 
392   if (SIInstrInfo::isVMEM(*MI) ||
393       SIInstrInfo::isFLAT(*MI) ||
394       SIInstrInfo::isDS(*MI))
395     return std::max(WaitStates, checkMAILdStHazards(MI));
396 
397   return WaitStates;
398 }
399 
400 void GCNHazardRecognizer::EmitNoop() {
401   EmittedInstrs.push_front(nullptr);
402 }
403 
404 void GCNHazardRecognizer::AdvanceCycle() {
405   // When the scheduler detects a stall, it will call AdvanceCycle() without
406   // emitting any instructions.
407   if (!CurrCycleInstr) {
408     EmittedInstrs.push_front(nullptr);
409     return;
410   }
411 
412   if (CurrCycleInstr->isBundle()) {
413     processBundle();
414     return;
415   }
416 
417   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
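  // Instructions that consume no wait states (e.g. meta instructions) are not
  // tracked in the lookahead window.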
418   if (!NumWaitStates) {
419     CurrCycleInstr = nullptr;
420     return;
421   }
422 
423   // Keep track of emitted instructions
424   EmittedInstrs.push_front(CurrCycleInstr);
425 
426   // Add a nullptr for each additional wait state after the first.  Make sure
427   // not to add more than getMaxLookAhead() items to the list, since we
428   // truncate the list to that size right after this loop.
429   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
430        i < e; ++i) {
431     EmittedInstrs.push_front(nullptr);
432   }
433 
434   // getMaxLookAhead() is the largest number of wait states we will ever need
435   // to insert, so there is no point in keeping track of more than that many
436   // wait states.
437   EmittedInstrs.resize(getMaxLookAhead());
438 
439   CurrCycleInstr = nullptr;
440 }
441 
442 void GCNHazardRecognizer::RecedeCycle() {
443   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
444 }
445 
446 //===----------------------------------------------------------------------===//
447 // Helper Functions
448 //===----------------------------------------------------------------------===//
449 
450 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
451 
452 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
453 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
454 
455 // Search for a hazard in a block and its predecessors.
456 template <typename StateT>
457 static bool
458 hasHazard(StateT State,
459           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
460           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
461           const MachineBasicBlock *MBB,
462           MachineBasicBlock::const_reverse_instr_iterator I,
463           DenseSet<const MachineBasicBlock *> &Visited) {
464   for (auto E = MBB->instr_rend(); I != E; ++I) {
465     // No need to look at parent BUNDLE instructions.
466     if (I->isBundle())
467       continue;
468 
469     switch (IsHazard(State, *I)) {
470     case HazardFound:
471       return true;
472     case HazardExpired:
473       return false;
474     default:
475       // Continue search
476       break;
477     }
478 
479     if (I->isInlineAsm() || I->isMetaInstruction())
480       continue;
481 
482     UpdateState(State, *I);
483   }
484 
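  // No hazard found (and not expired) within this block; continue the search
  // into each predecessor, visiting every block at most once.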
485   for (MachineBasicBlock *Pred : MBB->predecessors()) {
486     if (!Visited.insert(Pred).second)
487       continue;
488 
489     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
490                   Visited))
491       return true;
492   }
493 
494   return false;
495 }
496 
497 // Returns the minimum number of wait states since \p I, walking all
498 // predecessors. Only scans until \p IsExpired returns true.
499 // Can only be run in hazard recognizer mode.
500 static int getWaitStatesSince(
501     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
502     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
503     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
504     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
505   for (auto E = MBB->instr_rend(); I != E; ++I) {
506     // Don't add WaitStates for parent BUNDLE instructions.
507     if (I->isBundle())
508       continue;
509 
510     if (IsHazard(*I))
511       return WaitStates;
512 
513     if (I->isInlineAsm())
514       continue;
515 
516     WaitStates += GetNumWaitStates(*I);
517 
518     if (IsExpired(*I, WaitStates))
519       return std::numeric_limits<int>::max();
520   }
521 
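  // Reached the start of the block without finding the hazard or expiring;
  // take the minimum over all predecessors so the result stays conservative.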
522   int MinWaitStates = std::numeric_limits<int>::max();
523   for (MachineBasicBlock *Pred : MBB->predecessors()) {
524     if (!Visited.insert(Pred).second)
525       continue;
526 
527     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
528                                IsExpired, Visited, GetNumWaitStates);
529 
530     MinWaitStates = std::min(MinWaitStates, W);
531   }
532 
533   return MinWaitStates;
534 }
535 
536 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
537                               const MachineInstr *MI, IsExpiredFn IsExpired) {
538   DenseSet<const MachineBasicBlock *> Visited;
539   return getWaitStatesSince(IsHazard, MI->getParent(),
540                             std::next(MI->getReverseIterator()),
541                             0, IsExpired, Visited);
542 }
543 
544 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
545   if (IsHazardRecognizerMode) {
546     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
547       return WaitStates >= Limit;
548     };
549     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
550   }
551 
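  // Outside of hazard recognizer mode (i.e. when called from the scheduler),
  // scan the lookahead window of recently emitted instructions instead of
  // walking the MIR.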
552   int WaitStates = 0;
553   for (MachineInstr *MI : EmittedInstrs) {
554     if (MI) {
555       if (IsHazard(*MI))
556         return WaitStates;
557 
558       if (MI->isInlineAsm())
559         continue;
560     }
561     ++WaitStates;
562 
563     if (WaitStates >= Limit)
564       break;
565   }
566   return std::numeric_limits<int>::max();
567 }
568 
569 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
570                                                IsHazardFn IsHazardDef,
571                                                int Limit) {
572   const SIRegisterInfo *TRI = ST.getRegisterInfo();
573 
574   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
575     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
576   };
577 
578   return getWaitStatesSince(IsHazardFn, Limit);
579 }
580 
581 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
582                                                   int Limit) {
583   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
584     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
585   };
586 
587   return getWaitStatesSince(IsHazardFn, Limit);
588 }
589 
590 //===----------------------------------------------------------------------===//
591 // No-op Hazard Detection
592 //===----------------------------------------------------------------------===//
593 
594 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
595                         MCRegister Reg) {
596   for (MCRegUnit Unit : TRI.regunits(Reg))
597     BV.set(Unit);
598 }
599 
600 static void addRegsToSet(const SIRegisterInfo &TRI,
601                          iterator_range<MachineInstr::const_mop_iterator> Ops,
602                          BitVector &DefSet, BitVector &UseSet) {
603   for (const MachineOperand &Op : Ops) {
604     if (Op.isReg())
605       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
606   }
607 }
608 
609 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
610   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
611 }
612 
613 static bool breaksSMEMSoftClause(MachineInstr *MI) {
614   return !SIInstrInfo::isSMRD(*MI);
615 }
616 
617 static bool breaksVMEMSoftClause(MachineInstr *MI) {
618   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
619 }
620 
621 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
622   // SMEM soft clauses are only present on VI+, and only matter if xnack is
623   // enabled.
624   if (!ST.isXNACKEnabled())
625     return 0;
626 
627   bool IsSMRD = TII.isSMRD(*MEM);
628 
629   resetClause();
630 
631   // A soft-clause is any group of consecutive SMEM instructions.  The
632   // instructions in this group may return out of order and/or may be
633   // replayed (i.e. the same instruction issued more than once).
634   //
635   // In order to handle these situations correctly we need to make sure that
636   // when a clause has more than one instruction, no instruction in the clause
637   // writes to a register that is read by another instruction in the clause
638   // (including itself). If we encounter this situation, we need to break the
639   // clause by inserting a non-SMEM instruction.
640 
641   for (MachineInstr *MI : EmittedInstrs) {
642     // When we hit a non-SMEM instruction then we have passed the start of the
643     // clause and we can stop.
644     if (!MI)
645       break;
646 
647     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
648       break;
649 
650     addClauseInst(*MI);
651   }
652 
653   if (ClauseDefs.none())
654     return 0;
655 
656   // We need to make sure not to put loads and stores in the same clause if they
657   // use the same address. For now, just start a new clause whenever we see a
658   // store.
659   if (MEM->mayStore())
660     return 1;
661 
662   addClauseInst(*MEM);
663 
664   // If the set of defs and uses intersect then we cannot add this instruction
665   // to the clause, so we have a hazard.
666   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
667 }
668 
669 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
670   int WaitStatesNeeded = 0;
671 
672   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
673 
674   // This SMRD hazard only affects SI.
675   if (!ST.hasSMRDReadVALUDefHazard())
676     return WaitStatesNeeded;
677 
678   // A read of an SGPR by SMRD instruction requires 4 wait states when the
679   // SGPR was written by a VALU instruction.
680   int SmrdSgprWaitStates = 4;
681   auto IsHazardDefFn = [this](const MachineInstr &MI) {
682     return TII.isVALU(MI);
683   };
684   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
685     return TII.isSALU(MI);
686   };
687 
688   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
689 
690   for (const MachineOperand &Use : SMRD->uses()) {
691     if (!Use.isReg())
692       continue;
693     int WaitStatesNeededForUse =
694         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
695                                                    SmrdSgprWaitStates);
696     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
697 
698     // This fixes what appears to be undocumented hardware behavior in SI where
699     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
700     // need some number of nops in between. We don't know how many we need, but
701     // let's use 4. This wasn't discovered before probably because the only
702     // case when this happens is when we expand a 64-bit pointer into a full
703     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
704     // probably never encountered in closed-source land.
705     if (IsBufferSMRD) {
706       int WaitStatesNeededForUse =
707         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
708                                                    IsBufferHazardDefFn,
709                                                    SmrdSgprWaitStates);
710       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
711     }
712   }
713 
714   return WaitStatesNeeded;
715 }
716 
717 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
718   if (!ST.hasVMEMReadSGPRVALUDefHazard())
719     return 0;
720 
721   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
722 
723   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
724   // SGPR was written by a VALU Instruction.
725   const int VmemSgprWaitStates = 5;
726   auto IsHazardDefFn = [this](const MachineInstr &MI) {
727     return TII.isVALU(MI);
728   };
729   for (const MachineOperand &Use : VMEM->uses()) {
730     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
731       continue;
732 
733     int WaitStatesNeededForUse =
734         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
735                                                    VmemSgprWaitStates);
736     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
737   }
738   return WaitStatesNeeded;
739 }
740 
741 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
742   const SIRegisterInfo *TRI = ST.getRegisterInfo();
743   const SIInstrInfo *TII = ST.getInstrInfo();
744 
745   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
746   int DppVgprWaitStates = 2;
747   int DppExecWaitStates = 5;
748   int WaitStatesNeeded = 0;
749   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
750     return TII->isVALU(MI);
751   };
752 
753   for (const MachineOperand &Use : DPP->uses()) {
754     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
755       continue;
756     int WaitStatesNeededForUse =
757         DppVgprWaitStates - getWaitStatesSinceDef(
758                                 Use.getReg(),
759                                 [](const MachineInstr &) { return true; },
760                                 DppVgprWaitStates);
761     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
762   }
763 
764   WaitStatesNeeded = std::max(
765       WaitStatesNeeded,
766       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
767                                                 DppExecWaitStates));
768 
769   return WaitStatesNeeded;
770 }
771 
772 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
773   const SIInstrInfo *TII = ST.getInstrInfo();
774 
775   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
776   // instruction.
777   const int DivFMasWaitStates = 4;
778   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
779     return TII->isVALU(MI);
780   };
781   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
782                                                DivFMasWaitStates);
783 
784   return DivFMasWaitStates - WaitStatesNeeded;
785 }
786 
787 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
788   const SIInstrInfo *TII = ST.getInstrInfo();
789   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
790 
791   const int GetRegWaitStates = 2;
792   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
793     return GetRegHWReg == getHWReg(TII, MI);
794   };
795   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
796 
797   return GetRegWaitStates - WaitStatesNeeded;
798 }
799 
800 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
801   const SIInstrInfo *TII = ST.getInstrInfo();
802   unsigned HWReg = getHWReg(TII, *SetRegInstr);
803 
804   const int SetRegWaitStates = ST.getSetRegWaitStates();
805   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
806     return HWReg == getHWReg(TII, MI);
807   };
808   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
809   return SetRegWaitStates - WaitStatesNeeded;
810 }
811 
812 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
813   if (!MI.mayStore())
814     return -1;
815 
816   const SIInstrInfo *TII = ST.getInstrInfo();
817   unsigned Opcode = MI.getOpcode();
818   const MCInstrDesc &Desc = MI.getDesc();
819 
820   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
821   int VDataRCID = -1;
822   if (VDataIdx != -1)
823     VDataRCID = Desc.operands()[VDataIdx].RegClass;
824 
825   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
826     // There is no hazard if the instruction does not use vector regs
827     // (like wbinvl1)
828     if (VDataIdx == -1)
829       return -1;
830     // For MUBUF/MTBUF instructions this hazard only exists if the
831     // instruction is not using a register in the soffset field.
832     const MachineOperand *SOffset =
833         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
834     // If we have no soffset operand, then assume this field has been
835     // hardcoded to zero.
836     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
837         (!SOffset || !SOffset->isReg()))
838       return VDataIdx;
839   }
840 
841   // MIMG instructions create a hazard if they don't use a 256-bit T# and
842   // the store size is greater than 8 bytes and they have more than two bits
843   // of their dmask set.
844   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
845   if (TII->isMIMG(MI)) {
846     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
847     assert(SRsrcIdx != -1 &&
848            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
849     (void)SRsrcIdx;
850   }
851 
852   if (TII->isFLAT(MI)) {
853     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
854     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
855       return DataIdx;
856   }
857 
858   return -1;
859 }
860 
861 int
862 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
863                                             const MachineRegisterInfo &MRI) {
864   // Helper to check for the hazard where VMEM instructions that store more than
865   // 8 bytes can have their store data overwritten by the next instruction.
866   const SIRegisterInfo *TRI = ST.getRegisterInfo();
867 
868   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
869   int WaitStatesNeeded = 0;
870 
871   if (!TRI->isVectorRegister(MRI, Def.getReg()))
872     return WaitStatesNeeded;
873   Register Reg = Def.getReg();
874   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
875     int DataIdx = createsVALUHazard(MI);
876     return DataIdx >= 0 &&
877            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
878   };
879 
880   int WaitStatesNeededForDef =
881     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
882   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
883 
884   return WaitStatesNeeded;
885 }
886 
887 /// A dest sel forwarding issue occurs if additional logic is needed to
888 /// swizzle / pack the computed value into the correct bit position of the
889 /// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
890 /// have op_sel with dst_sel that is not aligned to the register. This
891 /// function analyzes the \p MI and \returns an operand with a dst forwarding
892 /// issue, or nullptr if none exists.
893 static const MachineOperand *
894 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
895   if (!SIInstrInfo::isVALU(MI))
896     return nullptr;
897 
898   const SIInstrInfo *TII = ST.getInstrInfo();
899 
900   unsigned Opcode = MI.getOpcode();
901 
902   // There are three different types of instructions
903   // which produce a forwarded dest: 1. SDWA with dst_sel != DWORD,
904   // 2. VOP3 which writes the hi bits (e.g. op_sel[3] == 1), and
905   // 3. CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with
906   // op_sel[3:2] != 0.
907   if (SIInstrInfo::isSDWA(MI)) {
908     // Type 1: SDWA with dst_sel != DWORD
909     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
910       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
911         return nullptr;
912   } else {
913     // Type 2 && Type 3: (VOP3 which writes the hi bits) || (CVT_SR_FP8_F32 and
914     // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
915     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
916         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
917               SISrcMods::DST_OP_SEL ||
918           (AMDGPU::isFP8DstSelInst(Opcode) &&
919            (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
920             SISrcMods::OP_SEL_0))))
921       return nullptr;
922   }
923 
924   return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
925 }
926 
927 /// Checks whether the provided \p MI "consumes" the operand with a dest sel
928 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
929 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
930 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
931                                             const MachineOperand *Dst,
932                                             const SIRegisterInfo *TRI) {
933   // We must consider implicit reads of the VALU. SDWA with dst_sel and
934   // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
935   // and we must account for that hazard.
936   // We also must account for WAW hazards. In particular, WAW with dest
937   // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
938   // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
939   // check for ECC. Without accounting for this hazard, the ECC will be
940   // wrong.
941   // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
942   // complete zeroesHigh16BitsOfDest)
943   for (auto &Operand : VALU->operands()) {
944     if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
945       return true;
946     }
947   }
948   return false;
949 }
950 
951 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
952   int WaitStatesNeeded = 0;
953 
954   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
955     const int TransDefWaitstates = 1;
956 
957     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
958       if (!SIInstrInfo::isTRANS(MI))
959         return false;
960       const SIRegisterInfo *TRI = ST.getRegisterInfo();
961       const SIInstrInfo *TII = ST.getInstrInfo();
962       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
963 
964       for (const MachineOperand &Use : VALU->explicit_uses()) {
965         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
966           return true;
967       }
968 
969       return false;
970     };
971 
972     int WaitStatesNeededForDef =
973         TransDefWaitstates -
974         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
975     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
976   }
977 
978   if (ST.hasDstSelForwardingHazard()) {
979     const int Shift16DefWaitstates = 1;
980 
981     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
982       const SIRegisterInfo *TRI = ST.getRegisterInfo();
983       const MachineOperand *ForwardedDst =
984           getDstSelForwardingOperand(ProducerMI, ST);
985       if (ForwardedDst) {
986         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
987       }
988 
989       if (ProducerMI.isInlineAsm()) {
990         // Assume inline asm has dst forwarding hazard
991         for (auto &Def : ProducerMI.all_defs()) {
992           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
993             return true;
994         }
995       }
996 
997       return false;
998     };
999 
1000     int WaitStatesNeededForDef =
1001         Shift16DefWaitstates -
1002         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1003     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1004   }
1005 
1006   if (ST.hasVDecCoExecHazard()) {
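    // Hazards where this VALU consumes an SGPR, VCC, EXEC, or (for
    // readlane/readfirstlane) a source VGPR that was recently written by
    // another VALU.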
1007     const int VALUWriteSGPRVALUReadWaitstates = 2;
1008     const int VALUWriteEXECRWLane = 4;
1009     const int VALUWriteVGPRReadlaneRead = 1;
1010 
1011     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1012     const MachineRegisterInfo &MRI = MF.getRegInfo();
1013     Register UseReg;
1014     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1015       if (!SIInstrInfo::isVALU(MI))
1016         return false;
1017       return MI.modifiesRegister(UseReg, TRI);
1018     };
1019 
1020     for (const MachineOperand &Use : VALU->explicit_uses()) {
1021       if (!Use.isReg())
1022         continue;
1023 
1024       UseReg = Use.getReg();
1025       if (TRI->isSGPRReg(MRI, UseReg)) {
1026         int WaitStatesNeededForDef =
1027             VALUWriteSGPRVALUReadWaitstates -
1028             getWaitStatesSince(IsVALUDefSGPRFn,
1029                                VALUWriteSGPRVALUReadWaitstates);
1030         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1031       }
1032     }
1033 
1034     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1035       UseReg = AMDGPU::VCC;
1036       int WaitStatesNeededForDef =
1037           VALUWriteSGPRVALUReadWaitstates -
1038           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1039       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1040     }
1041 
1042     switch (VALU->getOpcode()) {
1043     case AMDGPU::V_READLANE_B32:
1044     case AMDGPU::V_READFIRSTLANE_B32: {
1045       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1046       UseReg = Src->getReg();
1047       int WaitStatesNeededForDef =
1048           VALUWriteVGPRReadlaneRead -
1049           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1050       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1051     }
1052       [[fallthrough]];
1053     case AMDGPU::V_WRITELANE_B32: {
1054       UseReg = AMDGPU::EXEC;
1055       int WaitStatesNeededForDef =
1056           VALUWriteEXECRWLane -
1057           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1058       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1059       break;
1060     }
1061     default:
1062       break;
1063     }
1064   }
1065 
1066   // This checks for the hazard where VMEM instructions that store more than
1067   // 8 bytes can have their store data overwritten by the next instruction.
1068   if (!ST.has12DWordStoreHazard())
1069     return WaitStatesNeeded;
1070 
1071   const MachineRegisterInfo &MRI = MF.getRegInfo();
1072 
1073   for (const MachineOperand &Def : VALU->defs()) {
1074     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1075   }
1076 
1077   return WaitStatesNeeded;
1078 }
1079 
1080 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1081   // This checks for hazards associated with inline asm statements.
1082   // Since inline asms can contain just about anything, we use this
1083   // to call/leverage other check*Hazard routines. Note that
1084   // this function doesn't attempt to address all possible inline asm
1085   // hazards (good luck), but is a collection of what has been
1086   // problematic thus far.
1087 
1088   // see checkVALUHazards()
1089   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
1090     return 0;
1091 
1092   const MachineRegisterInfo &MRI = MF.getRegInfo();
1093   int WaitStatesNeeded = 0;
1094 
1095   for (const MachineOperand &Op :
1096        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1097     if (Op.isReg() && Op.isDef()) {
1098       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1099         continue;
1100 
1101       if (ST.has12DWordStoreHazard()) {
1102         WaitStatesNeeded =
1103             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1104       }
1105     }
1106   }
1107 
1108   if (ST.hasDstSelForwardingHazard()) {
1109     const int Shift16DefWaitstates = 1;
1110 
1111     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1112       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1113       // Assume inline asm reads the dst
1114       if (Dst)
1115         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1116                IA->readsRegister(Dst->getReg(), &TRI);
1117 
1118       if (ProducerMI.isInlineAsm()) {
1119         // If MI is inline asm, assume it has dst forwarding hazard
1120         for (auto &Def : ProducerMI.all_defs()) {
1121           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1122               IA->readsRegister(Def.getReg(), &TRI)) {
1123             return true;
1124           }
1125         }
1126       }
1127 
1128       return false;
1129     };
1130 
1131     int WaitStatesNeededForDef =
1132         Shift16DefWaitstates -
1133         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1134     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1135   }
1136 
1137   return WaitStatesNeeded;
1138 }
1139 
1140 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1141   const SIInstrInfo *TII = ST.getInstrInfo();
1142   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1143   const MachineRegisterInfo &MRI = MF.getRegInfo();
1144 
1145   const MachineOperand *LaneSelectOp =
1146       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1147 
1148   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1149     return 0;
1150 
1151   Register LaneSelectReg = LaneSelectOp->getReg();
1152   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1153 
1154   const int RWLaneWaitStates = 4;
1155   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1156                                               RWLaneWaitStates);
1157   return RWLaneWaitStates - WaitStatesSince;
1158 }
1159 
1160 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1161   if (!ST.hasRFEHazards())
1162     return 0;
1163 
1164   const SIInstrInfo *TII = ST.getInstrInfo();
1165 
1166   const int RFEWaitStates = 1;
1167 
1168   auto IsHazardFn = [TII](const MachineInstr &MI) {
1169     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1170   };
1171   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1172   return RFEWaitStates - WaitStatesNeeded;
1173 }
1174 
1175 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1176   const SIInstrInfo *TII = ST.getInstrInfo();
1177   const int ReadM0WaitStates = 1;
1178   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1179   return ReadM0WaitStates -
1180          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1181 }
1182 
1183 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1184   fixVMEMtoScalarWriteHazards(MI);
1185   fixVcmpxPermlaneHazards(MI);
1186   fixSMEMtoVectorWriteHazards(MI);
1187   fixVcmpxExecWARHazard(MI);
1188   fixLdsBranchVmemWARHazard(MI);
1189   if (ST.hasLdsDirect()) {
1190     fixLdsDirectVALUHazard(MI);
1191     fixLdsDirectVMEMHazard(MI);
1192   }
1193   fixVALUPartialForwardingHazard(MI);
1194   fixVALUTransUseHazard(MI);
1195   fixWMMAHazards(MI);
1196   fixShift64HighRegBug(MI);
1197   fixVALUMaskWriteHazard(MI);
1198   fixRequiredExportPriority(MI);
1199 }
1200 
1201 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1202   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1203     return false;
1204 
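  // Hazard: a compare writing EXEC (V_CMPX) followed by a V_PERMLANE* with no
  // intervening VALU other than V_NOP.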
1205   const SIInstrInfo *TII = ST.getInstrInfo();
1206   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1207   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1208     return (TII->isVOPC(MI) ||
1209             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1210            MI.modifiesRegister(AMDGPU::EXEC, TRI);
1211   };
1212 
1213   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1214     unsigned Opc = MI.getOpcode();
1215     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1216            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1217   };
1218 
1219   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1220       std::numeric_limits<int>::max())
1221     return false;
1222 
1223   // V_NOP will be discarded by SQ.
1224   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1225   // which is always a VGPR and available.
1226   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1227   Register Reg = Src0->getReg();
1228   bool IsUndef = Src0->isUndef();
1229   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1230           TII->get(AMDGPU::V_MOV_B32_e32))
1231     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1232     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1233 
1234   return true;
1235 }
1236 
1237 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1238   if (!ST.hasVMEMtoScalarWriteHazard())
1239     return false;
1240   assert(!ST.hasExtendedWaitCounts());
1241 
1242   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1243     return false;
1244 
1245   if (MI->getNumDefs() == 0)
1246     return false;
1247 
1248   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1249 
1250   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1251     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1252         !SIInstrInfo::isFLAT(I))
1253       return false;
1254 
1255     for (const MachineOperand &Def : MI->defs()) {
1256       const MachineOperand *Op =
1257           I.findRegisterUseOperand(Def.getReg(), TRI, false);
1258       if (!Op)
1259         continue;
1260       return true;
1261     }
1262     return false;
1263   };
1264 
1265   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1266     return SIInstrInfo::isVALU(MI) ||
1267            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1268             !MI.getOperand(0).getImm()) ||
1269            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1270             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1271   };
1272 
1273   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1274       std::numeric_limits<int>::max())
1275     return false;
1276 
1277   const SIInstrInfo *TII = ST.getInstrInfo();
1278   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1279           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1280       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1281   return true;
1282 }
1283 
1284 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1285   if (!ST.hasSMEMtoVectorWriteHazard())
1286     return false;
1287   assert(!ST.hasExtendedWaitCounts());
1288 
1289   if (!SIInstrInfo::isVALU(*MI))
1290     return false;
1291 
1292   unsigned SDSTName;
1293   switch (MI->getOpcode()) {
1294   case AMDGPU::V_READLANE_B32:
1295   case AMDGPU::V_READFIRSTLANE_B32:
1296     SDSTName = AMDGPU::OpName::vdst;
1297     break;
1298   default:
1299     SDSTName = AMDGPU::OpName::sdst;
1300     break;
1301   }
1302 
1303   const SIInstrInfo *TII = ST.getInstrInfo();
1304   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1305   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1306   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1307   if (!SDST) {
1308     for (const auto &MO : MI->implicit_operands()) {
1309       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1310         SDST = &MO;
1311         break;
1312       }
1313     }
1314   }
1315 
1316   if (!SDST)
1317     return false;
1318 
1319   const Register SDSTReg = SDST->getReg();
1320   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1321     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1322   };
1323 
1324   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1325     if (TII->isSALU(MI)) {
1326       switch (MI.getOpcode()) {
1327       case AMDGPU::S_SETVSKIP:
1328       case AMDGPU::S_VERSION:
1329       case AMDGPU::S_WAITCNT_VSCNT:
1330       case AMDGPU::S_WAITCNT_VMCNT:
1331       case AMDGPU::S_WAITCNT_EXPCNT:
1332         // These instructions cannot mitigate the hazard.
1333         return false;
1334       case AMDGPU::S_WAITCNT_LGKMCNT:
1335         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1336         return (MI.getOperand(1).getImm() == 0) &&
1337                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1338       case AMDGPU::S_WAITCNT: {
1339         const int64_t Imm = MI.getOperand(0).getImm();
1340         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1341         // DsCnt corresponds to LGKMCnt here.
1342         return (Decoded.DsCnt == 0);
1343       }
1344       default:
1345         // SOPP instructions cannot mitigate the hazard.
1346         if (TII->isSOPP(MI))
1347           return false;
1348         // At this point the SALU can be assumed to mitigate the hazard
1349         // because either:
1350         // (a) it is independent of the at risk SMEM (breaking chain),
1351         // or
1352         // (b) it is dependent on the SMEM, in which case an appropriate
1353         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1354         //     SMEM instruction.
1355         return true;
1356       }
1357     }
1358     return false;
1359   };
1360 
1361   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1362       std::numeric_limits<int>::max())
1363     return false;
1364 
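  // Mitigate by inserting an SALU that is independent of the at-risk SMEM
  // (see IsExpiredFn above): s_mov_b32 null, 0.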
1365   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1366           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1367       .addImm(0);
1368   return true;
1369 }
1370 
1371 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1372   if (!ST.hasVcmpxExecWARHazard())
1373     return false;
1374   assert(!ST.hasExtendedWaitCounts());
1375 
1376   if (!SIInstrInfo::isVALU(*MI))
1377     return false;
1378 
1379   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1380   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1381     return false;
1382 
1383   auto IsHazardFn = [TRI](const MachineInstr &I) {
1384     if (SIInstrInfo::isVALU(I))
1385       return false;
1386     return I.readsRegister(AMDGPU::EXEC, TRI);
1387   };
1388 
1389   const SIInstrInfo *TII = ST.getInstrInfo();
1390   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1391     if (SIInstrInfo::isVALU(MI)) {
1392       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1393         return true;
1394       for (auto MO : MI.implicit_operands())
1395         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1396           return true;
1397     }
1398     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1399         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1400       return true;
1401     return false;
1402   };
1403 
1404   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1405       std::numeric_limits<int>::max())
1406     return false;
1407 
1408   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1409           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1410       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1411   return true;
1412 }
1413 
1414 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1415                                                  const GCNSubtarget &ST) {
1416   if (!ST.hasLdsBranchVmemWARHazard())
1417     return false;
1418 
1419   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1420   // instructions need to appear in the same function.
1421   bool HasLds = false;
1422   bool HasVmem = false;
1423   for (auto &MBB : MF) {
1424     for (auto &MI : MBB) {
1425       HasLds |= SIInstrInfo::isDS(MI);
1426       HasVmem |=
1427           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1428       if (HasLds && HasVmem)
1429         return true;
1430     }
1431   }
1432   return false;
1433 }
1434 
1435 static bool isStoreCountWaitZero(const MachineInstr &I) {
1436   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1437          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1438          !I.getOperand(1).getImm();
1439 }
1440 
1441 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1442   if (!RunLdsBranchVmemWARHazardFixup)
1443     return false;
1444 
1445   assert(ST.hasLdsBranchVmemWARHazard());
1446   assert(!ST.hasExtendedWaitCounts());
1447 
1448   auto IsHazardInst = [](const MachineInstr &MI) {
1449     if (SIInstrInfo::isDS(MI))
1450       return 1;
1451     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1452       return 2;
1453     return 0;
1454   };
1455 
1456   auto InstType = IsHazardInst(*MI);
1457   if (!InstType)
1458     return false;
1459 
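  // The hazard exists if, scanning backwards from here, we reach a branch that
  // was itself preceded by a memory operation of the other kind (LDS vs. VMEM)
  // with no intervening s_waitcnt_vscnt null, 0.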
1460   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1461     return IsHazardInst(I) || isStoreCountWaitZero(I);
1462   };
1463 
1464   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1465     if (!I.isBranch())
1466       return false;
1467 
1468     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1469       auto InstType2 = IsHazardInst(I);
1470       return InstType2 && InstType != InstType2;
1471     };
1472 
1473     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1474       auto InstType2 = IsHazardInst(I);
1475       if (InstType == InstType2)
1476         return true;
1477 
1478       return isStoreCountWaitZero(I);
1479     };
1480 
1481     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1482            std::numeric_limits<int>::max();
1483   };
1484 
1485   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1486       std::numeric_limits<int>::max())
1487     return false;
1488 
1489   const SIInstrInfo *TII = ST.getInstrInfo();
1490   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1491           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1492     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1493     .addImm(0);
1494 
1495   return true;
1496 }
1497 
1498 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1499   if (!SIInstrInfo::isLDSDIR(*MI))
1500     return false;
1501 
1502   const int NoHazardWaitStates = 15;
1503   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1504   const Register VDSTReg = VDST->getReg();
1505 
1506   bool VisitedTrans = false;
1507   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1508     if (!SIInstrInfo::isVALU(I))
1509       return false;
1510     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1511     // Cover both WAR and WAW
1512     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1513   };
1514   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1515     if (WaitStates >= NoHazardWaitStates)
1516       return true;
1517     // Instructions which cause va_vdst==0 expire the hazard
1518     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1519            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1520   };
1521   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1522     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1523   };
1524 
1525   DenseSet<const MachineBasicBlock *> Visited;
1526   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1527                                     std::next(MI->getReverseIterator()), 0,
1528                                     IsExpiredFn, Visited, GetWaitStatesFn);
1529 
1530   // Transcendentals can execute in parallel to other VALUs.
1531   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1532   if (VisitedTrans)
1533     Count = 0;
1534 
1535   MachineOperand *WaitVdstOp =
1536       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1537   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1538 
1539   return true;
1540 }
1541 
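// Handle the LDSDIR vs. VMEM hazard: if an earlier VMEM/FLAT/DS instruction
// reads or writes the LDSDIR destination register, require vm_vsrc==0, either
// by clearing the LDSDIR waitvsrc field (when the subtarget supports it) or by
// inserting "s_waitcnt_depctr vm_vsrc(0)" before the LDSDIR.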
1542 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1543   if (!SIInstrInfo::isLDSDIR(*MI))
1544     return false;
1545 
1546   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1547   const Register VDSTReg = VDST->getReg();
1548 
1549   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1550     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1551         !SIInstrInfo::isDS(I))
1552       return false;
1553     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1554   };
1555   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1556   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1557   // according to the type of VMEM instruction.
1558   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1559     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1560            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1561            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1562             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1563            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1564             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1565   };
1566 
1567   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1568       std::numeric_limits<int>::max())
1569     return false;
1570 
1571   if (LdsdirCanWait) {
1572     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1573   } else {
1574     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1575             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1576         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1577   }
1578 
1579   return true;
1580 }
1581 
1582 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1583   if (!ST.hasVALUPartialForwardingHazard())
1584     return false;
1585   assert(!ST.hasExtendedWaitCounts());
1586 
1587   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1588     return false;
1589 
1590   SmallSetVector<Register, 4> SrcVGPRs;
1591 
1592   for (const MachineOperand &Use : MI->explicit_uses()) {
1593     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1594       SrcVGPRs.insert(Use.getReg());
1595   }
1596 
1597   // Only applies with >= 2 unique VGPR sources
1598   if (SrcVGPRs.size() <= 1)
1599     return false;
1600 
1601   // Look for the following pattern:
1602   //   Va <- VALU [PreExecPos]
1603   //   intv1
1604   //   Exec <- SALU [ExecPos]
1605   //   intv2
1606   //   Vb <- VALU [PostExecPos]
1607   //   intv3
1608   //   MI Va, Vb (WaitState = 0)
1609   //
1610   // Where:
1611   // intv1 + intv2 <= 2 VALUs
1612   // intv3 <= 4 VALUs
1613   //
1614   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
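  //
  // Illustrative (hypothetical) sequence matching the pattern:
  //   v_mov_b32 v0, 0           ; Va <- VALU
  //   s_mov_b64 exec, s[0:1]    ; Exec <- SALU
  //   v_mov_b32 v1, 1           ; Vb <- VALU
  //   v_add_f32 v2, v0, v1      ; MI reads both Va and Vb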
1615 
1616   const int Intv1plus2MaxVALUs = 2;
1617   const int Intv3MaxVALUs = 4;
1618   const int IntvMaxVALUs = 6;
1619   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1620 
1621   struct StateType {
1622     SmallDenseMap<Register, int, 4> DefPos;
1623     int ExecPos = std::numeric_limits<int>::max();
1624     int VALUs = 0;
1625   };
1626 
1627   StateType State;
1628 
1629   // This lambda combines all of the hazard detection with the expiry testing.
1630   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1631     // Too many VALU states have passed
1632     if (State.VALUs > NoHazardVALUWaitStates)
1633       return HazardExpired;
1634 
1635     // Instructions which cause va_vdst==0 expire the hazard
1636     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1637         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1638         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1639          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1640       return HazardExpired;
1641 
1642     // Track register writes
1643     bool Changed = false;
1644     if (SIInstrInfo::isVALU(I)) {
1645       for (Register Src : SrcVGPRs) {
1646         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1647           State.DefPos[Src] = State.VALUs;
1648           Changed = true;
1649         }
1650       }
1651     } else if (SIInstrInfo::isSALU(I)) {
1652       if (State.ExecPos == std::numeric_limits<int>::max()) {
1653         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1654           State.ExecPos = State.VALUs;
1655           Changed = true;
1656         }
1657       }
1658     }
1659 
1660     // Early expiration: too many VALUs in intv3
1661     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1662       return HazardExpired;
1663 
1664     // Only evaluate state if something changed
1665     if (!Changed)
1666       return NoHazardFound;
1667 
1668     // Determine positions of VALUs pre/post exec change
1669     if (State.ExecPos == std::numeric_limits<int>::max())
1670       return NoHazardFound;
1671 
1672     int PreExecPos = std::numeric_limits<int>::max();
1673     int PostExecPos = std::numeric_limits<int>::max();
1674 
1675     for (auto Entry : State.DefPos) {
1676       int DefVALUs = Entry.second;
1677       if (DefVALUs != std::numeric_limits<int>::max()) {
1678         if (DefVALUs >= State.ExecPos)
1679           PreExecPos = std::min(PreExecPos, DefVALUs);
1680         else
1681           PostExecPos = std::min(PostExecPos, DefVALUs);
1682       }
1683     }
1684 
1685     // Need a VALU def after the exec change
1686     if (PostExecPos == std::numeric_limits<int>::max())
1687       return NoHazardFound;
1688 
1689     // Too many VALUs in intv3?
1690     int Intv3VALUs = PostExecPos;
1691     if (Intv3VALUs > Intv3MaxVALUs)
1692       return HazardExpired;
1693 
1694     // Too many VALUs in intv2?
1695     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1696     if (Intv2VALUs > Intv1plus2MaxVALUs)
1697       return HazardExpired;
1698 
1699     // Need a VALU def before the exec change
1700     if (PreExecPos == std::numeric_limits<int>::max())
1701       return NoHazardFound;
1702 
1703     // Too many VALUs in intv1?
1704     int Intv1VALUs = PreExecPos - State.ExecPos;
1705     if (Intv1VALUs > Intv1plus2MaxVALUs)
1706       return HazardExpired;
1707 
1708     // Too many VALUs in intv1 + intv2?
1709     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1710       return HazardExpired;
1711 
1712     return HazardFound;
1713   };
1714   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1715     if (SIInstrInfo::isVALU(MI))
1716       State.VALUs += 1;
1717   };
1718 
1719   DenseSet<const MachineBasicBlock *> Visited;
1720   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1721                             std::next(MI->getReverseIterator()), Visited))
1722     return false;
1723 
1724   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1725           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1726       .addImm(0x0fff);
1727 
1728   return true;
1729 }
1730 
1731 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1732   if (!ST.hasVALUTransUseHazard())
1733     return false;
1734   assert(!ST.hasExtendedWaitCounts());
1735 
1736   if (!SIInstrInfo::isVALU(*MI))
1737     return false;
1738 
1739   SmallSet<Register, 4> SrcVGPRs;
1740 
1741   for (const MachineOperand &Use : MI->explicit_uses()) {
1742     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1743       SrcVGPRs.insert(Use.getReg());
1744   }
1745 
1746   // Look for the following pattern:
1747   //   Va <- TRANS VALU
1748   //   intv
1749   //   MI Va (WaitState = 0)
1750   //
1751   // Where:
1752   // intv <= 5 VALUs / 1 TRANS
1753   //
1754   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
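  //
  // Illustrative (hypothetical) sequence matching the pattern:
  //   v_exp_f32 v0, v1          ; Va <- TRANS VALU
  //   v_add_f32 v2, v0, v3      ; MI reads Va within the interval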
1755 
1756   const int IntvMaxVALUs = 5;
1757   const int IntvMaxTRANS = 1;
1758 
1759   struct StateType {
1760     int VALUs = 0;
1761     int TRANS = 0;
1762   };
1763 
1764   StateType State;
1765 
1766   // This lambda combines all of the hazard detection with the expiry testing.
1767   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1768     // Too many VALU states have passed
1769     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1770       return HazardExpired;
1771 
1772     // Instructions which cause va_vdst==0 expire the hazard
1773     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1774         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1775         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1776          I.getOperand(0).getImm() == 0x0fff))
1777       return HazardExpired;
1778 
1779     // Check whether a TRANS instruction writes one of the source VGPRs
1780     if (SIInstrInfo::isTRANS(I)) {
1781       for (Register Src : SrcVGPRs) {
1782         if (I.modifiesRegister(Src, &TRI)) {
1783           return HazardFound;
1784         }
1785       }
1786     }
1787 
1788     return NoHazardFound;
1789   };
1790   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1791     if (SIInstrInfo::isVALU(MI))
1792       State.VALUs += 1;
1793     if (SIInstrInfo::isTRANS(MI))
1794       State.TRANS += 1;
1795   };
1796 
1797   DenseSet<const MachineBasicBlock *> Visited;
1798   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1799                             std::next(MI->getReverseIterator()), Visited))
1800     return false;
1801 
1802   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1803   // hazard is avoided.
1804   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1805           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1806       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1807 
1808   return true;
1809 }
1810 
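// Insert a V_NOP between two WMMA/SWMMAC instructions when the second reads, as
// src0/src1 (or as the SWMMAC index on GFX12+), a register written as the
// destination of the first and no other VALU separates them.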
1811 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1812   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1813     return false;
1814 
1815   const SIInstrInfo *TII = ST.getInstrInfo();
1816   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1817 
1818   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1819     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1820       return false;
1821 
1822     // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
1823     // overlaps with the dest (matrix D) of the previous wmma.
1824     const Register CurSrc0Reg =
1825         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1826     const Register CurSrc1Reg =
1827         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1828 
1829     const Register PrevDstReg =
1830         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1831 
1832     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1833         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1834       return true;
1835     }
1836 
1837     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1838     // but Index can't overlap with PrevDstReg.
1839     if (AMDGPU::isGFX12Plus(ST)) {
1840       if (SIInstrInfo::isSWMMAC(*MI)) {
1841         const Register CurIndex =
1842             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1843         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1844           return true;
1845       }
1846       return false;
1847     }
1848 
1849     return false;
1850   };
1851 
1852   auto IsExpiredFn = [](const MachineInstr &I, int) {
1853     return SIInstrInfo::isVALU(I);
1854   };
1855 
1856   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1857       std::numeric_limits<int>::max())
1858     return false;
1859 
1860   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1861 
1862   return true;
1863 }
1864 
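// Work around the 64-bit shift bug: when the shift amount lives in the last
// VGPR of an allocation granule, swap it (via v_swap_b32) into a free VGPR or
// aligned VGPR pair around the shift and rewrite the shift's operands to match.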
1865 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1866   if (!ST.hasShift64HighRegBug())
1867     return false;
1868   assert(!ST.hasExtendedWaitCounts());
1869 
1870   switch (MI->getOpcode()) {
1871   default:
1872     return false;
1873   case AMDGPU::V_LSHLREV_B64_e64:
1874   case AMDGPU::V_LSHRREV_B64_e64:
1875   case AMDGPU::V_ASHRREV_I64_e64:
1876     break;
1877   }
1878 
1879   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1880   if (!Amt->isReg())
1881     return false;
1882 
1883   Register AmtReg = Amt->getReg();
1884   const MachineRegisterInfo &MRI = MF.getRegInfo();
1885   // Check if this is the last VGPR in the allocation block.
1886   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1887     return false;
1888 
1889   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1890     return false;
1891 
1892   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1893   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1894   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1895   bool Overlapped = OverlappedSrc || OverlappedDst;
1896 
1897   assert(!OverlappedDst || !OverlappedSrc ||
1898          Src1->getReg() == MI->getOperand(0).getReg());
1899   assert(ST.needsAlignedVGPRs());
1900   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1901 
1902   Register NewReg;
1903   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1904                                    : AMDGPU::VGPR_32RegClass) {
1905     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1906       NewReg = Reg;
1907       break;
1908     }
1909   }
1910 
1911   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1912                                : NewReg;
1913   Register NewAmtLo;
1914 
1915   if (Overlapped)
1916     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1917 
1918   DebugLoc DL = MI->getDebugLoc();
1919   MachineBasicBlock *MBB = MI->getParent();
1920   // Insert a full wait count because the found register may have a pending wait.
1921   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1922       .addImm(0);
1923 
1924   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1925   if (Overlapped)
1926     runOnInstruction(
1927         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1928             .addDef(AmtReg - 1)
1929             .addReg(AmtReg - 1, RegState::Undef)
1930             .addReg(NewAmtLo, RegState::Undef));
1931   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1932                        .addDef(AmtReg)
1933                        .addReg(AmtReg, RegState::Undef)
1934                        .addReg(NewAmt, RegState::Undef));
1935 
1936   // Instructions emitted after the current instruction will be processed by the
1937   // parent loop of the hazard recognizer in a natural way.
1938   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1939           AmtReg)
1940       .addDef(NewAmt)
1941       .addReg(NewAmt)
1942       .addReg(AmtReg);
1943   if (Overlapped)
1944     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1945             AmtReg - 1)
1946         .addDef(NewAmtLo)
1947         .addReg(NewAmtLo)
1948         .addReg(AmtReg - 1);
1949 
1950   // Re-running the hazard recognizer on the modified instruction is not
1951   // necessary; the inserted V_SWAP_B32s have already both read and written the
1952   // new registers, so hazards related to these registers have been handled.
1953   Amt->setReg(NewAmt);
1954   Amt->setIsKill(false);
1955   // We do not update liveness, so verifier may see it as undef.
1956   Amt->setIsUndef();
1957   if (OverlappedDst)
1958     MI->getOperand(0).setReg(NewReg);
1959   if (OverlappedSrc) {
1960     Src1->setReg(NewReg);
1961     Src1->setIsKill(false);
1962     Src1->setIsUndef();
1963   }
1964 
1965   return true;
1966 }
1967 
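// A GFX10 NSA-encoded MIMG instruction (encoded size >= 16 bytes) followed by a
// MUBUF/MTBUF whose offset has bit 1 or bit 2 set requires one wait state.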
1968 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1969   int NSAtoVMEMWaitStates = 1;
1970 
1971   if (!ST.hasNSAtoVMEMBug())
1972     return 0;
1973 
1974   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1975     return 0;
1976 
1977   const SIInstrInfo *TII = ST.getInstrInfo();
1978   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1979   if (!Offset || (Offset->getImm() & 6) == 0)
1980     return 0;
1981 
1982   auto IsHazardFn = [TII](const MachineInstr &I) {
1983     if (!SIInstrInfo::isMIMG(I))
1984       return false;
1985     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1986     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1987            TII->getInstSizeInBytes(I) >= 16;
1988   };
1989 
1990   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1991 }
1992 
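// An s_denorm_mode issued within three wait states of an FP atomic VMEM/FLAT
// instruction is a hazard unless a VALU or a wait instruction intervenes.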
1993 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1994   int FPAtomicToDenormModeWaitStates = 3;
1995 
1996   if (!ST.hasFPAtomicToDenormModeHazard())
1997     return 0;
1998   assert(!ST.hasExtendedWaitCounts());
1999 
2000   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2001     return 0;
2002 
2003   auto IsHazardFn = [](const MachineInstr &I) {
2004     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2005       return false;
2006     return SIInstrInfo::isFPAtomic(I);
2007   };
2008 
2009   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2010     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2011       return true;
2012 
2013     switch (MI.getOpcode()) {
2014     case AMDGPU::S_WAITCNT:
2015     case AMDGPU::S_WAITCNT_VSCNT:
2016     case AMDGPU::S_WAITCNT_VMCNT:
2017     case AMDGPU::S_WAITCNT_EXPCNT:
2018     case AMDGPU::S_WAITCNT_LGKMCNT:
2019     case AMDGPU::S_WAIT_IDLE:
2020       return true;
2021     default:
2022       break;
2023     }
2024 
2025     return false;
2026   };
2027 
2028   return FPAtomicToDenormModeWaitStates -
2029          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2030 }
2031 
2032 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2033   assert(SIInstrInfo::isMAI(*MI));
2034 
2035   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2036 }
2037 
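// Optionally pad back-to-back MFMAs with extra wait states: the requested
// percentage (amdgpu-mfma-padding-ratio) of the previous MFMA's pipeline
// latency, minus the wait states that have already elapsed. For example, a
// ratio of 50 fills half of the neighboring MFMA's latency with s_nops.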
2038 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2039   // Early exit if no padding is requested.
2040   if (MFMAPaddingRatio == 0)
2041     return 0;
2042 
2043   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2044   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2045     return 0;
2046 
2047   int NeighborMFMALatency = 0;
2048   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2049                             this](const MachineInstr &MI) {
2050     if (!SIInstrInfo::isMFMA(MI))
2051       return false;
2052 
2053     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2054     return true;
2055   };
2056 
2057   const int MaxMFMAPipelineWaitStates = 16;
2058   int WaitStatesSinceNeighborMFMA =
2059       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2060 
2061   int NeighborMFMAPaddingNeeded =
2062       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2063       WaitStatesSinceNeighborMFMA;
2064 
2065   return std::max(0, NeighborMFMAPaddingNeeded);
2066 }
2067 
2068 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2069   int WaitStatesNeeded = 0;
2070   unsigned Opc = MI->getOpcode();
2071 
2072   auto IsVALUFn = [](const MachineInstr &MI) {
2073     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2074   };
2075 
2076   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2077     const int LegacyVALUWritesVGPRWaitStates = 2;
2078     const int VALUWritesExecWaitStates = 4;
2079     const int MaxWaitStates = 4;
2080 
2081     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2082       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2083     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2084 
2085     if (WaitStatesNeeded < MaxWaitStates) {
2086       for (const MachineOperand &Use : MI->explicit_uses()) {
2087         const int MaxWaitStates = 2;
2088 
2089         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2090           continue;
2091 
2092         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2093           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2094         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2095 
2096         if (WaitStatesNeeded == MaxWaitStates)
2097           break;
2098       }
2099     }
2100   }
2101 
2102   for (const MachineOperand &Op : MI->explicit_operands()) {
2103     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2104       continue;
2105 
2106     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2107       continue;
2108 
2109     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2110     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2111     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2112     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2113     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2114     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2115     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2116     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2117     const int MaxWaitStates = 18;
2118     Register Reg = Op.getReg();
2119     unsigned HazardDefLatency = 0;
2120 
2121     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2122                                this](const MachineInstr &MI) {
2123       if (!SIInstrInfo::isMFMA(MI))
2124         return false;
2125       Register DstReg = MI.getOperand(0).getReg();
2126       if (DstReg == Reg)
2127         return false;
2128       HazardDefLatency =
2129           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2130       return TRI.regsOverlap(DstReg, Reg);
2131     };
2132 
2133     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2134                                                    MaxWaitStates);
2135     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2136     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2137     int OpNo = Op.getOperandNo();
2138     if (OpNo == SrcCIdx) {
2139       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2140     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2141       switch (HazardDefLatency) {
2142       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2143                break;
2144       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2145                break;
2146       case 16: [[fallthrough]];
2147       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2148                break;
2149       }
2150     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2151       switch (HazardDefLatency) {
2152       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2153                break;
2154       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2155                break;
2156       case 16: [[fallthrough]];
2157       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2158                break;
2159       }
2160     }
2161 
2162     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2163     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2164 
2165     if (WaitStatesNeeded == MaxWaitStates)
2166       return WaitStatesNeeded; // Early exit.
2167 
2168     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2169       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2170         return false;
2171       Register DstReg = MI.getOperand(0).getReg();
2172       return TRI.regsOverlap(Reg, DstReg);
2173     };
2174 
2175     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2176     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2177     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2178     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2179     if (OpNo == SrcCIdx)
2180       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2181     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2182       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2183 
2184     WaitStatesNeededForUse = NeedWaitStates -
2185       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2186     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2187 
2188     if (WaitStatesNeeded == MaxWaitStates)
2189       return WaitStatesNeeded; // Early exit.
2190   }
2191 
2192   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2193     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2194     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2195     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2196     const int MaxWaitStates = 13;
2197     Register DstReg = MI->getOperand(0).getReg();
2198     unsigned HazardDefLatency = 0;
2199 
2200     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2201                          this](const MachineInstr &MI) {
2202       if (!SIInstrInfo::isMFMA(MI))
2203         return false;
2204       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2205       HazardDefLatency =
2206           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2207       return TRI.regsOverlap(Reg, DstReg);
2208     };
2209 
2210     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2211     int NeedWaitStates;
2212     switch (HazardDefLatency) {
2213     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2214              break;
2215     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2216              break;
2217     case 16: [[fallthrough]];
2218     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2219              break;
2220     }
2221 
2222     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2223     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2224   }
2225 
2226   // Pad neighboring MFMA with noops for better inter-wave performance.
2227   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2228 
2229   return WaitStatesNeeded;
2230 }
2231 
2232 static int
2233 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2234   // 2 pass -> 3
2235   // 4 pass -> 5
2236   // 8 pass -> 9
2237   // 16 pass -> 17
2238   return NumPasses + 1;
2239 }
2240 
2241 static int
2242 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2243   // 2 pass -> 2
2244   // 4 pass -> 4
2245   // 8 pass -> 8
2246   // 16 pass -> 16
2247   return NumPasses;
2248 }
2249 
2250 static int
2251 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2252   // 2 pass -> 4
2253   // 4 pass -> 6
2254   // 8 pass -> 10
2255   // 16 pass -> 18
2256   return NumPasses + 2;
2257 }
2258 
2259 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2260   // 2 pass -> 5
2261   // 4 pass -> 7
2262   // 8 pass -> 11
2263   // 16 pass -> 19
2264   return NumPasses + 3;
2265 }
2266 
2267 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2268   int WaitStatesNeeded = 0;
2269   unsigned Opc = MI->getOpcode();
2270 
2271   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2272     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2273   };
2274 
2275   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2276     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2277            !SIInstrInfo::isDOT(MI);
2278   };
2279 
2280   if (!SIInstrInfo::isMFMA(*MI))
2281     return WaitStatesNeeded;
2282 
2283   const int VALUWritesExecWaitStates = 4;
2284   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2285     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2286                           VALUWritesExecWaitStates);
2287   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2288 
2289   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2290 
2291   // Loop over the uses; handles both DGEMM and S/HGEMM as the 2nd instruction.
2292   for (const MachineOperand &Use : MI->explicit_uses()) {
2293     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2294     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2295     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2296     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2297     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2298     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2299     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2300     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2301     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2302     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2303     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2304     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2305     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2306     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2307     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2308     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2309     const int MaxWaitStates = 19;
2310 
2311     if (!Use.isReg())
2312       continue;
2313     Register Reg = Use.getReg();
2314     bool FullReg;
2315     const MachineInstr *MI1;
2316 
2317     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2318                                this](const MachineInstr &MI) {
2319       if (!SIInstrInfo::isMFMA(MI))
2320         return false;
2321       Register DstReg = MI.getOperand(0).getReg();
2322       FullReg = (DstReg == Reg);
2323       MI1 = &MI;
2324       return TRI.regsOverlap(DstReg, Reg);
2325     };
2326 
2327     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2328       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2329     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2330 
2331     int NumWaitStates =
2332         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2333     if (NumWaitStates == std::numeric_limits<int>::max())
2334       continue;
2335 
2336     int OpNo = Use.getOperandNo();
2337     unsigned Opc1 = MI1->getOpcode();
2338     int NeedWaitStates = 0;
2339     if (OpNo == SrcCIdx) {
2340       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2341         NeedWaitStates = 0;
2342       } else if (FullReg) {
2343         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2344              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2345             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2346              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2347           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2348         else if (ST.hasGFX940Insts() &&
2349                  TSchedModel.computeInstrLatency(MI1) == 2)
2350           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2351       } else {
2352         switch (Opc1) {
2353         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2354         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2355         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2356         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2357           if (!isXDL(ST, *MI))
2358             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2359           break;
2360         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2361         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2362           if (!isXDL(ST, *MI))
2363             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2364           break;
2365         default:
2366           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2367           if (ST.hasGFX940Insts()) {
2368             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2369               break;
2370 
2371             NeedWaitStates =
2372                 isXDL(ST, *MI1)
2373                     ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2374                           NumPasses)
2375                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2376                           NumPasses);
2377             break;
2378           }
2379 
2380           switch (NumPasses) {
2381           case 2:
2382             NeedWaitStates =
2383                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2384                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2385             break;
2386           case 8:
2387             NeedWaitStates =
2388                 isDGEMM(Opc)
2389                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2390                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2391             break;
2392           case 16:
2393             NeedWaitStates =
2394                 isDGEMM(Opc)
2395                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2396                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2397             break;
2398           default:
2399             llvm_unreachable("unexpected number of passes");
2400           }
2401         }
2402       }
2403     } else {
2404       switch (Opc1) {
2405       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2406       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2407       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2408       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2409         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2410         break;
2411       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2412       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2413         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2414         break;
2415       default:
2416         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2417 
2418         if (ST.hasGFX940Insts()) {
2419           NeedWaitStates =
2420               isXDL(ST, *MI1)
2421                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2422                         NumPasses)
2423                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2424                         NumPasses);
2425           break;
2426         }
2427 
2428         switch (NumPasses) {
2429         case 2:
2430           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2431           break;
2432         case 4:
2433           llvm_unreachable("unexpected number of passes for mfma");
2434         case 8:
2435           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2436           break;
2437         case 16:
2438         default:
2439           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2440         }
2441       }
2442     }
2443     if (WaitStatesNeeded >= NeedWaitStates)
2444       continue;
2445 
2446     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2447     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2448 
2449     if (WaitStatesNeeded == MaxWaitStates)
2450       break;
2451   }
2452 
2453   // Pad neighboring MFMA with noops for better inter-wave performance.
2454   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2455 
2456   return WaitStatesNeeded;
2457 }
2458 
2459 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2460   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2461   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2462     return 0;
2463 
2464   int WaitStatesNeeded = 0;
2465 
2466   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2467     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2468   };
2469 
2470   for (const MachineOperand &Op : MI->explicit_uses()) {
2471     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2472       continue;
2473 
2474     Register Reg = Op.getReg();
2475 
2476     const int AccVgprReadLdStWaitStates = 2;
2477     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2478     const int MaxWaitStates = 2;
2479 
2480     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2481       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2482     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2483 
2484     if (WaitStatesNeeded == MaxWaitStates)
2485       return WaitStatesNeeded; // Early exit.
2486 
2487     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2488       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2489           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2490         return false;
2491       auto IsVALUFn = [](const MachineInstr &MI) {
2492         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2493       };
2494       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2495              std::numeric_limits<int>::max();
2496     };
2497 
2498     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2499       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2500     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2501   }
2502 
2503   return WaitStatesNeeded;
2504 }
2505 
2506 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2507   // 2 pass -> 4
2508   // 4 pass -> 6
2509   // 8 pass -> 10
2510   // 16 pass -> 18
2511   return NumPasses + 2;
2512 }
2513 
2514 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2515   // 2 pass -> 5
2516   // 4 pass -> 7
2517   // 8 pass -> 11
2518   // 16 pass -> 19
2519   return NumPasses + 3;
2520 }
2521 
2522 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2523   // 2 pass -> 5
2524   // 4 pass -> 7
2525   // 8 pass -> 11
2526   // 16 pass -> 19
2527   return NumPasses + 3;
2528 }
2529 
2530 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2531   // 2 pass -> 4
2532   // 4 pass -> 6
2533   // 8 pass -> 10
2534   // 16 pass -> 18
2535   return NumPasses + 2;
2536 }
2537 
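// gfx90a+: check hazards between MFMA/DOT VGPR writes (and MFMA src2 reads) and
// surrounding VALU, VMEM/FLAT/DS and export instructions that read, rewrite or
// overwrite the same registers.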
2538 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2539   if (!ST.hasGFX90AInsts())
2540     return 0;
2541 
2542   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2543     return isDGEMM(MI.getOpcode());
2544   };
2545 
2546   // This is checked in checkMAIHazards90A()
2547   if (SIInstrInfo::isMFMA(*MI))
2548     return 0;
2549 
2550   const MachineRegisterInfo &MRI = MF.getRegInfo();
2551 
2552   int WaitStatesNeeded = 0;
2553 
2554   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2555                SIInstrInfo::isFLAT(*MI) ||
2556                SIInstrInfo::isDS(*MI);
2557   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2558   bool IsVALU = SIInstrInfo::isVALU(*MI);
2559 
2560   const MachineInstr *MFMA = nullptr;
2561   unsigned Reg;
2562   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2563     if (!SIInstrInfo::isMFMA(MI) ||
2564         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2565       return false;
2566     MFMA = &MI;
2567     return true;
2568   };
2569 
2570   const MachineInstr *DOT = nullptr;
2571   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2572     if (!SIInstrInfo::isDOT(MI) ||
2573         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2574       return false;
2575     DOT = &MI;
2576     return true;
2577   };
2578 
2579   bool DGEMMAfterVALUWrite = false;
2580   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2581     // Found DGEMM on reverse traversal to def.
2582     if (isDGEMM(MI.getOpcode()))
2583       DGEMMAfterVALUWrite = true;
2584 
2585     // Only a hazard if the register is defined by a VALU and a DGEMM is found
2586     // after the def.
2587     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2588       return false;
2589 
2590     return true;
2591   };
2592 
2593   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2594                                            AMDGPU::OpName::src2);
2595 
2596   if (IsMemOrExport || IsVALU) {
2597     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2598     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2599     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2600     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2601     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2602     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2603     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2604     const int DotWriteSameDotReadSrcAB = 3;
2605     const int DotWriteDifferentVALURead = 3;
2606     const int DMFMABetweenVALUWriteVMEMRead = 2;
2607     const int MaxWaitStates = 19;
2608 
2609     for (const MachineOperand &Use : MI->explicit_uses()) {
2610       if (!Use.isReg())
2611         continue;
2612       Reg = Use.getReg();
2613 
2614       DOT = nullptr;
2615       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2616                                                      MaxWaitStates);
2617       if (DOT) {
2618         int NeedWaitStates = 0;
2619         if (DOT->getOpcode() == MI->getOpcode()) {
2620           if (&Use - &MI->getOperand(0) != SrcCIdx)
2621             NeedWaitStates = DotWriteSameDotReadSrcAB;
2622         } else {
2623           NeedWaitStates = DotWriteDifferentVALURead;
2624         }
2625 
2626         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2627         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2628       }
2629 
2630       // Workaround for a HW data hazard bug observed only on GFX90A. When a
2631       // DGEMM instruction sits in-between a VALU and a VMEM instruction, the SQ
2632       // incorrectly fails to insert the two wait states needed between the two
2633       // instructions to avoid the data hazard.
2634       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2635         DGEMMAfterVALUWrite = false;
2636         if (TRI.isVectorRegister(MRI, Reg)) {
2637           int WaitStatesNeededForUse =
2638                 DMFMABetweenVALUWriteVMEMRead -
2639                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2640                                       DMFMABetweenVALUWriteVMEMRead);
2641 
2642           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2643         }
2644       }
2645 
2646       MFMA = nullptr;
2647       WaitStatesSinceDef =
2648           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2649       if (!MFMA)
2650         continue;
2651 
2652       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2653       int NumPasses = HazardDefLatency;
2654       int NeedWaitStates = MaxWaitStates;
2655 
2656       if (isDGEMM(MFMA->getOpcode())) {
2657         switch (HazardDefLatency) {
2658         case 4:
2659           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2660                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2661           break;
2662         case 8:
2663         case 16:
2664           NeedWaitStates = IsMemOrExport
2665                                ? DMFMA16x16WriteVgprMemExpReadWaitStates
2666                                : DMFMA16x16WriteVgprVALUReadWaitStates;
2667           break;
2668         default:
2669           llvm_unreachable("unexpected dgemm");
2670         }
2671       } else if (ST.hasGFX940Insts()) {
2672         NeedWaitStates =
2673             isXDL(ST, *MFMA)
2674                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2675                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2676                       NumPasses);
2677       } else {
2678         switch (HazardDefLatency) {
2679         case 2:
2680           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2681           break;
2682         case 8:
2683           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2684           break;
2685         case 16:
2686           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2687           break;
2688         default:
2689           llvm_unreachable("unexpected number of passes for mfma");
2690         }
2691       }
2692 
2693       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2694       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2695 
2696       if (WaitStatesNeeded == MaxWaitStates)
2697         break;
2698     }
2699   }
2700 
2701   unsigned Opc = MI->getOpcode();
2702   const int DMFMAToFMA64WaitStates = 2;
2703   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2704        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2705        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2706       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2707     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2708       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2709     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2710   }
2711 
2712   if (!IsVALU && !IsMemOrExport)
2713     return WaitStatesNeeded;
2714 
2715   for (const MachineOperand &Def : MI->defs()) {
2716     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2717     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2718     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2719     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2720     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2721     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2722     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2723     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2724     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2725     const int DotWriteDifferentVALUWrite = 3;
2726     const int MaxWaitStates = 19;
2727     const int MaxWarWaitStates = 15;
2728 
2729     Reg = Def.getReg();
2730 
2731     DOT = nullptr;
2732     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2733                                                    MaxWaitStates);
2734     if (DOT && DOT->getOpcode() != MI->getOpcode())
2735       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2736                                                     WaitStatesSinceDef);
2737 
2738     MFMA = nullptr;
2739     WaitStatesSinceDef =
2740         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2741     if (MFMA) {
2742       int NeedWaitStates = MaxWaitStates;
2743       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2744 
2745       if (isDGEMM(MFMA->getOpcode())) {
2746         switch (NumPasses) {
2747         case 4:
2748           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2749           break;
2750         case 8:
2751         case 16:
2752           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2753           break;
2754         default:
2755           llvm_unreachable("unexpected number of cycles for dgemm");
2756         }
2757       } else if (ST.hasGFX940Insts()) {
2758         NeedWaitStates =
2759             isXDL(ST, *MFMA)
2760                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2761                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2762       } else {
2763         switch (NumPasses) {
2764         case 2:
2765           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2766           break;
2767         case 8:
2768           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2769           break;
2770         case 16:
2771           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2772           break;
2773         default:
2774           llvm_unreachable("Unexpected number of passes for mfma");
2775         }
2776       }
2777 
2778       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2779       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2780 
2781       if (WaitStatesNeeded == MaxWaitStates)
2782         break;
2783     }
2784 
2785     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2786       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2787           !MI.readsRegister(Reg, &TRI))
2788         return false;
2789 
2790       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2791         return false;
2792 
2793       const MachineOperand *SrcC =
2794           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2795       assert(SrcC);
2796       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2797         return false;
2798 
2799       MFMA = &MI;
2800       return true;
2801     };
2802 
2803     MFMA = nullptr;
2804     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2805                                                 MaxWarWaitStates);
2806     if (!MFMA)
2807       continue;
2808 
2809     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2810     int NeedWaitStates = MaxWaitStates;
2811     switch (HazardDefLatency) {
2812     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2813              break;
2814     case 4:  assert(ST.hasGFX940Insts());
2815              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2816              break;
2817     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2818              break;
2819     case 16: [[fallthrough]];
2820     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2821              break;
2822     }
2823 
2824     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2825     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2826   }
2827 
2828   return WaitStatesNeeded;
2829 }
2830 
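// Advise the scheduler to prefer another candidate when this SUnit is an MFMA
// that would issue within the latency shadow of a preceding MFMA.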
2831 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2832   if (!SU->isInstr())
2833     return false;
2834 
2835   const MachineInstr *MAI = nullptr;
2836 
2837   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2838     MAI = nullptr;
2839     if (SIInstrInfo::isMFMA(MI))
2840       MAI = &MI;
2841     return MAI != nullptr;
2842   };
2843 
2844   MachineInstr *MI = SU->getInstr();
2845   if (IsMFMAFn(*MI)) {
2846     int W = getWaitStatesSince(IsMFMAFn, 16);
2847     if (MAI)
2848       return W < (int)TSchedModel.computeInstrLatency(MAI);
2849   }
2850 
2851   return false;
2852 }
2853 
2854 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2855   if (!ST.hasVALUMaskWriteHazard())
2856     return false;
2857   assert(!ST.hasExtendedWaitCounts());
2858 
2859   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2860     return false;
2861 
2862   // The hazard sequence is three instructions:
2863   //   1. VALU reads SGPR as mask
2864   //   2. SALU writes SGPR
2865   //   3. SALU reads SGPR
2866   // The hazard can expire if the distance between 2 and 3 is sufficient.
2867   // In practice this happens <10% of the time, hence this always assumes
2868   // the hazard exists if 1 and 2 are present to avoid searching.
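  //
  // Hypothetical example of the sequence (the fix adds
  // "s_waitcnt_depctr sa_sdst(0)" after instruction 2):
  //   v_cndmask_b32_e64 v0, v1, v2, s[0:1]     ; 1. VALU reads s[0:1] as mask
  //   s_mov_b64         s[0:1], s[2:3]         ; 2. SALU writes s[0:1]  (MI)
  //   s_and_b64         s[4:5], s[0:1], s[6:7] ; 3. SALU reads s[0:1]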
2869 
2870   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2871   if (!SDSTOp || !SDSTOp->isReg())
2872     return false;
2873 
2874   const Register HazardReg = SDSTOp->getReg();
2875   if (HazardReg == AMDGPU::EXEC ||
2876       HazardReg == AMDGPU::EXEC_LO ||
2877       HazardReg == AMDGPU::EXEC_HI ||
2878       HazardReg == AMDGPU::M0)
2879     return false;
2880 
2881   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2882     switch (I.getOpcode()) {
2883     case AMDGPU::V_ADDC_U32_e32:
2884     case AMDGPU::V_ADDC_U32_dpp:
2885     case AMDGPU::V_CNDMASK_B16_e32:
2886     case AMDGPU::V_CNDMASK_B16_dpp:
2887     case AMDGPU::V_CNDMASK_B32_e32:
2888     case AMDGPU::V_CNDMASK_B32_dpp:
2889     case AMDGPU::V_DIV_FMAS_F32_e64:
2890     case AMDGPU::V_DIV_FMAS_F64_e64:
2891     case AMDGPU::V_SUBB_U32_e32:
2892     case AMDGPU::V_SUBB_U32_dpp:
2893     case AMDGPU::V_SUBBREV_U32_e32:
2894     case AMDGPU::V_SUBBREV_U32_dpp:
2895       // These implicitly read VCC as mask source.
2896       return HazardReg == AMDGPU::VCC ||
2897              HazardReg == AMDGPU::VCC_LO ||
2898              HazardReg == AMDGPU::VCC_HI;
2899     case AMDGPU::V_ADDC_U32_e64:
2900     case AMDGPU::V_ADDC_U32_e64_dpp:
2901     case AMDGPU::V_CNDMASK_B16_e64:
2902     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2903     case AMDGPU::V_CNDMASK_B32_e64:
2904     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2905     case AMDGPU::V_SUBB_U32_e64:
2906     case AMDGPU::V_SUBB_U32_e64_dpp:
2907     case AMDGPU::V_SUBBREV_U32_e64:
2908     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2909       // Only check mask register overlaps.
2910       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2911       assert(SSRCOp);
2912       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2913     }
2914     default:
2915       return false;
2916     }
2917   };
2918 
2919   const MachineRegisterInfo &MRI = MF.getRegInfo();
2920   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2921     // s_waitcnt_depctr sa_sdst(0) mitigates the hazard.
2922     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2923         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2924       return true;
2925 
2926     // VALU access to any SGPR or literal constant other than HazardReg
2927     // mitigates the hazard. No need to check HazardReg here as this will
2928     // only be called when !IsHazardFn.
2929     if (!SIInstrInfo::isVALU(I))
2930       return false;
2931     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2932       const MachineOperand &Op = I.getOperand(OpNo);
2933       if (Op.isReg()) {
2934         Register OpReg = Op.getReg();
2935         // Only consider uses
2936         if (!Op.isUse())
2937           continue;
2938         // Ignore EXEC
2939         if (OpReg == AMDGPU::EXEC ||
2940             OpReg == AMDGPU::EXEC_LO ||
2941             OpReg == AMDGPU::EXEC_HI)
2942           continue;
2943         // Ignore all implicit uses except VCC
2944         if (Op.isImplicit()) {
2945           if (OpReg == AMDGPU::VCC ||
2946               OpReg == AMDGPU::VCC_LO ||
2947               OpReg == AMDGPU::VCC_HI)
2948             return true;
2949           continue;
2950         }
2951         if (TRI.isSGPRReg(MRI, OpReg))
2952           return true;
2953       } else {
2954         const MCInstrDesc &InstDesc = I.getDesc();
2955         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2956         if (!TII.isInlineConstant(Op, OpInfo))
2957           return true;
2958       }
2959     }
2960     return false;
2961   };
2962 
2963   // Check for hazard
2964   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2965       std::numeric_limits<int>::max())
2966     return false;
2967 
2968   auto NextMI = std::next(MI->getIterator());
2969 
2970   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2971   BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2972           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2973       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2974 
2975   // SALU write may be s_getpc in a bundle.
2976   if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2977     // Update offsets of any references in the bundle.
2978     while (NextMI != MI->getParent()->end() &&
2979            NextMI->isBundledWithPred()) {
2980       for (auto &Operand : NextMI->operands()) {
2981         if (Operand.isGlobal())
2982           Operand.setOffset(Operand.getOffset() + 4);
2983       }
2984       NextMI++;
2985     }
2986   }
2987 
2988   return true;
2989 }
2990 
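// Make sure the entry block begins with an S_SETPRIO of at least Priority;
// returns true if a new S_SETPRIO was inserted.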
2991 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
2992                                const SIInstrInfo &TII) {
2993   MachineBasicBlock &EntryMBB = MF->front();
2994   if (EntryMBB.begin() != EntryMBB.end()) {
2995     auto &EntryMI = *EntryMBB.begin();
2996     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
2997         EntryMI.getOperand(0).getImm() >= Priority)
2998       return false;
2999   }
3000 
3001   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3002       .addImm(Priority);
3003   return true;
3004 }
3005 
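     // Workaround for subtargets with the required-export-priority feature:
     // make sure shaders that contain exports run at NormalPriority, and emit
     // the post-export priority drop / wait / restore sequence after each run
     // of exports. Returns true if any instructions were added or modified.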
3006 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3007   if (!ST.hasRequiredExportPriority())
3008     return false;
3009 
3010   // Assume the following shader types will never have exports,
3011   // and avoid adding or adjusting S_SETPRIO.
3012   MachineBasicBlock *MBB = MI->getParent();
3013   MachineFunction *MF = MBB->getParent();
3014   auto CC = MF->getFunction().getCallingConv();
3015   switch (CC) {
3016   case CallingConv::AMDGPU_CS:
3017   case CallingConv::AMDGPU_CS_Chain:
3018   case CallingConv::AMDGPU_CS_ChainPreserve:
3019   case CallingConv::AMDGPU_KERNEL:
3020     return false;
3021   default:
3022     break;
3023   }
3024 
3025   const int MaxPriority = 3;
3026   const int NormalPriority = 2;
3027   const int PostExportPriority = 0;
3028 
3029   auto It = MI->getIterator();
3030   switch (MI->getOpcode()) {
3031   case AMDGPU::S_ENDPGM:
3032   case AMDGPU::S_ENDPGM_SAVED:
3033   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3034   case AMDGPU::SI_RETURN_TO_EPILOG:
3035     // A shader that makes calls must raise priority at entry so that the
3036     // priority is already correct if a callee contains exports.
3037     if (MF->getFrameInfo().hasCalls())
3038       return ensureEntrySetPrio(MF, NormalPriority, TII);
3039     return false;
3040   case AMDGPU::S_SETPRIO: {
3041     // Raise the priority unless this S_SETPRIO is the workaround's post-export drop.
3042     auto &PrioOp = MI->getOperand(0);
3043     int Prio = PrioOp.getImm();
3044     bool InWA = (Prio == PostExportPriority) &&
3045                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3046     if (InWA || Prio >= NormalPriority)
3047       return false;
3048     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3049     return true;
3050   }
3051   default:
3052     if (!TII.isEXP(*MI))
3053       return false;
3054     break;
3055   }
3056 
3057   // Check entry priority at each export (as there will only be a few).
3058   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3059   bool Changed = false;
3060   if (CC != CallingConv::AMDGPU_Gfx)
3061     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3062 
3063   auto NextMI = std::next(It);
3064   bool EndOfShader = false;
3065   if (NextMI != MBB->end()) {
3066     // Only need WA at end of sequence of exports.
3067     if (TII.isEXP(*NextMI))
3068       return Changed;
3069     // Assume appropriate S_SETPRIO after export means WA already applied.
3070     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3071         NextMI->getOperand(0).getImm() == PostExportPriority)
3072       return Changed;
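         // If the next instruction ends the program, the export-completion wait
         // and the priority restore below can be skipped.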
3073     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3074   }
3075 
3076   const DebugLoc &DL = MI->getDebugLoc();
3077 
3078   // Lower priority.
3079   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3080       .addImm(PostExportPriority);
3081 
3082   if (!EndOfShader) {
3083     // Wait for exports to complete.
3084     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3085         .addReg(AMDGPU::SGPR_NULL)
3086         .addImm(0);
3087   }
3088 
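       // Two NOPs are emitted between the priority drop and the restore as part
       // of the workaround sequence (presumably to give the lowered priority
       // time to take effect before execution continues).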
3089   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3090   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3091 
3092   if (!EndOfShader) {
3093     // Return to normal (higher) priority.
3094     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3095         .addImm(NormalPriority);
3096   }
3097 
3098   return true;
3099 }
3100