xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 3db4f5b0daa33903e6522e2bf1b07c45edb5c8ab)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/ScheduleDAG.h"
21 #include "llvm/TargetParser/TargetParser.h"
22 
23 using namespace llvm;
24 
25 namespace {
26 
27 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29 
30   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31     if (Arg.getAsInteger(0, Value))
32       return O.error("'" + Arg + "' value invalid for uint argument!");
33 
34     if (Value > 100)
35       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36 
37     return false;
38   }
39 };
40 
41 } // end anonymous namespace
42 
43 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45                      cl::desc("Fill a percentage of the latency between "
46                               "neighboring MFMA with s_nops."));
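// For example, passing -amdgpu-mfma-padding-ratio=75 requests s_nop padding
// covering roughly 75% of the latency between neighboring MFMAs (a sketch of
// intended usage, not a tuning recommendation).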
47 
48 static cl::opt<unsigned> MaxExhaustiveHazardSearch(
49     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50     cl::desc("Maximum function size for exhaustive hazard search"));
51 
52 //===----------------------------------------------------------------------===//
53 // Hazard Recognizer Implementation
54 //===----------------------------------------------------------------------===//
55 
56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57                                                  const GCNSubtarget &ST);
58 
59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60     : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61       ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62       TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63       UseVALUReadHazardExhaustiveSearch(false),
64       ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
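  // A larger look-ahead is needed when AGPRs are in use, presumably because
  // MFMA-related hazards can require looking back further than the default
  // five wait states.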
65   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67 }
68 
69 void GCNHazardRecognizer::Reset() {
70   EmittedInstrs.clear();
71 }
72 
73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74   EmitInstruction(SU->getInstr());
75 }
76 
77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78   CurrCycleInstr = MI;
79 }
80 
81 static bool isDivFMas(unsigned Opcode) {
82   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83 }
84 
85 static bool isSGetReg(unsigned Opcode) {
86   return Opcode == AMDGPU::S_GETREG_B32;
87 }
88 
89 static bool isSSetReg(unsigned Opcode) {
90   switch (Opcode) {
91   case AMDGPU::S_SETREG_B32:
92   case AMDGPU::S_SETREG_B32_mode:
93   case AMDGPU::S_SETREG_IMM32_B32:
94   case AMDGPU::S_SETREG_IMM32_B32_mode:
95     return true;
96   }
97   return false;
98 }
99 
100 static bool isRWLane(unsigned Opcode) {
101   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102 }
103 
104 static bool isRFE(unsigned Opcode) {
105   return Opcode == AMDGPU::S_RFE_B64;
106 }
107 
108 static bool isSMovRel(unsigned Opcode) {
109   switch (Opcode) {
110   case AMDGPU::S_MOVRELS_B32:
111   case AMDGPU::S_MOVRELS_B64:
112   case AMDGPU::S_MOVRELD_B32:
113   case AMDGPU::S_MOVRELD_B64:
114     return true;
115   default:
116     return false;
117   }
118 }
119 
120 static bool isDGEMM(unsigned Opcode) {
121   return AMDGPU::getMAIIsDGEMM(Opcode);
122 }
123 
124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125   unsigned Opcode = MI.getOpcode();
126 
127   if (!SIInstrInfo::isMAI(MI) ||
128       isDGEMM(Opcode) ||
129       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131     return false;
132 
133   if (!ST.hasGFX940Insts())
134     return true;
135 
136   return AMDGPU::getMAIIsGFX940XDL(Opcode);
137 }
138 
139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140                                     const MachineInstr &MI) {
141   if (TII.isAlwaysGDS(MI.getOpcode()))
142     return true;
143 
144   switch (MI.getOpcode()) {
145   case AMDGPU::S_SENDMSG:
146   case AMDGPU::S_SENDMSGHALT:
147   case AMDGPU::S_TTRACEDATA:
148     return true;
149   // These DS opcodes don't support GDS.
150   case AMDGPU::DS_NOP:
151   case AMDGPU::DS_PERMUTE_B32:
152   case AMDGPU::DS_BPERMUTE_B32:
153     return false;
154   default:
155     if (TII.isDS(MI.getOpcode())) {
156       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157                                            AMDGPU::OpName::gds);
158       if (MI.getOperand(GDS).getImm())
159         return true;
160     }
161     return false;
162   }
163 }
164 
165 static bool isPermlane(const MachineInstr &MI) {
166   unsigned Opcode = MI.getOpcode();
167   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168          Opcode == AMDGPU::V_PERMLANE64_B32 ||
169          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
172 }
173 
174 static bool isLdsDma(const MachineInstr &MI) {
175   return SIInstrInfo::isVALU(MI) &&
176          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
177 }
178 
179 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
180   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
181                                                      AMDGPU::OpName::simm16);
182   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
183 }
184 
185 ScheduleHazardRecognizer::HazardType
186 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
187   MachineInstr *MI = SU->getInstr();
188   // If we are not in "HazardRecognizerMode" then we are being run from the
189   // scheduler; track possible stalls from hazards but don't insert noops.
190   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
191 
192   if (MI->isBundle())
193     return NoHazard;
194 
195   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
196     return HazardType;
197 
198   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
199     return HazardType;
200 
201   if (checkFPAtomicToDenormModeHazard(MI) > 0)
202     return HazardType;
203 
204   if (ST.hasNoDataDepHazard())
205     return NoHazard;
206 
207   // FIXME: Should flat be considered vmem?
208   if ((SIInstrInfo::isVMEM(*MI) ||
209        SIInstrInfo::isFLAT(*MI))
210       && checkVMEMHazards(MI) > 0)
211     return HazardType;
212 
213   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
214     return HazardType;
215 
216   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
217     return HazardType;
218 
219   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
220     return HazardType;
221 
222   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
223     return HazardType;
224 
225   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
226        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
227        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
228     return HazardType;
229 
230   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
231     return HazardType;
232 
233   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
234     return HazardType;
235 
236   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
237     return HazardType;
238 
239   if (((ST.hasReadM0MovRelInterpHazard() &&
240         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
241          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
242          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
243        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
244        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
245        (ST.hasReadM0LdsDirectHazard() &&
246         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
247       checkReadM0Hazards(MI) > 0)
248     return HazardType;
249 
250   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
251     return HazardType;
252 
253   if ((SIInstrInfo::isVMEM(*MI) ||
254        SIInstrInfo::isFLAT(*MI) ||
255        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
256     return HazardType;
257 
258   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
259     return HazardType;
260 
261   return NoHazard;
262 }
263 
264 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
265                                 unsigned Quantity) {
266   while (Quantity > 0) {
267     unsigned Arg = std::min(Quantity, 8u);
268     Quantity -= Arg;
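    // S_NOP's immediate encodes the wait-state count minus one, so an
    // immediate of Arg - 1 yields Arg wait states (at most 8 per S_NOP).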
269     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
270         .addImm(Arg - 1);
271   }
272 }
273 
274 unsigned
275 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
276   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
277   assert(TSchedModel.getWriteProcResBegin(SC) !=
278          TSchedModel.getWriteProcResEnd(SC));
279   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
280 }
281 
282 void GCNHazardRecognizer::processBundle() {
283   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
284   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
285   // Check bundled MachineInstrs for hazards.
286   for (; MI != E && MI->isInsideBundle(); ++MI) {
287     CurrCycleInstr = &*MI;
288     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
289 
290     if (IsHazardRecognizerMode) {
291       fixHazards(CurrCycleInstr);
292 
293       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
294     }
295 
296   // It's unnecessary to track more than MaxLookAhead instructions. Since we
297     // include the bundled MI directly after, only add a maximum of
298     // (MaxLookAhead - 1) noops to EmittedInstrs.
299     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
300       EmittedInstrs.push_front(nullptr);
301 
302     EmittedInstrs.push_front(CurrCycleInstr);
303     EmittedInstrs.resize(MaxLookAhead);
304   }
305   CurrCycleInstr = nullptr;
306 }
307 
308 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
309   assert(IsHazardRecognizerMode);
310 
311   unsigned NumPreNoops = PreEmitNoops(MI);
312   EmitNoops(NumPreNoops);
313   if (MI->isInsideBundle())
314     insertNoopsInBundle(MI, TII, NumPreNoops);
315   else
316     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
317                     NumPreNoops);
318   EmitInstruction(MI);
319   AdvanceCycle();
320 }
321 
322 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
323   IsHazardRecognizerMode = true;
324   CurrCycleInstr = MI;
325   unsigned W = PreEmitNoopsCommon(MI);
326   fixHazards(MI);
327   CurrCycleInstr = nullptr;
328   return W;
329 }
330 
331 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
332   if (MI->isBundle())
333     return 0;
334 
335   int WaitStates = 0;
336 
337   if (SIInstrInfo::isSMRD(*MI))
338     return std::max(WaitStates, checkSMRDHazards(MI));
339 
340   if (ST.hasNSAtoVMEMBug())
341     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
342 
343   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
344 
345   if (ST.hasNoDataDepHazard())
346     return WaitStates;
347 
348   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
349     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
350 
351   if (SIInstrInfo::isVALU(*MI))
352     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
353 
354   if (SIInstrInfo::isDPP(*MI))
355     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
356 
357   if (isDivFMas(MI->getOpcode()))
358     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
359 
360   if (isRWLane(MI->getOpcode()))
361     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
362 
363   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
364        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
365        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
366     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
367 
368   if (MI->isInlineAsm())
369     return std::max(WaitStates, checkInlineAsmHazards(MI));
370 
371   if (isSGetReg(MI->getOpcode()))
372     return std::max(WaitStates, checkGetRegHazards(MI));
373 
374   if (isSSetReg(MI->getOpcode()))
375     return std::max(WaitStates, checkSetRegHazards(MI));
376 
377   if (isRFE(MI->getOpcode()))
378     return std::max(WaitStates, checkRFEHazards(MI));
379 
380   if ((ST.hasReadM0MovRelInterpHazard() &&
381        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
382         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
383         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
384       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
385       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
386       (ST.hasReadM0LdsDirectHazard() &&
387        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
388     return std::max(WaitStates, checkReadM0Hazards(MI));
389 
390   if (SIInstrInfo::isMAI(*MI))
391     return std::max(WaitStates, checkMAIHazards(MI));
392 
393   if (SIInstrInfo::isVMEM(*MI) ||
394       SIInstrInfo::isFLAT(*MI) ||
395       SIInstrInfo::isDS(*MI))
396     return std::max(WaitStates, checkMAILdStHazards(MI));
397 
398   return WaitStates;
399 }
400 
401 void GCNHazardRecognizer::EmitNoop() {
402   EmittedInstrs.push_front(nullptr);
403 }
404 
405 void GCNHazardRecognizer::AdvanceCycle() {
406   // When the scheduler detects a stall, it will call AdvanceCycle() without
407   // emitting any instructions.
408   if (!CurrCycleInstr) {
409     EmittedInstrs.push_front(nullptr);
410     return;
411   }
412 
413   if (CurrCycleInstr->isBundle()) {
414     processBundle();
415     return;
416   }
417 
418   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
419   if (!NumWaitStates) {
420     CurrCycleInstr = nullptr;
421     return;
422   }
423 
424   // Keep track of emitted instructions
425   EmittedInstrs.push_front(CurrCycleInstr);
426 
427   // Add a nullptr for each additional wait state after the first.  Make sure
428   // not to add more than getMaxLookAhead() items to the list, since we
429   // truncate the list to that size right after this loop.
430   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
431        i < e; ++i) {
432     EmittedInstrs.push_front(nullptr);
433   }
434 
435   // getMaxLookAhead() is the largest number of wait states we will ever need
436   // to insert, so there is no point in keeping track of more than that many
437   // wait states.
438   EmittedInstrs.resize(getMaxLookAhead());
439 
440   CurrCycleInstr = nullptr;
441 }
442 
443 void GCNHazardRecognizer::RecedeCycle() {
444   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
445 }
446 
447 //===----------------------------------------------------------------------===//
448 // Helper Functions
449 //===----------------------------------------------------------------------===//
450 
451 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
452 
453 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
454 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
455 
456 // Search for a hazard in a block and its predecessors.
457 template <typename StateT>
458 static bool
459 hasHazard(StateT State,
460           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
461           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
462           const MachineBasicBlock *MBB,
463           MachineBasicBlock::const_reverse_instr_iterator I,
464           DenseSet<const MachineBasicBlock *> &Visited) {
465   for (auto E = MBB->instr_rend(); I != E; ++I) {
466     // No need to look at parent BUNDLE instructions.
467     if (I->isBundle())
468       continue;
469 
470     switch (IsHazard(State, *I)) {
471     case HazardFound:
472       return true;
473     case HazardExpired:
474       return false;
475     default:
476       // Continue search
477       break;
478     }
479 
480     if (I->isInlineAsm() || I->isMetaInstruction())
481       continue;
482 
483     UpdateState(State, *I);
484   }
485 
486   for (MachineBasicBlock *Pred : MBB->predecessors()) {
487     if (!Visited.insert(Pred).second)
488       continue;
489 
490     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
491                   Visited))
492       return true;
493   }
494 
495   return false;
496 }
497 
498 // Returns the minimum number of wait states since \p I, walking all
499 // predecessors, or std::numeric_limits<int>::max() once \p IsExpired
500 // returns true. Can only be run in hazard recognizer mode.
501 static int getWaitStatesSince(
502     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
503     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
504     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
505     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
506   for (auto E = MBB->instr_rend(); I != E; ++I) {
507     // Don't add WaitStates for parent BUNDLE instructions.
508     if (I->isBundle())
509       continue;
510 
511     if (IsHazard(*I))
512       return WaitStates;
513 
514     if (I->isInlineAsm())
515       continue;
516 
517     WaitStates += GetNumWaitStates(*I);
518 
519     if (IsExpired(*I, WaitStates))
520       return std::numeric_limits<int>::max();
521   }
522 
523   int MinWaitStates = std::numeric_limits<int>::max();
524   for (MachineBasicBlock *Pred : MBB->predecessors()) {
525     if (!Visited.insert(Pred).second)
526       continue;
527 
528     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
529                                IsExpired, Visited, GetNumWaitStates);
530 
531     MinWaitStates = std::min(MinWaitStates, W);
532   }
533 
534   return MinWaitStates;
535 }
536 
537 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
538                               const MachineInstr *MI, IsExpiredFn IsExpired) {
539   DenseSet<const MachineBasicBlock *> Visited;
540   return getWaitStatesSince(IsHazard, MI->getParent(),
541                             std::next(MI->getReverseIterator()),
542                             0, IsExpired, Visited);
543 }
544 
545 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
546   if (IsHazardRecognizerMode) {
547     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
548       return WaitStates >= Limit;
549     };
550     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
551   }
552 
553   int WaitStates = 0;
554   for (MachineInstr *MI : EmittedInstrs) {
555     if (MI) {
556       if (IsHazard(*MI))
557         return WaitStates;
558 
559       if (MI->isInlineAsm())
560         continue;
561     }
562     ++WaitStates;
563 
564     if (WaitStates >= Limit)
565       break;
566   }
567   return std::numeric_limits<int>::max();
568 }
569 
570 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
571                                                IsHazardFn IsHazardDef,
572                                                int Limit) {
573   const SIRegisterInfo *TRI = ST.getRegisterInfo();
574 
575   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
576     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
577   };
578 
579   return getWaitStatesSince(IsHazardFn, Limit);
580 }
581 
582 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
583                                                   int Limit) {
584   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
585     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
586   };
587 
588   return getWaitStatesSince(IsHazardFn, Limit);
589 }
590 
591 //===----------------------------------------------------------------------===//
592 // No-op Hazard Detection
593 //===----------------------------------------------------------------------===//
594 
595 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
596                         MCRegister Reg) {
597   for (MCRegUnit Unit : TRI.regunits(Reg))
598     BV.set(Unit);
599 }
600 
601 static void addRegsToSet(const SIRegisterInfo &TRI,
602                          iterator_range<MachineInstr::const_mop_iterator> Ops,
603                          BitVector &DefSet, BitVector &UseSet) {
604   for (const MachineOperand &Op : Ops) {
605     if (Op.isReg())
606       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
607   }
608 }
609 
610 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
611   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
612 }
613 
614 static bool breaksSMEMSoftClause(MachineInstr *MI) {
615   return !SIInstrInfo::isSMRD(*MI);
616 }
617 
618 static bool breaksVMEMSoftClause(MachineInstr *MI) {
619   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
620 }
621 
622 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
623   // SMEM soft clauses are only present on VI+, and only matter if xnack is
624   // enabled.
625   if (!ST.isXNACKEnabled())
626     return 0;
627 
628   bool IsSMRD = TII.isSMRD(*MEM);
629 
630   resetClause();
631 
632   // A soft-clause is any group of consecutive SMEM instructions.  The
633   // instructions in this group may return out of order and/or may be
634   // replayed (i.e. the same instruction issued more than once).
635   //
636   // In order to handle these situations correctly we need to make sure that
637   // when a clause has more than one instruction, no instruction in the clause
638   // writes to a register that is read by another instruction in the clause
639   // (including itself). If we encounter this situation, we need to break the
640   // clause by inserting a non-SMEM instruction.
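  // For example (hypothetical clause), the second load below reads the base
  // registers written by the first, so a hazard is reported unless the clause
  // is broken:
  //   s_load_dwordx2 s[0:1], s[4:5], 0x0
  //   s_load_dword   s2, s[0:1], 0x0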
641 
642   for (MachineInstr *MI : EmittedInstrs) {
643     // When we hit a non-SMEM instruction then we have passed the start of the
644     // clause and we can stop.
645     if (!MI)
646       break;
647 
648     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
649       break;
650 
651     addClauseInst(*MI);
652   }
653 
654   if (ClauseDefs.none())
655     return 0;
656 
657   // We need to make sure not to put loads and stores in the same clause if they
658   // use the same address. For now, just start a new clause whenever we see a
659   // store.
660   if (MEM->mayStore())
661     return 1;
662 
663   addClauseInst(*MEM);
664 
665   // If the set of defs and uses intersect then we cannot add this instruction
666   // to the clause, so we have a hazard.
667   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
668 }
669 
670 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
671   int WaitStatesNeeded = 0;
672 
673   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
674 
675   // This SMRD hazard only affects SI.
676   if (!ST.hasSMRDReadVALUDefHazard())
677     return WaitStatesNeeded;
678 
679   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
680   // SGPR was written by a VALU instruction.
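  // For example (hypothetical sequence), s4 is written by a VALU and then read
  // as part of the SMRD base address:
  //   v_readlane_b32 s4, v0, 0
  //   s_load_dword   s6, s[4:5], 0x0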
681   int SmrdSgprWaitStates = 4;
682   auto IsHazardDefFn = [this](const MachineInstr &MI) {
683     return TII.isVALU(MI);
684   };
685   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
686     return TII.isSALU(MI);
687   };
688 
689   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
690 
691   for (const MachineOperand &Use : SMRD->uses()) {
692     if (!Use.isReg())
693       continue;
694     int WaitStatesNeededForUse =
695         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
696                                                    SmrdSgprWaitStates);
697     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
698 
699     // This fixes what appears to be undocumented hardware behavior in SI where
700     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
701     // needs some number of nops in between. We don't know how many we need, but
702     // let's use 4. This wasn't discovered before probably because the only
703     // case when this happens is when we expand a 64-bit pointer into a full
704     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
705     // probably never encountered in the closed-source land.
706     if (IsBufferSMRD) {
707       int WaitStatesNeededForUse =
708         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
709                                                    IsBufferHazardDefFn,
710                                                    SmrdSgprWaitStates);
711       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
712     }
713   }
714 
715   return WaitStatesNeeded;
716 }
717 
718 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
719   if (!ST.hasVMEMReadSGPRVALUDefHazard())
720     return 0;
721 
722   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
723 
724   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
725   // SGPR was written by a VALU instruction.
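  // For example (hypothetical sequence), s4 is written by a VALU and then read
  // as the soffset operand of the buffer load:
  //   v_readlane_b32    s4, v1, 0
  //   buffer_load_dword v0, v2, s[8:11], s4 offen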
726   const int VmemSgprWaitStates = 5;
727   auto IsHazardDefFn = [this](const MachineInstr &MI) {
728     return TII.isVALU(MI);
729   };
730   for (const MachineOperand &Use : VMEM->uses()) {
731     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
732       continue;
733 
734     int WaitStatesNeededForUse =
735         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
736                                                    VmemSgprWaitStates);
737     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
738   }
739   return WaitStatesNeeded;
740 }
741 
742 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
743   const SIRegisterInfo *TRI = ST.getRegisterInfo();
744   const SIInstrInfo *TII = ST.getInstrInfo();
745 
746   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
747   int DppVgprWaitStates = 2;
748   int DppExecWaitStates = 5;
749   int WaitStatesNeeded = 0;
750   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
751     return TII->isVALU(MI);
752   };
753 
754   for (const MachineOperand &Use : DPP->uses()) {
755     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
756       continue;
757     int WaitStatesNeededForUse =
758         DppVgprWaitStates - getWaitStatesSinceDef(
759                                 Use.getReg(),
760                                 [](const MachineInstr &) { return true; },
761                                 DppVgprWaitStates);
762     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
763   }
764 
765   WaitStatesNeeded = std::max(
766       WaitStatesNeeded,
767       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
768                                                 DppExecWaitStates));
769 
770   return WaitStatesNeeded;
771 }
772 
773 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
774   const SIInstrInfo *TII = ST.getInstrInfo();
775 
776   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
777   // instruction.
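  // For example (hypothetical sequence), v_div_scale writes vcc and v_div_fmas
  // implicitly reads it:
  //   v_div_scale_f32 v2, vcc, v0, v1, v0
  //   v_div_fmas_f32  v3, v2, v4, v5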
778   const int DivFMasWaitStates = 4;
779   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
780     return TII->isVALU(MI);
781   };
782   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
783                                                DivFMasWaitStates);
784 
785   return DivFMasWaitStates - WaitStatesNeeded;
786 }
787 
788 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
789   const SIInstrInfo *TII = ST.getInstrInfo();
790   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
791 
792   const int GetRegWaitStates = 2;
793   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
794     return GetRegHWReg == getHWReg(TII, MI);
795   };
796   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
797 
798   return GetRegWaitStates - WaitStatesNeeded;
799 }
800 
801 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
802   const SIInstrInfo *TII = ST.getInstrInfo();
803   unsigned HWReg = getHWReg(TII, *SetRegInstr);
804 
805   const int SetRegWaitStates = ST.getSetRegWaitStates();
806   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
807     return HWReg == getHWReg(TII, MI);
808   };
809   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
810   return SetRegWaitStates - WaitStatesNeeded;
811 }
812 
813 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
814   if (!MI.mayStore())
815     return -1;
816 
817   const SIInstrInfo *TII = ST.getInstrInfo();
818   unsigned Opcode = MI.getOpcode();
819   const MCInstrDesc &Desc = MI.getDesc();
820 
821   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
822   int VDataRCID = -1;
823   if (VDataIdx != -1)
824     VDataRCID = Desc.operands()[VDataIdx].RegClass;
825 
826   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
827     // There is no hazard if the instruction does not use vector regs
828     // (like wbinvl1)
829     if (VDataIdx == -1)
830       return -1;
831     // For MUBUF/MTBUF instructions this hazard only exists if the
832     // instruction is not using a register in the soffset field.
833     const MachineOperand *SOffset =
834         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
835     // If we have no soffset operand, then assume this field has been
836     // hardcoded to zero.
837     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
838         (!SOffset || !SOffset->isReg()))
839       return VDataIdx;
840   }
841 
842   // MIMG instructions create a hazard if they don't use a 256-bit T# and
843   // the store size is greater than 8 bytes and they have more than two bits
844   // of their dmask set.
845   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
846   if (TII->isMIMG(MI)) {
847     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
848     assert(SRsrcIdx != -1 &&
849            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
850     (void)SRsrcIdx;
851   }
852 
853   if (TII->isFLAT(MI)) {
854     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
855     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
856       return DataIdx;
857   }
858 
859   return -1;
860 }
861 
862 int
863 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
864                                             const MachineRegisterInfo &MRI) {
865   // Helper to check for the hazard where VMEM instructions that store more than
866   // 8 bytes can have their store data overwritten by the next instruction.
867   const SIRegisterInfo *TRI = ST.getRegisterInfo();
868 
869   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
870   int WaitStatesNeeded = 0;
871 
872   if (!TRI->isVectorRegister(MRI, Def.getReg()))
873     return WaitStatesNeeded;
874   Register Reg = Def.getReg();
875   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
876     int DataIdx = createsVALUHazard(MI);
877     return DataIdx >= 0 &&
878            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
879   };
880 
881   int WaitStatesNeededForDef =
882     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
883   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
884 
885   return WaitStatesNeeded;
886 }
887 
888 /// A dest sel forwarding issue occurs if additional logic is needed to
889 /// swizzle / pack the computed value into the correct bit position of the
890 /// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
891 /// have op_sel with dst_sel that is not aligned to the register. This
892 /// function analyzes \p MI and \returns an operand with a dst forwarding
893 /// issue, or nullptr if none exists.
894 static const MachineOperand *
895 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
896   if (!SIInstrInfo::isVALU(MI))
897     return nullptr;
898 
899   const SIInstrInfo *TII = ST.getInstrInfo();
900 
901   unsigned Opcode = MI.getOpcode();
902 
903   // There are three different types of instructions which produce a
904   // forwarded dest:
905   //   1. SDWA with dst_sel != DWORD,
906   //   2. VOP3 which writes the hi bits (e.g. op_sel[3] == 1), and
907   //   3. CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2] != 0.
908   if (SIInstrInfo::isSDWA(MI)) {
909     // Type 1: SDWA with dst_sel != DWORD
910     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
911       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
912         return nullptr;
913   } else {
914     // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
915     // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
916     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
917         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
918               SISrcMods::DST_OP_SEL ||
919           (AMDGPU::isFP8DstSelInst(Opcode) &&
920            (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
921             SISrcMods::OP_SEL_0))))
922       return nullptr;
923   }
924 
925   return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
926 }
927 
928 /// Checks whether the provided \p VALU "consumes" the operand with a dest sel
929 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
930 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
931 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
932                                             const MachineOperand *Dst,
933                                             const SIRegisterInfo *TRI) {
934   // We must consider implicit reads of the VALU. SDWA with dst_sel and
935   // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
936   // and we must account for that hazard.
937   // We also must account for WAW hazards. In particular, WAW with dest
938   // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
939   // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
940   // check for ECC. Without accounting for this hazard, the ECC will be
941   // wrong.
942   // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
943   // complete zeroesHigh16BitsOfDest)
944   for (auto &Operand : VALU->operands()) {
945     if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
946       return true;
947     }
948   }
949   return false;
950 }
951 
952 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
953   int WaitStatesNeeded = 0;
954 
955   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
956     const int TransDefWaitstates = 1;
957 
958     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
959       if (!SIInstrInfo::isTRANS(MI))
960         return false;
961       const SIRegisterInfo *TRI = ST.getRegisterInfo();
962       const SIInstrInfo *TII = ST.getInstrInfo();
963       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
964 
965       for (const MachineOperand &Use : VALU->explicit_uses()) {
966         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
967           return true;
968       }
969 
970       return false;
971     };
972 
973     int WaitStatesNeededForDef =
974         TransDefWaitstates -
975         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
976     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
977   }
978 
979   if (ST.hasDstSelForwardingHazard()) {
980     const int Shift16DefWaitstates = 1;
981 
982     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
983       const SIRegisterInfo *TRI = ST.getRegisterInfo();
984       const MachineOperand *ForwardedDst =
985           getDstSelForwardingOperand(ProducerMI, ST);
986       if (ForwardedDst) {
987         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
988       }
989 
990       if (ProducerMI.isInlineAsm()) {
991         // Assume inline asm has dst forwarding hazard
992         for (auto &Def : ProducerMI.all_defs()) {
993           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
994             return true;
995         }
996       }
997 
998       return false;
999     };
1000 
1001     int WaitStatesNeededForDef =
1002         Shift16DefWaitstates -
1003         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1004     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1005   }
1006 
1007   if (ST.hasVDecCoExecHazard()) {
1008     const int VALUWriteSGPRVALUReadWaitstates = 2;
1009     const int VALUWriteEXECRWLane = 4;
1010     const int VALUWriteVGPRReadlaneRead = 1;
1011 
1012     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1013     const MachineRegisterInfo &MRI = MF.getRegInfo();
1014     Register UseReg;
1015     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1016       if (!SIInstrInfo::isVALU(MI))
1017         return false;
1018       return MI.modifiesRegister(UseReg, TRI);
1019     };
1020 
1021     for (const MachineOperand &Use : VALU->explicit_uses()) {
1022       if (!Use.isReg())
1023         continue;
1024 
1025       UseReg = Use.getReg();
1026       if (TRI->isSGPRReg(MRI, UseReg)) {
1027         int WaitStatesNeededForDef =
1028             VALUWriteSGPRVALUReadWaitstates -
1029             getWaitStatesSince(IsVALUDefSGPRFn,
1030                                VALUWriteSGPRVALUReadWaitstates);
1031         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1032       }
1033     }
1034 
1035     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1036       UseReg = AMDGPU::VCC;
1037       int WaitStatesNeededForDef =
1038           VALUWriteSGPRVALUReadWaitstates -
1039           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1040       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1041     }
1042 
1043     switch (VALU->getOpcode()) {
1044     case AMDGPU::V_READLANE_B32:
1045     case AMDGPU::V_READFIRSTLANE_B32: {
1046       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1047       UseReg = Src->getReg();
1048       int WaitStatesNeededForDef =
1049           VALUWriteVGPRReadlaneRead -
1050           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1051       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1052     }
1053       [[fallthrough]];
1054     case AMDGPU::V_WRITELANE_B32: {
1055       UseReg = AMDGPU::EXEC;
1056       int WaitStatesNeededForDef =
1057           VALUWriteEXECRWLane -
1058           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1059       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1060       break;
1061     }
1062     default:
1063       break;
1064     }
1065   }
1066 
1067   // This checks for the hazard where VMEM instructions that store more than
1068   // 8 bytes can have their store data overwritten by the next instruction.
1069   if (!ST.has12DWordStoreHazard())
1070     return WaitStatesNeeded;
1071 
1072   const MachineRegisterInfo &MRI = MF.getRegInfo();
1073 
1074   for (const MachineOperand &Def : VALU->defs()) {
1075     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1076   }
1077 
1078   return WaitStatesNeeded;
1079 }
1080 
1081 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1082   // This checks for hazards associated with inline asm statements.
1083   // Since inline asms can contain just about anything, we use this
1084   // to call/leverage other check*Hazard routines. Note that
1085   // this function doesn't attempt to address all possible inline asm
1086   // hazards (good luck), but is a collection of what has been
1087   // problematic thus far.
1088 
1089   // see checkVALUHazards()
1090   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
1091     return 0;
1092 
1093   const MachineRegisterInfo &MRI = MF.getRegInfo();
1094   int WaitStatesNeeded = 0;
1095 
1096   for (const MachineOperand &Op :
1097        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1098     if (Op.isReg() && Op.isDef()) {
1099       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1100         continue;
1101 
1102       if (ST.has12DWordStoreHazard()) {
1103         WaitStatesNeeded =
1104             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1105       }
1106     }
1107   }
1108 
1109   if (ST.hasDstSelForwardingHazard()) {
1110     const int Shift16DefWaitstates = 1;
1111 
1112     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1113       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1114       // Assume inline asm reads the dst
1115       if (Dst)
1116         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1117                IA->readsRegister(Dst->getReg(), &TRI);
1118 
1119       if (ProducerMI.isInlineAsm()) {
1120         // If MI is inline asm, assume it has dst forwarding hazard
1121         for (auto &Def : ProducerMI.all_defs()) {
1122           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1123               IA->readsRegister(Def.getReg(), &TRI)) {
1124             return true;
1125           }
1126         }
1127       }
1128 
1129       return false;
1130     };
1131 
1132     int WaitStatesNeededForDef =
1133         Shift16DefWaitstates -
1134         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1135     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1136   }
1137 
1138   return WaitStatesNeeded;
1139 }
1140 
1141 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1142   const SIInstrInfo *TII = ST.getInstrInfo();
1143   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1144   const MachineRegisterInfo &MRI = MF.getRegInfo();
1145 
1146   const MachineOperand *LaneSelectOp =
1147       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1148 
1149   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1150     return 0;
1151 
1152   Register LaneSelectReg = LaneSelectOp->getReg();
1153   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1154 
1155   const int RWLaneWaitStates = 4;
1156   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1157                                               RWLaneWaitStates);
1158   return RWLaneWaitStates - WaitStatesSince;
1159 }
1160 
1161 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1162   if (!ST.hasRFEHazards())
1163     return 0;
1164 
1165   const SIInstrInfo *TII = ST.getInstrInfo();
1166 
1167   const int RFEWaitStates = 1;
1168 
1169   auto IsHazardFn = [TII](const MachineInstr &MI) {
1170     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1171   };
1172   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1173   return RFEWaitStates - WaitStatesNeeded;
1174 }
1175 
1176 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1177   const SIInstrInfo *TII = ST.getInstrInfo();
1178   const int ReadM0WaitStates = 1;
1179   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1180   return ReadM0WaitStates -
1181          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1182 }
1183 
1184 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1185   fixVMEMtoScalarWriteHazards(MI);
1186   fixVcmpxPermlaneHazards(MI);
1187   fixSMEMtoVectorWriteHazards(MI);
1188   fixVcmpxExecWARHazard(MI);
1189   fixLdsBranchVmemWARHazard(MI);
1190   if (ST.hasLdsDirect()) {
1191     fixLdsDirectVALUHazard(MI);
1192     fixLdsDirectVMEMHazard(MI);
1193   }
1194   fixVALUPartialForwardingHazard(MI);
1195   fixVALUTransUseHazard(MI);
1196   fixWMMAHazards(MI);
1197   fixShift64HighRegBug(MI);
1198   fixVALUMaskWriteHazard(MI);
1199   fixVALUReadSGPRHazard(MI);
1200   fixRequiredExportPriority(MI);
1201 }
1202 
1203 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1204   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1205     return false;
1206 
1207   const SIInstrInfo *TII = ST.getInstrInfo();
1208   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1209   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1210     return (TII->isVOPC(MI) ||
1211             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1212            MI.modifiesRegister(AMDGPU::EXEC, TRI);
1213   };
1214 
1215   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1216     unsigned Opc = MI.getOpcode();
1217     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1218            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1219   };
1220 
1221   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1222       std::numeric_limits<int>::max())
1223     return false;
1224 
1225   // V_NOP will be discarded by SQ.
1226   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1227   // which is always a VGPR and available.
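  // For example, if src0 of the V_PERMLANE* is v1, this builds
  // "v_mov_b32_e32 v1, v1".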
1228   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1229   Register Reg = Src0->getReg();
1230   bool IsUndef = Src0->isUndef();
1231   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1232           TII->get(AMDGPU::V_MOV_B32_e32))
1233     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1234     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1235 
1236   return true;
1237 }
1238 
1239 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1240   if (!ST.hasVMEMtoScalarWriteHazard())
1241     return false;
1242   assert(!ST.hasExtendedWaitCounts());
1243 
1244   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1245     return false;
1246 
1247   if (MI->getNumDefs() == 0)
1248     return false;
1249 
1250   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1251 
1252   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1253     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1254         !SIInstrInfo::isFLAT(I))
1255       return false;
1256 
1257     for (const MachineOperand &Def : MI->defs()) {
1258       const MachineOperand *Op =
1259           I.findRegisterUseOperand(Def.getReg(), TRI, false);
1260       if (!Op)
1261         continue;
1262       return true;
1263     }
1264     return false;
1265   };
1266 
1267   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1268     return SIInstrInfo::isVALU(MI) ||
1269            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1270             !MI.getOperand(0).getImm()) ||
1271            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1272             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1273   };
1274 
1275   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1276       std::numeric_limits<int>::max())
1277     return false;
1278 
1279   const SIInstrInfo *TII = ST.getInstrInfo();
1280   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1281           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1282       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1283   return true;
1284 }
1285 
1286 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1287   if (!ST.hasSMEMtoVectorWriteHazard())
1288     return false;
1289   assert(!ST.hasExtendedWaitCounts());
1290 
1291   if (!SIInstrInfo::isVALU(*MI))
1292     return false;
1293 
1294   unsigned SDSTName;
1295   switch (MI->getOpcode()) {
1296   case AMDGPU::V_READLANE_B32:
1297   case AMDGPU::V_READFIRSTLANE_B32:
1298     SDSTName = AMDGPU::OpName::vdst;
1299     break;
1300   default:
1301     SDSTName = AMDGPU::OpName::sdst;
1302     break;
1303   }
1304 
1305   const SIInstrInfo *TII = ST.getInstrInfo();
1306   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1307   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1308   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1309   if (!SDST) {
1310     for (const auto &MO : MI->implicit_operands()) {
1311       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1312         SDST = &MO;
1313         break;
1314       }
1315     }
1316   }
1317 
1318   if (!SDST)
1319     return false;
1320 
1321   const Register SDSTReg = SDST->getReg();
1322   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1323     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1324   };
1325 
1326   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1327     if (TII->isSALU(MI)) {
1328       switch (MI.getOpcode()) {
1329       case AMDGPU::S_SETVSKIP:
1330       case AMDGPU::S_VERSION:
1331       case AMDGPU::S_WAITCNT_VSCNT:
1332       case AMDGPU::S_WAITCNT_VMCNT:
1333       case AMDGPU::S_WAITCNT_EXPCNT:
1334         // These instructions cannot mitigate the hazard.
1335         return false;
1336       case AMDGPU::S_WAITCNT_LGKMCNT:
1337         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1338         return (MI.getOperand(1).getImm() == 0) &&
1339                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1340       case AMDGPU::S_WAITCNT: {
1341         const int64_t Imm = MI.getOperand(0).getImm();
1342         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1343         // DsCnt corresponds to LGKMCnt here.
1344         return (Decoded.DsCnt == 0);
1345       }
1346       default:
1347         // SOPP instructions cannot mitigate the hazard.
1348         if (TII->isSOPP(MI))
1349           return false;
1350         // At this point the SALU can be assumed to mitigate the hazard
1351         // because either:
1352         // (a) it is independent of the at risk SMEM (breaking chain),
1353         // or
1354         // (b) it is dependent on the SMEM, in which case an appropriate
1355         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1356         //     SMEM instruction.
1357         return true;
1358       }
1359     }
1360     return false;
1361   };
1362 
1363   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1364       std::numeric_limits<int>::max())
1365     return false;
1366 
1367   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1368           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1369       .addImm(0);
1370   return true;
1371 }
1372 
1373 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1374   if (!ST.hasVcmpxExecWARHazard())
1375     return false;
1376   assert(!ST.hasExtendedWaitCounts());
1377 
1378   if (!SIInstrInfo::isVALU(*MI))
1379     return false;
1380 
1381   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1382   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1383     return false;
1384 
1385   auto IsHazardFn = [TRI](const MachineInstr &I) {
1386     if (SIInstrInfo::isVALU(I))
1387       return false;
1388     return I.readsRegister(AMDGPU::EXEC, TRI);
1389   };
1390 
1391   const SIInstrInfo *TII = ST.getInstrInfo();
1392   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1393     if (SIInstrInfo::isVALU(MI)) {
1394       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1395         return true;
1396       for (auto MO : MI.implicit_operands())
1397         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1398           return true;
1399     }
1400     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1401         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1402       return true;
1403     return false;
1404   };
1405 
1406   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1407       std::numeric_limits<int>::max())
1408     return false;
1409 
1410   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1411           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1412       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1413   return true;
1414 }
1415 
1416 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1417                                                  const GCNSubtarget &ST) {
1418   if (!ST.hasLdsBranchVmemWARHazard())
1419     return false;
1420 
1421   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1422   // instructions need to appear in the same function.
1423   bool HasLds = false;
1424   bool HasVmem = false;
1425   for (auto &MBB : MF) {
1426     for (auto &MI : MBB) {
1427       HasLds |= SIInstrInfo::isDS(MI);
1428       HasVmem |=
1429           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1430       if (HasLds && HasVmem)
1431         return true;
1432     }
1433   }
1434   return false;
1435 }
1436 
1437 static bool isStoreCountWaitZero(const MachineInstr &I) {
1438   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1439          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1440          !I.getOperand(1).getImm();
1441 }
1442 
1443 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1444   if (!RunLdsBranchVmemWARHazardFixup)
1445     return false;
1446 
1447   assert(ST.hasLdsBranchVmemWARHazard());
1448   assert(!ST.hasExtendedWaitCounts());
1449 
1450   auto IsHazardInst = [](const MachineInstr &MI) {
1451     if (SIInstrInfo::isDS(MI))
1452       return 1;
1453     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1454       return 2;
1455     return 0;
1456   };
1457 
1458   auto InstType = IsHazardInst(*MI);
1459   if (!InstType)
1460     return false;
1461 
1462   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1463     return IsHazardInst(I) || isStoreCountWaitZero(I);
1464   };
1465 
1466   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1467     if (!I.isBranch())
1468       return false;
1469 
1470     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1471       auto InstType2 = IsHazardInst(I);
1472       return InstType2 && InstType != InstType2;
1473     };
1474 
1475     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1476       auto InstType2 = IsHazardInst(I);
1477       if (InstType == InstType2)
1478         return true;
1479 
1480       return isStoreCountWaitZero(I);
1481     };
1482 
1483     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1484            std::numeric_limits<int>::max();
1485   };
1486 
1487   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1488       std::numeric_limits<int>::max())
1489     return false;
1490 
1491   const SIInstrInfo *TII = ST.getInstrInfo();
1492   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1493           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1494     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1495     .addImm(0);
1496 
1497   return true;
1498 }
1499 
1500 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1501   if (!SIInstrInfo::isLDSDIR(*MI))
1502     return false;
1503 
1504   const int NoHazardWaitStates = 15;
1505   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1506   const Register VDSTReg = VDST->getReg();
1507 
1508   bool VisitedTrans = false;
1509   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1510     if (!SIInstrInfo::isVALU(I))
1511       return false;
1512     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1513     // Cover both WAR and WAW
1514     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1515   };
1516   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1517     if (WaitStates >= NoHazardWaitStates)
1518       return true;
1519     // Instructions which cause va_vdst==0 expire the hazard
1520     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1521            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1522   };
1523   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1524     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1525   };
1526 
1527   DenseSet<const MachineBasicBlock *> Visited;
1528   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1529                                     std::next(MI->getReverseIterator()), 0,
1530                                     IsExpiredFn, Visited, GetWaitStatesFn);
1531 
1532   // Transcendentals can execute in parallel with other VALUs.
1533   // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1534   if (VisitedTrans)
1535     Count = 0;
1536 
1537   MachineOperand *WaitVdstOp =
1538       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1539   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1540 
1541   return true;
1542 }
1543 
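     // For LDSDIR instructions whose destination VGPR is accessed by an earlier,
     // still-outstanding VMEM/FLAT/DS instruction, either set the waitvsrc operand
     // to 0 (on targets that support it) or insert "s_waitcnt_depctr vm_vsrc(0)".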
1544 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1545   if (!SIInstrInfo::isLDSDIR(*MI))
1546     return false;
1547 
1548   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1549   const Register VDSTReg = VDST->getReg();
1550 
1551   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1552     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1553         !SIInstrInfo::isDS(I))
1554       return false;
1555     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1556   };
1557   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1558   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1559   // according to the type of VMEM instruction.
1560   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1561     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1562            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1563            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1564             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1565            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1566             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1567   };
1568 
1569   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1570       std::numeric_limits<int>::max())
1571     return false;
1572 
1573   if (LdsdirCanWait) {
1574     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1575   } else {
1576     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1577             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1578         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1579   }
1580 
1581   return true;
1582 }
1583 
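     // Mitigate the wave64 VALU partial forwarding hazard by inserting an
     // s_waitcnt_depctr when the pattern described below is found.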
1584 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1585   if (!ST.hasVALUPartialForwardingHazard())
1586     return false;
1587   assert(!ST.hasExtendedWaitCounts());
1588 
1589   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1590     return false;
1591 
1592   SmallSetVector<Register, 4> SrcVGPRs;
1593 
1594   for (const MachineOperand &Use : MI->explicit_uses()) {
1595     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1596       SrcVGPRs.insert(Use.getReg());
1597   }
1598 
1599   // Only applies with >= 2 unique VGPR sources
1600   if (SrcVGPRs.size() <= 1)
1601     return false;
1602 
1603   // Look for the following pattern:
1604   //   Va <- VALU [PreExecPos]
1605   //   intv1
1606   //   Exec <- SALU [ExecPos]
1607   //   intv2
1608   //   Vb <- VALU [PostExecPos]
1609   //   intv3
1610   //   MI Va, Vb (WaitState = 0)
1611   //
1612   // Where:
1613   // intv1 + intv2 <= 2 VALUs
1614   // intv3 <= 4 VALUs
1615   //
1616   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
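       //
       // For example (illustrative only, not literal opcodes):
       //   v_mov_b32  v0, ...       ; Va <- VALU
       //   s_mov_b64  exec, ...     ; Exec <- SALU
       //   v_mov_b32  v1, ...       ; Vb <- VALU
       //   v_add_f32  v2, v0, v1    ; MI reads both Va and Vb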
1617 
1618   const int Intv1plus2MaxVALUs = 2;
1619   const int Intv3MaxVALUs = 4;
1620   const int IntvMaxVALUs = 6;
1621   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1622 
1623   struct StateType {
1624     SmallDenseMap<Register, int, 4> DefPos;
1625     int ExecPos = std::numeric_limits<int>::max();
1626     int VALUs = 0;
1627   };
1628 
1629   StateType State;
1630 
1631   // This lambda handles both expiry testing and all of the hazard detection.
1632   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1633     // Too many VALU states have passed
1634     if (State.VALUs > NoHazardVALUWaitStates)
1635       return HazardExpired;
1636 
1637     // Instructions which cause va_vdst==0 expire the hazard
1638     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1639         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1640         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1641          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1642       return HazardExpired;
1643 
1644     // Track register writes
1645     bool Changed = false;
1646     if (SIInstrInfo::isVALU(I)) {
1647       for (Register Src : SrcVGPRs) {
1648         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1649           State.DefPos[Src] = State.VALUs;
1650           Changed = true;
1651         }
1652       }
1653     } else if (SIInstrInfo::isSALU(I)) {
1654       if (State.ExecPos == std::numeric_limits<int>::max()) {
1655         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1656           State.ExecPos = State.VALUs;
1657           Changed = true;
1658         }
1659       }
1660     }
1661 
1662     // Early expiration: too many VALUs in intv3
1663     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1664       return HazardExpired;
1665 
1666     // Only evaluate state if something changed
1667     if (!Changed)
1668       return NoHazardFound;
1669 
1670     // Determine positions of VALUs pre/post exec change
1671     if (State.ExecPos == std::numeric_limits<int>::max())
1672       return NoHazardFound;
1673 
1674     int PreExecPos = std::numeric_limits<int>::max();
1675     int PostExecPos = std::numeric_limits<int>::max();
1676 
1677     for (auto Entry : State.DefPos) {
1678       int DefVALUs = Entry.second;
1679       if (DefVALUs != std::numeric_limits<int>::max()) {
1680         if (DefVALUs >= State.ExecPos)
1681           PreExecPos = std::min(PreExecPos, DefVALUs);
1682         else
1683           PostExecPos = std::min(PostExecPos, DefVALUs);
1684       }
1685     }
1686 
1687     // Need a VALU def after the exec change
1688     if (PostExecPos == std::numeric_limits<int>::max())
1689       return NoHazardFound;
1690 
1691     // Too many VALUs in intv3?
1692     int Intv3VALUs = PostExecPos;
1693     if (Intv3VALUs > Intv3MaxVALUs)
1694       return HazardExpired;
1695 
1696     // Too many VALUs in intv2?
1697     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1698     if (Intv2VALUs > Intv1plus2MaxVALUs)
1699       return HazardExpired;
1700 
1701     // Need a VALUs pre exec change
1702     if (PreExecPos == std::numeric_limits<int>::max())
1703       return NoHazardFound;
1704 
1705     // Too many VALUs in intv1?
1706     int Intv1VALUs = PreExecPos - State.ExecPos;
1707     if (Intv1VALUs > Intv1plus2MaxVALUs)
1708       return HazardExpired;
1709 
1710     // Too many VALUs in intv1 + intv2
1711     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1712       return HazardExpired;
1713 
1714     return HazardFound;
1715   };
1716   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1717     if (SIInstrInfo::isVALU(MI))
1718       State.VALUs += 1;
1719   };
1720 
1721   DenseSet<const MachineBasicBlock *> Visited;
1722   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1723                             std::next(MI->getReverseIterator()), Visited))
1724     return false;
1725 
1726   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1727           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1728       .addImm(0x0fff);
1729 
1730   return true;
1731 }
1732 
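     // Mitigate the VALU TRANS-use hazard: a VALU reading the result of a recent
     // transcendental instruction may need an explicit va_vdst wait; the exact
     // pattern is described below.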
1733 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1734   if (!ST.hasVALUTransUseHazard())
1735     return false;
1736   assert(!ST.hasExtendedWaitCounts());
1737 
1738   if (!SIInstrInfo::isVALU(*MI))
1739     return false;
1740 
1741   SmallSet<Register, 4> SrcVGPRs;
1742 
1743   for (const MachineOperand &Use : MI->explicit_uses()) {
1744     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1745       SrcVGPRs.insert(Use.getReg());
1746   }
1747 
1748   // Look for the following pattern:
1749   //   Va <- TRANS VALU
1750   //   intv
1751   //   MI Va (WaitState = 0)
1752   //
1753   // Where:
1754   // intv <= 5 VALUs / 1 TRANS
1755   //
1756   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1757 
1758   const int IntvMaxVALUs = 5;
1759   const int IntvMaxTRANS = 1;
1760 
1761   struct StateType {
1762     int VALUs = 0;
1763     int TRANS = 0;
1764   };
1765 
1766   StateType State;
1767 
1768   // This lambda handles both expiry testing and all of the hazard detection.
1769   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1770     // Too many VALU states have passed
1771     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1772       return HazardExpired;
1773 
1774     // Instructions which cause va_vdst==0 expire the hazard
1775     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1776         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1777         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1778          I.getOperand(0).getImm() == 0x0fff))
1779       return HazardExpired;
1780 
1781     // Track register writes
1782     if (SIInstrInfo::isTRANS(I)) {
1783       for (Register Src : SrcVGPRs) {
1784         if (I.modifiesRegister(Src, &TRI)) {
1785           return HazardFound;
1786         }
1787       }
1788     }
1789 
1790     return NoHazardFound;
1791   };
1792   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1793     if (SIInstrInfo::isVALU(MI))
1794       State.VALUs += 1;
1795     if (SIInstrInfo::isTRANS(MI))
1796       State.TRANS += 1;
1797   };
1798 
1799   DenseSet<const MachineBasicBlock *> Visited;
1800   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1801                             std::next(MI->getReverseIterator()), Visited))
1802     return false;
1803 
1804   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1805   // hazard is avoided.
1806   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1807           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1808       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1809 
1810   return true;
1811 }
1812 
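     // WMMA/SWMMAC back-to-back hazard: if the D matrix written by the previous
     // WMMA overlaps the A/B inputs (or, on GFX12+, the SWMMAC index) of this one,
     // insert a V_NOP between them.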
1813 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1814   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1815     return false;
1816 
1817   const SIInstrInfo *TII = ST.getInstrInfo();
1818   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1819 
1820   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1821     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1822       return false;
1823 
1824     // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
1825     // overlaps with the dest (matrix D) of the previous wmma.
1826     const Register CurSrc0Reg =
1827         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1828     const Register CurSrc1Reg =
1829         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1830 
1831     const Register PrevDstReg =
1832         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1833 
1834     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1835         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1836       return true;
1837     }
1838 
1839     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1840     // but Index can't overlap with PrevDstReg.
1841     if (AMDGPU::isGFX12Plus(ST)) {
1842       if (SIInstrInfo::isSWMMAC(*MI)) {
1843         const Register CurIndex =
1844             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1845         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1846           return true;
1847       }
1848       return false;
1849     }
1850 
1851     return false;
1852   };
1853 
1854   auto IsExpiredFn = [](const MachineInstr &I, int) {
1855     return SIInstrInfo::isVALU(I);
1856   };
1857 
1858   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1859       std::numeric_limits<int>::max())
1860     return false;
1861 
1862   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1863 
1864   return true;
1865 }
1866 
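     // Work around the 64-bit shift bug: when the shift amount lives in the last
     // VGPR of an 8-register allocation block, temporarily swap it (via
     // V_SWAP_B32) into a free VGPR for the duration of the shift.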
1867 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1868   if (!ST.hasShift64HighRegBug())
1869     return false;
1870   assert(!ST.hasExtendedWaitCounts());
1871 
1872   switch (MI->getOpcode()) {
1873   default:
1874     return false;
1875   case AMDGPU::V_LSHLREV_B64_e64:
1876   case AMDGPU::V_LSHRREV_B64_e64:
1877   case AMDGPU::V_ASHRREV_I64_e64:
1878     break;
1879   }
1880 
1881   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1882   if (!Amt->isReg())
1883     return false;
1884 
1885   Register AmtReg = Amt->getReg();
1886   const MachineRegisterInfo &MRI = MF.getRegInfo();
1887   // Check if this is the last VGPR in the allocation block.
1888   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1889     return false;
1890 
1891   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1892     return false;
1893 
1894   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1895   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1896   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1897   bool Overlapped = OverlappedSrc || OverlappedDst;
1898 
1899   assert(!OverlappedDst || !OverlappedSrc ||
1900          Src1->getReg() == MI->getOperand(0).getReg());
1901   assert(ST.needsAlignedVGPRs());
1902   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1903 
1904   Register NewReg;
1905   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1906                                    : AMDGPU::VGPR_32RegClass) {
1907     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1908       NewReg = Reg;
1909       break;
1910     }
1911   }
1912 
1913   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1914                                : NewReg;
1915   Register NewAmtLo;
1916 
1917   if (Overlapped)
1918     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1919 
1920   DebugLoc DL = MI->getDebugLoc();
1921   MachineBasicBlock *MBB = MI->getParent();
1922   // Insert a full wait count; the found register might have a pending wait.
1923   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1924       .addImm(0);
1925 
1926   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1927   if (Overlapped)
1928     runOnInstruction(
1929         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1930             .addDef(AmtReg - 1)
1931             .addReg(AmtReg - 1, RegState::Undef)
1932             .addReg(NewAmtLo, RegState::Undef));
1933   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1934                        .addDef(AmtReg)
1935                        .addReg(AmtReg, RegState::Undef)
1936                        .addReg(NewAmt, RegState::Undef));
1937 
1938   // Instructions emitted after the current instruction will be processed by the
1939   // parent loop of the hazard recognizer in a natural way.
1940   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1941           AmtReg)
1942       .addDef(NewAmt)
1943       .addReg(NewAmt)
1944       .addReg(AmtReg);
1945   if (Overlapped)
1946     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1947             AmtReg - 1)
1948         .addDef(NewAmtLo)
1949         .addReg(NewAmtLo)
1950         .addReg(AmtReg - 1);
1951 
1952   // Re-running the hazard recognizer on the modified instruction is not
1953   // necessary: the inserted V_SWAP_B32 has already both read and written the
1954   // new registers, so hazards related to these registers have been handled.
1955   Amt->setReg(NewAmt);
1956   Amt->setIsKill(false);
1957   // We do not update liveness, so verifier may see it as undef.
1958   Amt->setIsUndef();
1959   if (OverlappedDst)
1960     MI->getOperand(0).setReg(NewReg);
1961   if (OverlappedSrc) {
1962     Src1->setReg(NewReg);
1963     Src1->setIsKill(false);
1964     Src1->setIsUndef();
1965   }
1966 
1967   return true;
1968 }
1969 
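     // NSA-to-VMEM hazard: a MUBUF/MTBUF with an offset using bits 1-2, issued
     // right after an NSA-encoded image instruction, requires one wait state on
     // affected subtargets.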
1970 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1971   int NSAtoVMEMWaitStates = 1;
1972 
1973   if (!ST.hasNSAtoVMEMBug())
1974     return 0;
1975 
1976   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1977     return 0;
1978 
1979   const SIInstrInfo *TII = ST.getInstrInfo();
1980   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1981   if (!Offset || (Offset->getImm() & 6) == 0)
1982     return 0;
1983 
1984   auto IsHazardFn = [TII](const MachineInstr &I) {
1985     if (!SIInstrInfo::isMIMG(I))
1986       return false;
1987     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1988     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1989            TII->getInstSizeInBytes(I) >= 16;
1990   };
1991 
1992   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1993 }
1994 
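     // FP-atomic to s_denorm_mode hazard: an S_DENORM_MODE issued within three
     // wait states of a floating-point atomic VMEM/FLAT access needs padding
     // unless a VALU or a wait instruction intervenes.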
1995 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1996   int FPAtomicToDenormModeWaitStates = 3;
1997 
1998   if (!ST.hasFPAtomicToDenormModeHazard())
1999     return 0;
2000   assert(!ST.hasExtendedWaitCounts());
2001 
2002   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2003     return 0;
2004 
2005   auto IsHazardFn = [](const MachineInstr &I) {
2006     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2007       return false;
2008     return SIInstrInfo::isFPAtomic(I);
2009   };
2010 
2011   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2012     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2013       return true;
2014 
2015     switch (MI.getOpcode()) {
2016     case AMDGPU::S_WAITCNT:
2017     case AMDGPU::S_WAITCNT_VSCNT:
2018     case AMDGPU::S_WAITCNT_VMCNT:
2019     case AMDGPU::S_WAITCNT_EXPCNT:
2020     case AMDGPU::S_WAITCNT_LGKMCNT:
2021     case AMDGPU::S_WAIT_IDLE:
2022       return true;
2023     default:
2024       break;
2025     }
2026 
2027     return false;
2028   };
2029 
2030   return FPAtomicToDenormModeWaitStates -
2031          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2032 }
2033 
2034 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2035   assert(SIInstrInfo::isMAI(*MI));
2036 
2037   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2038 }
2039 
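     // Compute optional padding between neighboring MFMAs: the
     // amdgpu-mfma-padding-ratio option requests that a given percentage of the
     // previous MFMA's latency be filled with s_nops to improve inter-wave
     // performance.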
2040 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2041   // Early exit if no padding is requested.
2042   if (MFMAPaddingRatio == 0)
2043     return 0;
2044 
2045   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2046   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2047     return 0;
2048 
2049   int NeighborMFMALatency = 0;
2050   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2051                             this](const MachineInstr &MI) {
2052     if (!SIInstrInfo::isMFMA(MI))
2053       return false;
2054 
2055     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2056     return true;
2057   };
2058 
2059   const int MaxMFMAPipelineWaitStates = 16;
2060   int WaitStatesSinceNeighborMFMA =
2061       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2062 
2063   int NeighborMFMAPaddingNeeded =
2064       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2065       WaitStatesSinceNeighborMFMA;
2066 
2067   return std::max(0, NeighborMFMAPaddingNeeded);
2068 }
2069 
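     // Check MAI (MFMA and accvgpr read/write) hazards specific to gfx908.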
2070 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2071   int WaitStatesNeeded = 0;
2072   unsigned Opc = MI->getOpcode();
2073 
2074   auto IsVALUFn = [](const MachineInstr &MI) {
2075     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2076   };
2077 
2078   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2079     const int LegacyVALUWritesVGPRWaitStates = 2;
2080     const int VALUWritesExecWaitStates = 4;
2081     const int MaxWaitStates = 4;
2082 
2083     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2084       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2085     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2086 
2087     if (WaitStatesNeeded < MaxWaitStates) {
2088       for (const MachineOperand &Use : MI->explicit_uses()) {
2089         const int MaxWaitStates = 2;
2090 
2091         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2092           continue;
2093 
2094         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2095           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2096         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2097 
2098         if (WaitStatesNeeded == MaxWaitStates)
2099           break;
2100       }
2101     }
2102   }
2103 
2104   for (const MachineOperand &Op : MI->explicit_operands()) {
2105     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2106       continue;
2107 
2108     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2109       continue;
2110 
2111     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2112     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2113     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2114     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2115     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2116     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2117     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2118     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2119     const int MaxWaitStates = 18;
2120     Register Reg = Op.getReg();
2121     unsigned HazardDefLatency = 0;
2122 
2123     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2124                                this](const MachineInstr &MI) {
2125       if (!SIInstrInfo::isMFMA(MI))
2126         return false;
2127       Register DstReg = MI.getOperand(0).getReg();
2128       if (DstReg == Reg)
2129         return false;
2130       HazardDefLatency =
2131           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2132       return TRI.regsOverlap(DstReg, Reg);
2133     };
2134 
2135     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2136                                                    MaxWaitStates);
2137     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2138     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2139     int OpNo = Op.getOperandNo();
2140     if (OpNo == SrcCIdx) {
2141       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2142     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2143       switch (HazardDefLatency) {
2144       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2145                break;
2146       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2147                break;
2148       case 16: [[fallthrough]];
2149       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2150                break;
2151       }
2152     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2153       switch (HazardDefLatency) {
2154       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2155                break;
2156       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2157                break;
2158       case 16: [[fallthrough]];
2159       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2160                break;
2161       }
2162     }
2163 
2164     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2165     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2166 
2167     if (WaitStatesNeeded == MaxWaitStates)
2168       return WaitStatesNeeded; // Early exit.
2169 
2170     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2171       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2172         return false;
2173       Register DstReg = MI.getOperand(0).getReg();
2174       return TRI.regsOverlap(Reg, DstReg);
2175     };
2176 
2177     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2178     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2179     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2180     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2181     if (OpNo == SrcCIdx)
2182       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2183     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2184       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2185 
2186     WaitStatesNeededForUse = NeedWaitStates -
2187       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2188     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2189 
2190     if (WaitStatesNeeded == MaxWaitStates)
2191       return WaitStatesNeeded; // Early exit.
2192   }
2193 
2194   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2195     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2196     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2197     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2198     const int MaxWaitStates = 13;
2199     Register DstReg = MI->getOperand(0).getReg();
2200     unsigned HazardDefLatency = 0;
2201 
2202     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2203                          this](const MachineInstr &MI) {
2204       if (!SIInstrInfo::isMFMA(MI))
2205         return false;
2206       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2207       HazardDefLatency =
2208           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2209       return TRI.regsOverlap(Reg, DstReg);
2210     };
2211 
2212     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2213     int NeedWaitStates;
2214     switch (HazardDefLatency) {
2215     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2216              break;
2217     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2218              break;
2219     case 16: [[fallthrough]];
2220     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2221              break;
2222     }
2223 
2224     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2225     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2226   }
2227 
2228   // Pad neighboring MFMA with noops for better inter-wave performance.
2229   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2230 
2231   return WaitStatesNeeded;
2232 }
2233 
2234 static int
2235 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2236                                                               bool IsGFX950) {
2237   // xdl def cycles | gfx940 | gfx950
2238   // 2 pass         |      3 |      4
2239   // 4 pass         |      5 |      6
2240   // 8 pass         |      9 |     10
2241   // 16 pass        |     17 |     18
2242   return NumPasses + 1 + IsGFX950;
2243 }
2244 
2245 static int
2246 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2247                                                               bool IsGFX950) {
2248   // xdl def cycles | gfx940 | gfx950
2249   // 2 pass         |      3 |      3
2250   // 4 pass         |      5 |      6
2251   // 8 pass         |      9 |     10
2252   // 16 pass        |     17 |     18
2253   return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2254 }
2255 
2256 static int
2257 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2258   // 2 pass -> 2
2259   // 4 pass -> 4
2260   // 8 pass -> 8
2261   // 16 pass -> 16
2262   return NumPasses;
2263 }
2264 
2265 static int
2266 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2267   // 2 pass -> 4
2268   // 4 pass -> 6
2269   // 8 pass -> 10
2270   // 16 pass -> 18
2271   return NumPasses + 2;
2272 }
2273 
2274 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2275   // 2 pass -> 5
2276   // 4 pass -> 7
2277   // 8 pass -> 11
2278   // 16 pass -> 19
2279   return NumPasses + 3;
2280 }
2281 
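     // Check hazards on MFMA source operands (previous MFMA or VALU writes) and
     // EXEC writes for gfx90a and newer, including the gfx940/gfx950 pass-count
     // based variants.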
2282 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2283   int WaitStatesNeeded = 0;
2284   unsigned Opc = MI->getOpcode();
2285 
2286   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2287     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2288   };
2289 
2290   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2291     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2292            !SIInstrInfo::isDOT(MI);
2293   };
2294 
2295   if (!SIInstrInfo::isMFMA(*MI))
2296     return WaitStatesNeeded;
2297 
2298   const int VALUWritesExecWaitStates = 4;
2299   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2300     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2301                           VALUWritesExecWaitStates);
2302   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2303 
2304   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2305 
2306   // Loop for both DGEMM and S/HGEMM 2nd instruction.
2307   for (const MachineOperand &Use : MI->explicit_uses()) {
2308     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2309     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2310     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2311     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2312     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2313     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2314     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2315     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2316     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2317     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2318     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2319     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2320     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2321     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2322     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2323     const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2324     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2325     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2326     const int MaxWaitStates = 19;
2327 
2328     if (!Use.isReg())
2329       continue;
2330     Register Reg = Use.getReg();
2331     bool FullReg;
2332     const MachineInstr *MI1;
2333 
2334     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2335                                this](const MachineInstr &MI) {
2336       if (!SIInstrInfo::isMFMA(MI))
2337         return false;
2338       Register DstReg = MI.getOperand(0).getReg();
2339       FullReg = (DstReg == Reg);
2340       MI1 = &MI;
2341       return TRI.regsOverlap(DstReg, Reg);
2342     };
2343 
2344     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2345       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2346     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2347 
2348     int NumWaitStates =
2349         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2350     if (NumWaitStates == std::numeric_limits<int>::max())
2351       continue;
2352 
2353     int OpNo = Use.getOperandNo();
2354     unsigned Opc1 = MI1->getOpcode();
2355     int NeedWaitStates = 0;
2356     if (OpNo == SrcCIdx) {
2357       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2358         NeedWaitStates = 0;
2359       } else if (FullReg) {
2360         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2361              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2362             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2363              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2364           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2365         else if (ST.hasGFX940Insts() &&
2366                  TSchedModel.computeInstrLatency(MI1) == 2)
2367           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2368       } else {
2369         switch (Opc1) {
2370         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2371         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2372         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2373         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2374           if (!isXDL(ST, *MI))
2375             NeedWaitStates =
2376                 ST.hasGFX950Insts()
2377                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2378                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2379           break;
2380         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2381         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2382           if (!isXDL(ST, *MI))
2383             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2384           break;
2385         default:
2386           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2387           if (ST.hasGFX940Insts()) {
2388             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2389               break;
2390 
2391             NeedWaitStates =
2392                 isXDL(ST, *MI1)
2393                     ? (isXDL(ST, *MI)
2394                            ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2395                                  NumPasses, ST.hasGFX950Insts())
2396                            : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2397                                  NumPasses, ST.hasGFX950Insts()))
2398                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2399                           NumPasses);
2400             break;
2401           }
2402 
2403           switch (NumPasses) {
2404           case 2:
2405             NeedWaitStates =
2406                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2407                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2408             break;
2409           case 8:
2410             NeedWaitStates =
2411                 isDGEMM(Opc)
2412                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2413                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2414             break;
2415           case 16:
2416             NeedWaitStates =
2417                 isDGEMM(Opc)
2418                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2419                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2420             break;
2421           default:
2422             llvm_unreachable("unexpected number of passes");
2423           }
2424         }
2425       }
2426     } else {
2427       switch (Opc1) {
2428       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2429       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2430       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2431       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2432         NeedWaitStates =
2433             ST.hasGFX950Insts()
2434                 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2435                 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2436         break;
2437       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2438       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2439         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2440         break;
2441       default:
2442         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2443 
2444         if (ST.hasGFX940Insts()) {
2445           NeedWaitStates =
2446               isXDL(ST, *MI1)
2447                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2448                         NumPasses)
2449                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2450                         NumPasses);
2451           break;
2452         }
2453 
2454         switch (NumPasses) {
2455         case 2:
2456           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2457           break;
2458         case 4:
2459           llvm_unreachable("unexpected number of passes for mfma");
2460         case 8:
2461           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2462           break;
2463         case 16:
2464         default:
2465           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2466         }
2467       }
2468     }
2469     if (WaitStatesNeeded >= NeedWaitStates)
2470       continue;
2471 
2472     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2473     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2474 
2475     if (WaitStatesNeeded == MaxWaitStates)
2476       break;
2477   }
2478 
2479   // Pad neighboring MFMA with noops for better inter-wave performance.
2480   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2481 
2482   return WaitStatesNeeded;
2483 }
2484 
2485 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2486   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2487   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2488     return 0;
2489 
2490   int WaitStatesNeeded = 0;
2491 
2492   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2493     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2494   };
2495 
2496   for (const MachineOperand &Op : MI->explicit_uses()) {
2497     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2498       continue;
2499 
2500     Register Reg = Op.getReg();
2501 
2502     const int AccVgprReadLdStWaitStates = 2;
2503     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2504     const int MaxWaitStates = 2;
2505 
2506     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2507       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2508     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2509 
2510     if (WaitStatesNeeded == MaxWaitStates)
2511       return WaitStatesNeeded; // Early exit.
2512 
2513     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2514       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2515           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2516         return false;
2517       auto IsVALUFn = [](const MachineInstr &MI) {
2518         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2519       };
2520       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2521              std::numeric_limits<int>::max();
2522     };
2523 
2524     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2525       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2526     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2527   }
2528 
2529   return WaitStatesNeeded;
2530 }
2531 
2532 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2533   // 2 pass -> 4
2534   // 4 pass -> 6
2535   // 8 pass -> 10
2536   // 16 pass -> 18
2537   return NumPasses + 2;
2538 }
2539 
2540 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2541   // 2 pass -> 5
2542   // 4 pass -> 7
2543   // 8 pass -> 11
2544   // 16 pass -> 19
2545   return NumPasses + 3;
2546 }
2547 
2548 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2549   // 2 pass -> 5
2550   // 4 pass -> 7
2551   // 8 pass -> 11
2552   // 16 pass -> 19
2553   return NumPasses + 3;
2554 }
2555 
2556 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2557   // 2 pass -> 4
2558   // 4 pass -> 6
2559   // 8 pass -> 10
2560   // 16 pass -> 18
2561   return NumPasses + 2;
2562 }
2563 
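     // Check hazards between MFMA/DOT writes and later VALU, VMEM, export, or
     // LDS accesses of the same registers (gfx90a and newer). MFMA-to-MFMA
     // hazards are handled in checkMAIHazards90A().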
2564 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2565   if (!ST.hasGFX90AInsts())
2566     return 0;
2567 
2568   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2569     return isDGEMM(MI.getOpcode());
2570   };
2571 
2572   // This is checked in checkMAIHazards90A()
2573   if (SIInstrInfo::isMFMA(*MI))
2574     return 0;
2575 
2576   const MachineRegisterInfo &MRI = MF.getRegInfo();
2577 
2578   int WaitStatesNeeded = 0;
2579 
2580   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2581                SIInstrInfo::isFLAT(*MI) ||
2582                SIInstrInfo::isDS(*MI);
2583   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2584   bool IsVALU = SIInstrInfo::isVALU(*MI);
2585 
2586   const MachineInstr *MFMA = nullptr;
2587   unsigned Reg;
2588   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2589     if (!SIInstrInfo::isMFMA(MI) ||
2590         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2591       return false;
2592     MFMA = &MI;
2593     return true;
2594   };
2595 
2596   const MachineInstr *DOT = nullptr;
2597   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2598     if (!SIInstrInfo::isDOT(MI) ||
2599         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2600       return false;
2601     DOT = &MI;
2602     return true;
2603   };
2604 
2605   bool DGEMMAfterVALUWrite = false;
2606   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2607     // Found DGEMM on reverse traversal to def.
2608     if (isDGEMM(MI.getOpcode()))
2609       DGEMMAfterVALUWrite = true;
2610 
2611     // Only a hazard if the register is defined by a VALU and a DGEMM is
2612     // found after the def.
2613     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2614       return false;
2615 
2616     return true;
2617   };
2618 
2619   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2620                                            AMDGPU::OpName::src2);
2621 
2622   if (IsMemOrExport || IsVALU) {
2623     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2624     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2625     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2626     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2627     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2628     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2629     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2630     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2631     const int DotWriteSameDotReadSrcAB = 3;
2632     const int DotWriteDifferentVALURead = 3;
2633     const int DMFMABetweenVALUWriteVMEMRead = 2;
2634     const int MaxWaitStates = 19;
2635 
2636     for (const MachineOperand &Use : MI->explicit_uses()) {
2637       if (!Use.isReg())
2638         continue;
2639       Reg = Use.getReg();
2640 
2641       DOT = nullptr;
2642       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2643                                                      MaxWaitStates);
2644       if (DOT) {
2645         int NeedWaitStates = 0;
2646         if (DOT->getOpcode() == MI->getOpcode()) {
2647           if (&Use - &MI->getOperand(0) != SrcCIdx)
2648             NeedWaitStates = DotWriteSameDotReadSrcAB;
2649         } else {
2650           NeedWaitStates = DotWriteDifferentVALURead;
2651         }
2652 
2653         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2654         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2655       }
2656 
2657       // Workaround for a HW data hazard bug observed only on GFX90A. When a
2658       // DGEMM instruction is in-between a VALU write and a VMEM read, the SQ
2659       // incorrectly omits the two wait states between the two instructions
2660       // that are needed to avoid the data hazard.
2661       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2662         DGEMMAfterVALUWrite = false;
2663         if (TRI.isVectorRegister(MRI, Reg)) {
2664           int WaitStatesNeededForUse =
2665                 DMFMABetweenVALUWriteVMEMRead -
2666                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2667                                       DMFMABetweenVALUWriteVMEMRead);
2668 
2669           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2670         }
2671       }
2672 
2673       MFMA = nullptr;
2674       WaitStatesSinceDef =
2675           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2676       if (!MFMA)
2677         continue;
2678 
2679       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2680       int NumPasses = HazardDefLatency;
2681       int NeedWaitStates = MaxWaitStates;
2682 
2683       if (isDGEMM(MFMA->getOpcode())) {
2684         switch (HazardDefLatency) {
2685         case 4:
2686           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2687                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2688           break;
2689         case 8:
2690         case 16:
2691           NeedWaitStates =
2692               IsMemOrExport
2693                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2694                   : (ST.hasGFX950Insts()
2695                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2696                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2697           break;
2698         default:
2699           llvm_unreachable("unexpected dgemm");
2700         }
2701       } else if (ST.hasGFX940Insts()) {
2702         NeedWaitStates =
2703             isXDL(ST, *MFMA)
2704                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2705                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2706                       NumPasses);
2707       } else {
2708         switch (HazardDefLatency) {
2709         case 2:
2710           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2711           break;
2712         case 8:
2713           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2714           break;
2715         case 16:
2716           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2717           break;
2718         default:
2719           llvm_unreachable("unexpected number of passes for mfma");
2720         }
2721       }
2722 
2723       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2724       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2725 
2726       if (WaitStatesNeeded == MaxWaitStates)
2727         break;
2728     }
2729   }
2730 
2731   unsigned Opc = MI->getOpcode();
2732   const int DMFMAToFMA64WaitStates = 2;
2733   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2734        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2735        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2736       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2737     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2738       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2739     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2740   }
2741 
2742   if (!IsVALU && !IsMemOrExport)
2743     return WaitStatesNeeded;
2744 
2745   for (const MachineOperand &Def : MI->defs()) {
2746     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2747     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2748     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2749     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2750     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2751     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2752     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2753     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2754     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2755     const int DotWriteDifferentVALUWrite = 3;
2756     const int MaxWaitStates = 19;
2757     const int MaxWarWaitStates = 15;
2758 
2759     Reg = Def.getReg();
2760 
2761     DOT = nullptr;
2762     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2763                                                    MaxWaitStates);
2764     if (DOT && DOT->getOpcode() != MI->getOpcode())
2765       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2766                                                     WaitStatesSinceDef);
2767 
2768     MFMA = nullptr;
2769     WaitStatesSinceDef =
2770         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2771     if (MFMA) {
2772       int NeedWaitStates = MaxWaitStates;
2773       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2774 
2775       if (isDGEMM(MFMA->getOpcode())) {
2776         switch (NumPasses) {
2777         case 4:
2778           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2779           break;
2780         case 8:
2781         case 16:
2782           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2783           break;
2784         default:
2785           llvm_unreachable("unexpected number of cycles for dgemm");
2786         }
2787       } else if (ST.hasGFX940Insts()) {
2788         NeedWaitStates =
2789             isXDL(ST, *MFMA)
2790                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2791                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2792       } else {
2793         switch (NumPasses) {
2794         case 2:
2795           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2796           break;
2797         case 8:
2798           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2799           break;
2800         case 16:
2801           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2802           break;
2803         default:
2804           llvm_unreachable("Unexpected number of passes for mfma");
2805         }
2806       }
2807 
2808       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2809       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2810 
2811       if (WaitStatesNeeded == MaxWaitStates)
2812         break;
2813     }
2814 
2815     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2816       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2817           !MI.readsRegister(Reg, &TRI))
2818         return false;
2819 
2820       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2821         return false;
2822 
2823       const MachineOperand *SrcC =
2824           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2825       assert(SrcC);
2826       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2827         return false;
2828 
2829       MFMA = &MI;
2830       return true;
2831     };
2832 
2833     MFMA = nullptr;
2834     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2835                                                 MaxWarWaitStates);
2836     if (!MFMA)
2837       continue;
2838 
2839     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2840     int NeedWaitStates = MaxWaitStates;
2841     switch (HazardDefLatency) {
2842     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2843              break;
2844     case 4:  assert(ST.hasGFX940Insts());
2845              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2846              break;
2847     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2848              break;
2849     case 16: [[fallthrough]];
2850     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2851              break;
2852     }
2853 
2854     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2855     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2856   }
2857 
2858   return WaitStatesNeeded;
2859 }
2860 
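     // Ask the scheduler to prefer another candidate if this MFMA would issue
     // while a previous MFMA is still within its latency.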
2861 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2862   if (!SU->isInstr())
2863     return false;
2864 
2865   const MachineInstr *MAI = nullptr;
2866 
2867   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2868     MAI = nullptr;
2869     if (SIInstrInfo::isMFMA(MI))
2870       MAI = &MI;
2871     return MAI != nullptr;
2872   };
2873 
2874   MachineInstr *MI = SU->getInstr();
2875   if (IsMFMAFn(*MI)) {
2876     int W = getWaitStatesSince(IsMFMAFn, 16);
2877     if (MAI)
2878       return W < (int)TSchedModel.computeInstrLatency(MAI);
2879   }
2880 
2881   return false;
2882 }
2883 
2884 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2885 // insertion of a new instruction.
2886 static void updateGetPCBundle(MachineInstr *NewMI) {
2887   if (!NewMI->isBundled())
2888     return;
2889 
2890   // Find start of bundle.
2891   auto I = NewMI->getIterator();
2892   while (I->isBundledWithPred())
2893     I--;
2894   if (I->isBundle())
2895     I++;
2896 
2897   // Bail if this is not an S_GETPC bundle.
2898   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2899     return;
2900 
2901   // Update offsets of any references in the bundle.
2902   const unsigned NewBytes = 4;
2903   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2904          "Unexpected instruction insertion in bundle");
2905   auto NextMI = std::next(NewMI->getIterator());
2906   auto End = NewMI->getParent()->end();
2907   while (NextMI != End && NextMI->isBundledWithPred()) {
2908     for (auto &Operand : NextMI->operands()) {
2909       if (Operand.isGlobal())
2910         Operand.setOffset(Operand.getOffset() + NewBytes);
2911     }
2912     NextMI++;
2913   }
2914 }
2915 
2916 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2917   if (!ST.hasVALUMaskWriteHazard())
2918     return false;
2919   assert(!ST.hasExtendedWaitCounts());
2920 
2921   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2922     return false;
2923 
2924   // The hazard sequence is three instructions:
2925   //   1. VALU reads SGPR as mask
2926   //   2. SALU writes SGPR
2927   //   3. SALU reads SGPR
2928   // The hazard can expire if the distance between 2 and 3 is sufficient.
2929   // In practice this happens <10% of the time, hence this always assumes
2930   // the hazard exists if 1 and 2 are present to avoid searching.
2931 
2932   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2933   if (!SDSTOp || !SDSTOp->isReg())
2934     return false;
2935 
2936   const Register HazardReg = SDSTOp->getReg();
2937   if (HazardReg == AMDGPU::EXEC ||
2938       HazardReg == AMDGPU::EXEC_LO ||
2939       HazardReg == AMDGPU::EXEC_HI ||
2940       HazardReg == AMDGPU::M0)
2941     return false;
2942 
2943   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2944     switch (I.getOpcode()) {
2945     case AMDGPU::V_ADDC_U32_e32:
2946     case AMDGPU::V_ADDC_U32_dpp:
2947     case AMDGPU::V_CNDMASK_B16_e32:
2948     case AMDGPU::V_CNDMASK_B16_dpp:
2949     case AMDGPU::V_CNDMASK_B32_e32:
2950     case AMDGPU::V_CNDMASK_B32_dpp:
2951     case AMDGPU::V_DIV_FMAS_F32_e64:
2952     case AMDGPU::V_DIV_FMAS_F64_e64:
2953     case AMDGPU::V_SUBB_U32_e32:
2954     case AMDGPU::V_SUBB_U32_dpp:
2955     case AMDGPU::V_SUBBREV_U32_e32:
2956     case AMDGPU::V_SUBBREV_U32_dpp:
2957       // These implicitly read VCC as mask source.
2958       return HazardReg == AMDGPU::VCC ||
2959              HazardReg == AMDGPU::VCC_LO ||
2960              HazardReg == AMDGPU::VCC_HI;
2961     case AMDGPU::V_ADDC_U32_e64:
2962     case AMDGPU::V_ADDC_U32_e64_dpp:
2963     case AMDGPU::V_CNDMASK_B16_e64:
2964     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2965     case AMDGPU::V_CNDMASK_B32_e64:
2966     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2967     case AMDGPU::V_SUBB_U32_e64:
2968     case AMDGPU::V_SUBB_U32_e64_dpp:
2969     case AMDGPU::V_SUBBREV_U32_e64:
2970     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2971       // Only check mask register overlaps.
2972       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2973       assert(SSRCOp);
2974       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2975     }
2976     default:
2977       return false;
2978     }
2979   };
2980 
2981   const MachineRegisterInfo &MRI = MF.getRegInfo();
2982   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2983     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2984     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2985         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2986       return true;
2987 
2988     // VALU access to any SGPR or literal constant other than HazardReg
2989     // mitigates the hazard. No need to check HazardReg here as this is
2990     // only called when IsHazardFn has already returned false.
2991     if (!SIInstrInfo::isVALU(I))
2992       return false;
2993     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2994       const MachineOperand &Op = I.getOperand(OpNo);
2995       if (Op.isReg()) {
2996         Register OpReg = Op.getReg();
2997         // Only consider uses
2998         if (!Op.isUse())
2999           continue;
3000         // Ignore EXEC
3001         if (OpReg == AMDGPU::EXEC ||
3002             OpReg == AMDGPU::EXEC_LO ||
3003             OpReg == AMDGPU::EXEC_HI)
3004           continue;
3005         // Ignore all implicit uses except VCC
3006         if (Op.isImplicit()) {
3007           if (OpReg == AMDGPU::VCC ||
3008               OpReg == AMDGPU::VCC_LO ||
3009               OpReg == AMDGPU::VCC_HI)
3010             return true;
3011           continue;
3012         }
3013         if (TRI.isSGPRReg(MRI, OpReg))
3014           return true;
3015       } else {
3016         const MCInstrDesc &InstDesc = I.getDesc();
3017         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3018         if (!TII.isInlineConstant(Op, OpInfo))
3019           return true;
3020       }
3021     }
3022     return false;
3023   };
3024 
3025   // Check for hazard
3026   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3027       std::numeric_limits<int>::max())
3028     return false;
3029 
3030   auto NextMI = std::next(MI->getIterator());
3031 
3032   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3033   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3034                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3035                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3036 
3037   // SALU write may be s_getpc in a bundle.
3038   updateGetPCBundle(NewMI);
3039 
3040   return true;
3041 }
3042 
3043 // Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3044 // e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
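     // For example, SGPR4 and SGPR5 (hardware encodings 4 and 5) both map to
     // pair number 2, i.e. SGPR4_SGPR5.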
3045 static std::optional<unsigned> sgprPairNumber(Register Reg,
3046                                               const SIRegisterInfo &TRI) {
3047   switch (Reg) {
3048   case AMDGPU::M0:
3049   case AMDGPU::EXEC:
3050   case AMDGPU::EXEC_LO:
3051   case AMDGPU::EXEC_HI:
3052   case AMDGPU::SGPR_NULL:
3053   case AMDGPU::SGPR_NULL64:
3054     return {};
3055   default:
3056     break;
3057   }
3058   unsigned RegN = TRI.getEncodingValue(Reg);
3059   if (RegN > 127)
3060     return {};
3061   return (RegN >> 1) & 0x3f;
3062 }
3063 
3064 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3065 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3066   assert(MMF == &MF);
3067 
3068   // Assume non-empty vector means it has already been computed.
3069   if (!VALUReadHazardSGPRs.empty())
3070     return;
3071 
3072   auto CallingConv = MF.getFunction().getCallingConv();
3073   bool IsCallFree =
3074       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3075 
3076   // Exhaustive search is only viable in non-caller/callee functions where
3077   // VALUs will be exposed to the hazard recognizer.
3078   UseVALUReadHazardExhaustiveSearch =
3079       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3080       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3081 
3082   // Consider all SGPRs as hazards if the shader makes calls or is a callee.
3083   bool UseVALUUseCache =
3084       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3085   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3086   if (!UseVALUUseCache)
3087     return;
3088 
3089   // Perform a post-order traversal of the blocks, scanning instructions in
3090   // reverse, to find VALUs which read an SGPR before a SALU write to the
3091   // same SGPR.  Compared to a linear scan, this reduces hazard insertion
3092   // when all VALU accesses to an SGPR occur after its last SALU write.
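       // For example (hypothetical, in program order):
       //   s_mov_b32 s0, 0        ; SALU write of s0
       //   v_add_f32 v0, s0, v0   ; VALU read of s0
       // Scanning in reverse, the VALU read is visited before the SALU write,
       // so SALUWriteSGPRs is not yet set for the pair and s[0:1] is not
       // marked as a hazard (assuming the block is not inside a cycle).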
3093   const MachineRegisterInfo &MRI = MF.getRegInfo();
3094   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3095   MachineCycleInfo CI;
3096   CI.compute(*MMF);
3097 
3098   for (auto *MBB : post_order(&MF)) {
3099     bool InCycle = CI.getCycle(MBB) != nullptr;
3100     for (auto &MI : reverse(MBB->instrs())) {
3101       bool IsVALU = SIInstrInfo::isVALU(MI);
3102       bool IsSALU = SIInstrInfo::isSALU(MI);
3103       if (!IsVALU && !IsSALU)
3104         continue;
3105 
3106       for (const MachineOperand &Op : MI.operands()) {
3107         if (!Op.isReg())
3108           continue;
3109         Register Reg = Op.getReg();
3110         assert(!Op.getSubReg());
3111         // Only consider implicit operands of VCC.
3112         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3113                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3114           continue;
3115         if (!TRI.isSGPRReg(MRI, Reg))
3116           continue;
3117         auto RegN = sgprPairNumber(Reg, TRI);
3118         if (!RegN)
3119           continue;
3120         if (IsVALU && Op.isUse()) {
3121           // Note: any access within a cycle must be considered a hazard.
3122           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3123             VALUReadHazardSGPRs.set(*RegN);
3124           ReadSGPRs.set(*RegN);
3125         } else if (IsSALU) {
3126           if (Op.isDef())
3127             SALUWriteSGPRs.set(*RegN);
3128           else
3129             ReadSGPRs.set(*RegN);
3130         }
3131       }
3132     }
3133   }
3134 }
3135 
3136 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3137   if (!ST.hasVALUReadSGPRHazard())
3138     return false;
3139 
3140   // The hazard sequence is fundamentally three instructions:
3141   //   1. VALU reads SGPR
3142   //   2. SALU writes SGPR
3143   //   3. VALU/SALU reads SGPR
3144   // Try to avoid searching for (1) because the expiry point of the hazard is
3145   // indeterminate; however, the hazard between (2) and (3) can expire if the
3146   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3147   // Note: SGPRs must be considered as 64-bit pairs, as the hazard exists
3148   // even when the accesses touch different individual SGPRs of a pair.
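       // Illustrative (hypothetical) sequence:
       //   v_add_f32 v0, s4, v0   ; (1) VALU reads s4 (pair s[4:5])
       //   s_mov_b32 s5, 0        ; (2) SALU writes s5 (same pair)
       //   s_add_u32 s6, s5, 1    ; (3) SALU reads s5
       // Inserting s_wait_alu sa_sdst(0) between (2) and (3) breaks the hazard.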
3149 
3150   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3151   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3152   if (!(MIIsSALU || MIIsVALU))
3153     return false;
3154 
3155   // Avoid the expensive search when compile time is a priority by
3156   // mitigating every SALU which writes an SGPR.
3157   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3158     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3159       return false;
3160 
3161     const MachineOperand *SDSTOp =
3162         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3163     if (!SDSTOp || !SDSTOp->isReg())
3164       return false;
3165 
3166     const Register HazardReg = SDSTOp->getReg();
3167     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3168         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3169       return false;
3170 
3171     // Add s_wait_alu sa_sdst(0) after SALU write.
3172     auto NextMI = std::next(MI->getIterator());
3173     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3174                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3175                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3176 
3177     // SALU write may be s_getpc in a bundle.
3178     updateGetPCBundle(NewMI);
3179 
3180     return true;
3181   }
3182 
3183   // Pre-compute set of SGPR pairs read by VALUs.
3184   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3185   computeVALUHazardSGPRs(MI->getMF());
3186 
3187   // If no VALU hazard SGPRs exist then there is nothing to do.
3188   if (VALUReadHazardSGPRs.none())
3189     return false;
3190 
3191   // All SGPR writes before a call/return must be flushed as the callee/caller
3192   // will not see the hazard chain, i.e. (2) to (3) described above.
3193   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3194                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3195                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3196 
3197   // Collect all SGPR sources for MI which are read by a VALU.
3198   const MachineRegisterInfo &MRI = MF.getRegInfo();
3199   SmallSet<Register, 4> SGPRsUsed;
3200 
3201   if (!IsSetPC) {
3202     for (const MachineOperand &Op : MI->all_uses()) {
3203       Register OpReg = Op.getReg();
3204 
3205       // Only consider VCC implicit uses on VALUs.
3206       // The only expected SALU implicit access is SCC, which is not a hazard.
3207       if (MIIsSALU && Op.isImplicit())
3208         continue;
3209 
3210       if (!TRI.isSGPRReg(MRI, OpReg))
3211         continue;
3212 
3213       auto RegN = sgprPairNumber(OpReg, TRI);
3214       if (!RegN)
3215         continue;
3216 
3217       if (!VALUReadHazardSGPRs[*RegN])
3218         continue;
3219 
3220       SGPRsUsed.insert(OpReg);
3221     }
3222 
3223     // No SGPRs -> nothing to do.
3224     if (SGPRsUsed.empty())
3225       return false;
3226   }
3227 
3228   // A hazard is any SALU which writes one of the SGPRs read by MI.
3229   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3230     if (!SIInstrInfo::isSALU(I))
3231       return false;
3232     // Ensure SGPR flush before call/return by conservatively assuming every
3233     // SALU writes an SGPR.
3234     if (IsSetPC && I.getNumDefs() > 0)
3235       return true;
3236     // Check for any register writes.
3237     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3238       return I.modifiesRegister(Reg, &TRI);
3239     });
3240   };
3241 
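       // The hazard between (2) and (3) expires once this many SALUs that do
       // not touch the hazard registers (see WaitStatesFn below) separate the
       // SGPR write from MI.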
3242   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3243   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3244     if (Count >= SALUExpiryCount)
3245       return true;
3246     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3247     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3248         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3249       return true;
3250     return false;
3251   };
3252 
3253   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3254     // Only count true SALUs as wait states.
3255     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3256       return 0;
3257     // SALU must be unrelated to any hazard registers.
3258     if (any_of(SGPRsUsed,
3259                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3260       return 0;
3261     return 1;
3262   };
3263 
3264   // Check for the hazard.
3265   DenseSet<const MachineBasicBlock *> Visited;
3266   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3267                                         std::next(MI->getReverseIterator()), 0,
3268                                         IsExpiredFn, Visited, WaitStatesFn);
3269 
3270   if (WaitStates >= SALUExpiryCount)
3271     return false;
3272 
3273   // Validate hazard through an exhaustive search.
3274   if (UseVALUReadHazardExhaustiveSearch) {
3275     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3276     // This is searching for (1) in the hazard description.
3277     auto hazardPair = [this](Register Reg) {
3278       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3279         return Register(AMDGPU::VCC);
3280       auto RegN = sgprPairNumber(Reg, TRI);
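           // RegN is always valid here: SGPRsUsed only contains registers for
           // which sgprPairNumber() returned a value.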
3281       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3282     };
3283     auto SearchHazardFn = [this, hazardPair,
3284                            &SGPRsUsed](const MachineInstr &I) {
3285       if (!SIInstrInfo::isVALU(I))
3286         return false;
3287       // Check for any register reads.
3288       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3289         return I.readsRegister(hazardPair(Reg), &TRI);
3290       });
3291     };
3292     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3293       return false;
3294     };
3295     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3296         std::numeric_limits<int>::max())
3297       return false;
3298   }
3299 
3300   // Add s_wait_alu sa_sdst(0) before SALU read.
3301   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3302                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3303                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3304 
3305   // SALU read may be after s_getpc in a bundle.
3306   updateGetPCBundle(NewMI);
3307 
3308   return true;
3309 }
3310 
3311 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3312                                const SIInstrInfo &TII) {
3313   MachineBasicBlock &EntryMBB = MF->front();
3314   if (EntryMBB.begin() != EntryMBB.end()) {
3315     auto &EntryMI = *EntryMBB.begin();
3316     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3317         EntryMI.getOperand(0).getImm() >= Priority)
3318       return false;
3319   }
3320 
3321   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3322       .addImm(Priority);
3323   return true;
3324 }
3325 
3326 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3327   if (!ST.hasRequiredExportPriority())
3328     return false;
3329 
3330   // Assume the following shader types will never have exports,
3331   // and avoid adding or adjusting S_SETPRIO.
3332   MachineBasicBlock *MBB = MI->getParent();
3333   MachineFunction *MF = MBB->getParent();
3334   auto CC = MF->getFunction().getCallingConv();
3335   switch (CC) {
3336   case CallingConv::AMDGPU_CS:
3337   case CallingConv::AMDGPU_CS_Chain:
3338   case CallingConv::AMDGPU_CS_ChainPreserve:
3339   case CallingConv::AMDGPU_KERNEL:
3340     return false;
3341   default:
3342     break;
3343   }
3344 
3345   const int MaxPriority = 3;
3346   const int NormalPriority = 2;
3347   const int PostExportPriority = 0;
3348 
3349   auto It = MI->getIterator();
3350   switch (MI->getOpcode()) {
3351   case AMDGPU::S_ENDPGM:
3352   case AMDGPU::S_ENDPGM_SAVED:
3353   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3354   case AMDGPU::SI_RETURN_TO_EPILOG:
3355     // Ensure a shader with calls raises priority at entry.
3356     // This ensures correct priority if exports exist in a callee.
3357     if (MF->getFrameInfo().hasCalls())
3358       return ensureEntrySetPrio(MF, NormalPriority, TII);
3359     return false;
3360   case AMDGPU::S_SETPRIO: {
3361     // Raise minimum priority unless in workaround.
3362     auto &PrioOp = MI->getOperand(0);
3363     int Prio = PrioOp.getImm();
3364     bool InWA = (Prio == PostExportPriority) &&
3365                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3366     if (InWA || Prio >= NormalPriority)
3367       return false;
3368     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3369     return true;
3370   }
3371   default:
3372     if (!TII.isEXP(*MI))
3373       return false;
3374     break;
3375   }
3376 
3377   // Check entry priority at each export (as there will only be a few).
3378   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3379   bool Changed = false;
3380   if (CC != CallingConv::AMDGPU_Gfx)
3381     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3382 
3383   auto NextMI = std::next(It);
3384   bool EndOfShader = false;
3385   if (NextMI != MBB->end()) {
3386     // Only need WA at end of sequence of exports.
3387     if (TII.isEXP(*NextMI))
3388       return Changed;
3389     // Assume appropriate S_SETPRIO after export means WA already applied.
3390     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3391         NextMI->getOperand(0).getImm() == PostExportPriority)
3392       return Changed;
3393     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3394   }
3395 
3396   const DebugLoc &DL = MI->getDebugLoc();
3397 
3398   // Lower priority.
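       // Emit the post-export workaround: drop to priority 0, wait for exports,
       // pad with two s_nop, then restore normal priority.  The expcnt wait and
       // the priority restore are skipped at the end of the shader.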
3399   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3400       .addImm(PostExportPriority);
3401 
3402   if (!EndOfShader) {
3403     // Wait for exports to complete.
3404     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3405         .addReg(AMDGPU::SGPR_NULL)
3406         .addImm(0);
3407   }
3408 
3409   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3410   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3411 
3412   if (!EndOfShader) {
3413     // Return to normal (higher) priority.
3414     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3415         .addImm(NormalPriority);
3416   }
3417 
3418   return true;
3419 }
3420