//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
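
// Example (hypothetical numbers): with -amdgpu-mfma-padding-ratio=50 and two
// dependent MFMAs whose latency is 16 cycles, roughly 8 cycles worth of
// s_nop padding would be requested between them; the default of 0 disables
// the padding entirely.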

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // When called from the scheduler (i.e. not in "HazardRecognizerMode"),
  // report stalls as plain hazards rather than requesting noop insertion.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
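
// For example, a request for 10 wait states is split across two instructions,
// since a single s_nop encodes at most 8 wait states (imm 0..7):
//
//   s_nop 7   ; 8 wait states
//   s_nop 1   ; 2 wait states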

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first.  Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
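
// Bookkeeping sketch: an instruction with 3 wait states is recorded as the
// instruction plus two nullptr placeholders pushed in front of it, so the
// deque (most recent first) looks like [nullptr, nullptr, MI, ...] and
// getWaitStatesSince() can count elapsed wait states by walking it.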

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scans only until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions.  The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
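  //
  // Illustrative sequence (registers are arbitrary, not from real output):
  // the second load below reads s[0:1], which the first load defines, so
  // the pair cannot share a clause and a break is required:
  //
  //   s_load_dwordx2 s[0:1], s[6:7], 0x0
  //   s_load_dword   s2,     s[0:1], 0x0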

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
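
// Illustrative sequence (hypothetical registers) for the SI-only hazard
// handled above:
//
//   v_readfirstlane_b32 s4, v0            ; VALU writes SGPR s4
//   s_nop 3                               ; 4 wait states required
//   s_buffer_load_dword s5, s[8:11], s4   ; SMRD reads s4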

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
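
// Illustrative sequence (hypothetical registers) for the VMEM-reads-SGPR
// hazard handled above:
//
//   v_readfirstlane_b32 s0, v2                  ; VALU writes SGPR s0
//   s_nop 4                                     ; 5 wait states required
//   buffer_load_dword v0, v1, s[4:7], s0 offen  ; VMEM reads s0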

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
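
// Illustrative sequence (hypothetical registers): v_div_scale writes VCC, so
// four wait states must elapse before v_div_fmas may read it:
//
//   v_div_scale_f32 v3, vcc, v0, v1, v0
//   s_nop 3
//   v_div_fmas_f32 v2, v3, v4, v5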

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// A dest sel forwarding issue occurs if additional logic is needed to
/// swizzle / pack the computed value into the correct bit position of the
/// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
/// have op_sel with dst_sel that is not aligned to the register. This
/// function analyzes \p MI and \returns an operand with a dst forwarding
/// issue, or nullptr if none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce forwarded
  // dests: 1. SDWA with dst_sel != DWORD, 2. VOP3 which writes the hi bits
  // (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with
  // op_sel[3:2] != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
        return nullptr;
  } else {
    // Type 2 && Type 3: (VOP3 which writes the hi bits) || (CVT_SR_FP8_F32
    // and CVT_SR_BF8_F32 with op_sel[3:2] != 0)
    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
              SISrcMods::DST_OP_SEL ||
          (AMDGPU::isFP8DstSelInst(Opcode) &&
           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
            SISrcMods::OP_SEL_0))))
      return nullptr;
  }

  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}
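
// For instance (illustrative, hypothetical registers), both of the following
// would be reported through their vdst operand:
//
//   v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
//   v_add_f16_e64 v0, v1, v2 op_sel:[0,0,1]   ; writes the hi half of v0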

/// Checks whether the provided \p MI "consumes" the operand with a dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
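
// Illustrative sequence (hypothetical): an SALU write to m0 followed too
// closely by one of the m0 readers guarded above needs one wait state:
//
//   s_mov_b32 m0, s0
//   s_nop 0
//   s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)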

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixVALUReadSGPRHazard(MI);
  fixRequiredExportPriority(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}
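
// Minimal sketch of the transformation above (hypothetical registers): given
// a VMEM read whose scalar source is overwritten by a following SALU,
//
//   buffer_load_dword v0, v1, s[4:7], 0 offen
//   s_mov_b32 s4, 0
//
// an s_waitcnt_depctr with vm_vsrc(0) is inserted before the s_mov_b32 so
// the VMEM has consumed its scalar sources before they are redefined.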

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at-risk SMEM (breaking the chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
1519     // Instructions which cause va_vdst==0 expire the hazard.
1520     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1521            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1522   };
1523   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1524     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1525   };
1526 
1527   DenseSet<const MachineBasicBlock *> Visited;
1528   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1529                                     std::next(MI->getReverseIterator()), 0,
1530                                     IsExpiredFn, Visited, GetWaitStatesFn);
1531 
1532   // Transcendentals can execute in parallel with other VALUs.
1533   // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1534   if (VisitedTrans)
1535     Count = 0;
1536 
1537   MachineOperand *WaitVdstOp =
1538       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1539   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1540 
1541   return true;
1542 }
1543 
1544 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1545   if (!SIInstrInfo::isLDSDIR(*MI))
1546     return false;
1547 
1548   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1549   const Register VDSTReg = VDST->getReg();
1550 
1551   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1552     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1553         !SIInstrInfo::isDS(I))
1554       return false;
1555     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1556   };
1557   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1558   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1559   // according to the type of VMEM instruction.
1560   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1561     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1562            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1563            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1564             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1565            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1566             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1567   };
1568 
1569   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1570       std::numeric_limits<int>::max())
1571     return false;
1572 
1573   if (LdsdirCanWait) {
1574     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1575   } else {
1576     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1577             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1578         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1579   }
1580 
1581   return true;
1582 }
1583 
1584 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1585   if (!ST.hasVALUPartialForwardingHazard())
1586     return false;
1587   assert(!ST.hasExtendedWaitCounts());
1588 
1589   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1590     return false;
1591 
1592   SmallSetVector<Register, 4> SrcVGPRs;
1593 
1594   for (const MachineOperand &Use : MI->explicit_uses()) {
1595     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1596       SrcVGPRs.insert(Use.getReg());
1597   }
1598 
1599   // Only applies with >= 2 unique VGPR sources
1600   if (SrcVGPRs.size() <= 1)
1601     return false;
1602 
1603   // Look for the following pattern:
1604   //   Va <- VALU [PreExecPos]
1605   //   intv1
1606   //   Exec <- SALU [ExecPos]
1607   //   intv2
1608   //   Vb <- VALU [PostExecPos]
1609   //   intv3
1610   //   MI Va, Vb (WaitState = 0)
1611   //
1612   // Where:
1613   // intv1 + intv2 <= 2 VALUs
1614   // intv3 <= 4 VALUs
1615   //
1616   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
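       //
       // A hypothetical sequence matching the pattern (illustrative only):
       //   v_mov_b32 v0, 0        ; Va write [PreExecPos]
       //   s_mov_b64 exec, -1     ; Exec write [ExecPos]
       //   v_mov_b32 v1, 1        ; Vb write [PostExecPos]
       //   v_add_f32 v2, v0, v1   ; MI reading both Va and Vb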
1617 
1618   const int Intv1plus2MaxVALUs = 2;
1619   const int Intv3MaxVALUs = 4;
1620   const int IntvMaxVALUs = 6;
1621   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1622 
1623   struct StateType {
1624     SmallDenseMap<Register, int, 4> DefPos;
1625     int ExecPos = std::numeric_limits<int>::max();
1626     int VALUs = 0;
1627   };
1628 
1629   StateType State;
1630 
1631   // This lambda handles both the hazard detection and the expiry testing.
1632   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1633     // Too many VALU states have passed
1634     if (State.VALUs > NoHazardVALUWaitStates)
1635       return HazardExpired;
1636 
1637     // Instructions which cause va_vdst==0 expire the hazard.
1638     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1639         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1640         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1641          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1642       return HazardExpired;
1643 
1644     // Track register writes.
1645     bool Changed = false;
1646     if (SIInstrInfo::isVALU(I)) {
1647       for (Register Src : SrcVGPRs) {
1648         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1649           State.DefPos[Src] = State.VALUs;
1650           Changed = true;
1651         }
1652       }
1653     } else if (SIInstrInfo::isSALU(I)) {
1654       if (State.ExecPos == std::numeric_limits<int>::max()) {
1655         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1656           State.ExecPos = State.VALUs;
1657           Changed = true;
1658         }
1659       }
1660     }
1661 
1662     // Early expiration: too many VALUs in intv3
1663     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1664       return HazardExpired;
1665 
1666     // Only evaluate state if something changed
1667     if (!Changed)
1668       return NoHazardFound;
1669 
1670     // Determine positions of VALUs pre/post exec change
1671     if (State.ExecPos == std::numeric_limits<int>::max())
1672       return NoHazardFound;
1673 
1674     int PreExecPos = std::numeric_limits<int>::max();
1675     int PostExecPos = std::numeric_limits<int>::max();
1676 
1677     for (auto Entry : State.DefPos) {
1678       int DefVALUs = Entry.second;
1679       if (DefVALUs != std::numeric_limits<int>::max()) {
1680         if (DefVALUs >= State.ExecPos)
1681           PreExecPos = std::min(PreExecPos, DefVALUs);
1682         else
1683           PostExecPos = std::min(PostExecPos, DefVALUs);
1684       }
1685     }
1686 
1687     // Need a VALU write after the exec change.
1688     if (PostExecPos == std::numeric_limits<int>::max())
1689       return NoHazardFound;
1690 
1691     // Too many VALUs in intv3?
1692     int Intv3VALUs = PostExecPos;
1693     if (Intv3VALUs > Intv3MaxVALUs)
1694       return HazardExpired;
1695 
1696     // Too many VALUs in intv2?
1697     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1698     if (Intv2VALUs > Intv1plus2MaxVALUs)
1699       return HazardExpired;
1700 
1701     // Need a VALU write before the exec change.
1702     if (PreExecPos == std::numeric_limits<int>::max())
1703       return NoHazardFound;
1704 
1705     // Too many VALUs in intv1?
1706     int Intv1VALUs = PreExecPos - State.ExecPos;
1707     if (Intv1VALUs > Intv1plus2MaxVALUs)
1708       return HazardExpired;
1709 
1710     // Too many VALUs in intv1 + intv2?
1711     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1712       return HazardExpired;
1713 
1714     return HazardFound;
1715   };
1716   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1717     if (SIInstrInfo::isVALU(MI))
1718       State.VALUs += 1;
1719   };
1720 
1721   DenseSet<const MachineBasicBlock *> Visited;
1722   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1723                             std::next(MI->getReverseIterator()), Visited))
1724     return false;
1725 
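       // 0x0fff is va_vdst(0) with every other depctr field left at its
       // no-wait value, matching the expiry check above.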
1726   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1727           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1728       .addImm(0x0fff);
1729 
1730   return true;
1731 }
1732 
1733 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1734   if (!ST.hasVALUTransUseHazard())
1735     return false;
1736   assert(!ST.hasExtendedWaitCounts());
1737 
1738   if (!SIInstrInfo::isVALU(*MI))
1739     return false;
1740 
1741   SmallSet<Register, 4> SrcVGPRs;
1742 
1743   for (const MachineOperand &Use : MI->explicit_uses()) {
1744     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1745       SrcVGPRs.insert(Use.getReg());
1746   }
1747 
1748   // Look for the following pattern:
1749   //   Va <- TRANS VALU
1750   //   intv
1751   //   MI Va (WaitState = 0)
1752   //
1753   // Where:
1754   // intv <= 5 VALUs / 1 TRANS
1755   //
1756   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
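       //
       // A hypothetical sequence matching the pattern (illustrative only):
       //   v_exp_f32 v0, v1       ; Va written by a TRANS op
       //   v_add_f32 v2, v0, v3   ; MI reads Va with no intervening wait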
1757 
1758   const int IntvMaxVALUs = 5;
1759   const int IntvMaxTRANS = 1;
1760 
1761   struct StateType {
1762     int VALUs = 0;
1763     int TRANS = 0;
1764   };
1765 
1766   StateType State;
1767 
1768   // This lambda handles both the hazard detection and the expiry testing.
1769   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1770     // Too many VALU states have passed
1771     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1772       return HazardExpired;
1773 
1774     // Instructions which cause va_vdst==0 expire hazard
1775     // Instructions which cause va_vdst==0 expire the hazard.
1776         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1777         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1778          I.getOperand(0).getImm() == 0x0fff))
1779       return HazardExpired;
1780 
1781     // Track register writes.
1782     if (SIInstrInfo::isTRANS(I)) {
1783       for (Register Src : SrcVGPRs) {
1784         if (I.modifiesRegister(Src, &TRI)) {
1785           return HazardFound;
1786         }
1787       }
1788     }
1789 
1790     return NoHazardFound;
1791   };
1792   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1793     if (SIInstrInfo::isVALU(MI))
1794       State.VALUs += 1;
1795     if (SIInstrInfo::isTRANS(MI))
1796       State.TRANS += 1;
1797   };
1798 
1799   DenseSet<const MachineBasicBlock *> Visited;
1800   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1801                             std::next(MI->getReverseIterator()), Visited))
1802     return false;
1803 
1804   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1805   // hazard is avoided.
1806   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1807           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1808       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1809 
1810   return true;
1811 }
1812 
1813 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1814   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1815     return false;
1816 
1817   const SIInstrInfo *TII = ST.getInstrInfo();
1818   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1819 
1820   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1821     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1822       return false;
1823 
1824     // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
1825     // overlaps with the dest (matrix D) of the previous wmma.
1826     const Register CurSrc0Reg =
1827         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1828     const Register CurSrc1Reg =
1829         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1830 
1831     const Register PrevDstReg =
1832         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1833 
1834     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1835         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1836       return true;
1837     }
1838 
1839     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1840     // but Index can't overlap with PrevDstReg.
1841     if (AMDGPU::isGFX12Plus(ST)) {
1842       if (SIInstrInfo::isSWMMAC(*MI)) {
1843         const Register CurIndex =
1844             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1845         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1846           return true;
1847       }
1848       return false;
1849     }
1850 
1851     return false;
1852   };
1853 
1854   auto IsExpiredFn = [](const MachineInstr &I, int) {
1855     return SIInstrInfo::isVALU(I);
1856   };
1857 
1858   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1859       std::numeric_limits<int>::max())
1860     return false;
1861 
1862   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1863 
1864   return true;
1865 }
1866 
1867 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1868   if (!ST.hasShift64HighRegBug())
1869     return false;
1870   assert(!ST.hasExtendedWaitCounts());
1871 
1872   switch (MI->getOpcode()) {
1873   default:
1874     return false;
1875   case AMDGPU::V_LSHLREV_B64_e64:
1876   case AMDGPU::V_LSHRREV_B64_e64:
1877   case AMDGPU::V_ASHRREV_I64_e64:
1878     break;
1879   }
1880 
1881   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1882   if (!Amt->isReg())
1883     return false;
1884 
1885   Register AmtReg = Amt->getReg();
1886   const MachineRegisterInfo &MRI = MF.getRegInfo();
1887   // Check if this is the last VGPR in the allocation block.
1888   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1889     return false;
1890 
1891   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1892     return false;
1893 
1894   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1895   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1896   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1897   bool Overlapped = OverlappedSrc || OverlappedDst;
1898 
1899   assert(!OverlappedDst || !OverlappedSrc ||
1900          Src1->getReg() == MI->getOperand(0).getReg());
1901   assert(ST.needsAlignedVGPRs());
1902   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1903 
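       // Pick a scratch VGPR (or an aligned VGPR pair if the amount overlaps
       // the shift operands) that MI does not reference, swap the shift amount
       // into it, and swap back after the shift using V_SWAP_B32.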
1904   Register NewReg;
1905   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1906                                    : AMDGPU::VGPR_32RegClass) {
1907     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1908       NewReg = Reg;
1909       break;
1910     }
1911   }
1912 
1913   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1914                                : NewReg;
1915   Register NewAmtLo;
1916 
1917   if (Overlapped)
1918     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1919 
1920   DebugLoc DL = MI->getDebugLoc();
1921   MachineBasicBlock *MBB = MI->getParent();
1922   // Insert a full wait count because the found register might have a pending wait.
1923   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1924       .addImm(0);
1925 
1926   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1927   if (Overlapped)
1928     runOnInstruction(
1929         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1930             .addDef(AmtReg - 1)
1931             .addReg(AmtReg - 1, RegState::Undef)
1932             .addReg(NewAmtLo, RegState::Undef));
1933   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1934                        .addDef(AmtReg)
1935                        .addReg(AmtReg, RegState::Undef)
1936                        .addReg(NewAmt, RegState::Undef));
1937 
1938   // Instructions emitted after the current instruction will be processed by the
1939   // parent loop of the hazard recognizer in a natural way.
1940   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1941           AmtReg)
1942       .addDef(NewAmt)
1943       .addReg(NewAmt)
1944       .addReg(AmtReg);
1945   if (Overlapped)
1946     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1947             AmtReg - 1)
1948         .addDef(NewAmtLo)
1949         .addReg(NewAmtLo)
1950         .addReg(AmtReg - 1);
1951 
1952   // Re-running the hazard recognizer on the modified instruction is not
1953   // necessary; the inserted V_SWAP_B32 has already both read and written the
1954   // new registers, so hazards related to these registers have been handled.
1955   Amt->setReg(NewAmt);
1956   Amt->setIsKill(false);
1957   // We do not update liveness, so the verifier may see it as undef.
1958   Amt->setIsUndef();
1959   if (OverlappedDst)
1960     MI->getOperand(0).setReg(NewReg);
1961   if (OverlappedSrc) {
1962     Src1->setReg(NewReg);
1963     Src1->setIsKill(false);
1964     Src1->setIsUndef();
1965   }
1966 
1967   return true;
1968 }
1969 
1970 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1971   int NSAtoVMEMWaitStates = 1;
1972 
1973   if (!ST.hasNSAtoVMEMBug())
1974     return 0;
1975 
1976   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1977     return 0;
1978 
1979   const SIInstrInfo *TII = ST.getInstrInfo();
1980   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
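       // The hazard only applies when bit 1 or bit 2 of the offset is set.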
1981   if (!Offset || (Offset->getImm() & 6) == 0)
1982     return 0;
1983 
1984   auto IsHazardFn = [TII](const MachineInstr &I) {
1985     if (!SIInstrInfo::isMIMG(I))
1986       return false;
1987     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1988     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1989            TII->getInstSizeInBytes(I) >= 16;
1990   };
1991 
1992   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1993 }
1994 
1995 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1996   int FPAtomicToDenormModeWaitStates = 3;
1997 
1998   if (!ST.hasFPAtomicToDenormModeHazard())
1999     return 0;
2000   assert(!ST.hasExtendedWaitCounts());
2001 
2002   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2003     return 0;
2004 
2005   auto IsHazardFn = [](const MachineInstr &I) {
2006     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2007       return false;
2008     return SIInstrInfo::isFPAtomic(I);
2009   };
2010 
2011   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2012     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2013       return true;
2014 
2015     switch (MI.getOpcode()) {
2016     case AMDGPU::S_WAITCNT:
2017     case AMDGPU::S_WAITCNT_VSCNT:
2018     case AMDGPU::S_WAITCNT_VMCNT:
2019     case AMDGPU::S_WAITCNT_EXPCNT:
2020     case AMDGPU::S_WAITCNT_LGKMCNT:
2021     case AMDGPU::S_WAIT_IDLE:
2022       return true;
2023     default:
2024       break;
2025     }
2026 
2027     return false;
2028   };
2029 
2030   return FPAtomicToDenormModeWaitStates -
2031          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2032 }
2033 
2034 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2035   assert(SIInstrInfo::isMAI(*MI));
2036 
2037   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2038 }
2039 
2040 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2041   // Early exit if no padding is requested.
2042   if (MFMAPaddingRatio == 0)
2043     return 0;
2044 
2045   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2046   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2047     return 0;
2048 
2049   int NeighborMFMALatency = 0;
2050   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2051                             this](const MachineInstr &MI) {
2052     if (!SIInstrInfo::isMFMA(MI))
2053       return false;
2054 
2055     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2056     return true;
2057   };
2058 
2059   const int MaxMFMAPipelineWaitStates = 16;
2060   int WaitStatesSinceNeighborMFMA =
2061       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2062 
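       // Pad up to the requested fraction of the neighboring MFMA's latency,
       // e.g. a ratio of 50 with a latency of 16 pads to 8 wait states, less
       // any wait states which have already elapsed.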
2063   int NeighborMFMAPaddingNeeded =
2064       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2065       WaitStatesSinceNeighborMFMA;
2066 
2067   return std::max(0, NeighborMFMAPaddingNeeded);
2068 }
2069 
2070 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2071   int WaitStatesNeeded = 0;
2072   unsigned Opc = MI->getOpcode();
2073 
2074   auto IsVALUFn = [](const MachineInstr &MI) {
2075     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2076   };
2077 
2078   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2079     const int LegacyVALUWritesVGPRWaitStates = 2;
2080     const int VALUWritesExecWaitStates = 4;
2081     const int MaxWaitStates = 4;
2082 
2083     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2084       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2085     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2086 
2087     if (WaitStatesNeeded < MaxWaitStates) {
2088       for (const MachineOperand &Use : MI->explicit_uses()) {
2089         const int MaxWaitStates = 2;
2090 
2091         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2092           continue;
2093 
2094         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2095           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2096         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2097 
2098         if (WaitStatesNeeded == MaxWaitStates)
2099           break;
2100       }
2101     }
2102   }
2103 
2104   for (const MachineOperand &Op : MI->explicit_operands()) {
2105     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2106       continue;
2107 
2108     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2109       continue;
2110 
2111     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2112     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2113     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2114     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2115     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2116     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2117     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2118     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2119     const int MaxWaitStates = 18;
2120     Register Reg = Op.getReg();
2121     unsigned HazardDefLatency = 0;
2122 
2123     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2124                                this](const MachineInstr &MI) {
2125       if (!SIInstrInfo::isMFMA(MI))
2126         return false;
2127       Register DstReg = MI.getOperand(0).getReg();
2128       if (DstReg == Reg)
2129         return false;
2130       HazardDefLatency =
2131           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2132       return TRI.regsOverlap(DstReg, Reg);
2133     };
2134 
2135     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2136                                                    MaxWaitStates);
2137     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2138     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2139     int OpNo = Op.getOperandNo();
2140     if (OpNo == SrcCIdx) {
2141       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2142     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2143       switch (HazardDefLatency) {
2144       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2145                break;
2146       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2147                break;
2148       case 16: [[fallthrough]];
2149       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2150                break;
2151       }
2152     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2153       switch (HazardDefLatency) {
2154       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2155                break;
2156       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2157                break;
2158       case 16: [[fallthrough]];
2159       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2160                break;
2161       }
2162     }
2163 
2164     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2165     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2166 
2167     if (WaitStatesNeeded == MaxWaitStates)
2168       return WaitStatesNeeded; // Early exit.
2169 
2170     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2171       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2172         return false;
2173       Register DstReg = MI.getOperand(0).getReg();
2174       return TRI.regsOverlap(Reg, DstReg);
2175     };
2176 
2177     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2178     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2179     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2180     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2181     if (OpNo == SrcCIdx)
2182       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2183     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2184       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2185 
2186     WaitStatesNeededForUse = NeedWaitStates -
2187       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2188     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2189 
2190     if (WaitStatesNeeded == MaxWaitStates)
2191       return WaitStatesNeeded; // Early exit.
2192   }
2193 
2194   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2195     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2196     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2197     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2198     const int MaxWaitStates = 13;
2199     Register DstReg = MI->getOperand(0).getReg();
2200     unsigned HazardDefLatency = 0;
2201 
2202     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2203                          this](const MachineInstr &MI) {
2204       if (!SIInstrInfo::isMFMA(MI))
2205         return false;
2206       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2207       HazardDefLatency =
2208           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2209       return TRI.regsOverlap(Reg, DstReg);
2210     };
2211 
2212     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2213     int NeedWaitStates;
2214     switch (HazardDefLatency) {
2215     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2216              break;
2217     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2218              break;
2219     case 16: [[fallthrough]];
2220     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2221              break;
2222     }
2223 
2224     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2225     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2226   }
2227 
2228   // Pad neighboring MFMA with noops for better inter-wave performance.
2229   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2230 
2231   return WaitStatesNeeded;
2232 }
2233 
2234 static int
2235 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2236   // 2 pass -> 3
2237   // 4 pass -> 5
2238   // 8 pass -> 9
2239   // 16 pass -> 17
2240   return NumPasses + 1;
2241 }
2242 
2243 static int
2244 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2245   // 2 pass -> 2
2246   // 4 pass -> 4
2247   // 8 pass -> 8
2248   // 16 pass -> 16
2249   return NumPasses;
2250 }
2251 
2252 static int
2253 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2254   // 2 pass -> 4
2255   // 4 pass -> 6
2256   // 8 pass -> 10
2257   // 16 pass -> 18
2258   return NumPasses + 2;
2259 }
2260 
2261 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2262   // 2 pass -> 5
2263   // 4 pass -> 7
2264   // 8 pass -> 11
2265   // 16 pass -> 19
2266   return NumPasses + 3;
2267 }
2268 
2269 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2270   int WaitStatesNeeded = 0;
2271   unsigned Opc = MI->getOpcode();
2272 
2273   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2274     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2275   };
2276 
2277   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2278     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2279            !SIInstrInfo::isDOT(MI);
2280   };
2281 
2282   if (!SIInstrInfo::isMFMA(*MI))
2283     return WaitStatesNeeded;
2284 
2285   const int VALUWritesExecWaitStates = 4;
2286   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2287     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2288                           VALUWritesExecWaitStates);
2289   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2290 
2291   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2292 
2293   // This loop handles both DGEMM and S/HGEMM as the 2nd instruction.
2294   for (const MachineOperand &Use : MI->explicit_uses()) {
2295     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2296     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2297     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2298     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2299     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2300     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2301     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2302     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2303     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2304     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2305     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2306     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2307     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2308     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2309     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2310     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2311     const int MaxWaitStates = 19;
2312 
2313     if (!Use.isReg())
2314       continue;
2315     Register Reg = Use.getReg();
2316     bool FullReg;
2317     const MachineInstr *MI1;
2318 
2319     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2320                                this](const MachineInstr &MI) {
2321       if (!SIInstrInfo::isMFMA(MI))
2322         return false;
2323       Register DstReg = MI.getOperand(0).getReg();
2324       FullReg = (DstReg == Reg);
2325       MI1 = &MI;
2326       return TRI.regsOverlap(DstReg, Reg);
2327     };
2328 
2329     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2330       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2331     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2332 
2333     int NumWaitStates =
2334         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2335     if (NumWaitStates == std::numeric_limits<int>::max())
2336       continue;
2337 
2338     int OpNo = Use.getOperandNo();
2339     unsigned Opc1 = MI1->getOpcode();
2340     int NeedWaitStates = 0;
2341     if (OpNo == SrcCIdx) {
2342       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2343         NeedWaitStates = 0;
2344       } else if (FullReg) {
2345         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2346              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2347             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2348              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2349           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2350         else if (ST.hasGFX940Insts() &&
2351                  TSchedModel.computeInstrLatency(MI1) == 2)
2352           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2353       } else {
2354         switch (Opc1) {
2355         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2356         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2357         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2358         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2359           if (!isXDL(ST, *MI))
2360             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2361           break;
2362         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2363         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2364           if (!isXDL(ST, *MI))
2365             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2366           break;
2367         default:
2368           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2369           if (ST.hasGFX940Insts()) {
2370             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2371               break;
2372 
2373             NeedWaitStates =
2374                 isXDL(ST, *MI1)
2375                     ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2376                           NumPasses)
2377                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2378                           NumPasses);
2379             break;
2380           }
2381 
2382           switch (NumPasses) {
2383           case 2:
2384             NeedWaitStates =
2385                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2386                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2387             break;
2388           case 8:
2389             NeedWaitStates =
2390                 isDGEMM(Opc)
2391                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2392                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2393             break;
2394           case 16:
2395             NeedWaitStates =
2396                 isDGEMM(Opc)
2397                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2398                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2399             break;
2400           default:
2401             llvm_unreachable("unexpected number of passes");
2402           }
2403         }
2404       }
2405     } else {
2406       switch (Opc1) {
2407       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2408       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2409       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2410       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2411         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2412         break;
2413       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2414       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2415         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2416         break;
2417       default:
2418         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2419 
2420         if (ST.hasGFX940Insts()) {
2421           NeedWaitStates =
2422               isXDL(ST, *MI1)
2423                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2424                         NumPasses)
2425                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2426                         NumPasses);
2427           break;
2428         }
2429 
2430         switch (NumPasses) {
2431         case 2:
2432           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2433           break;
2434         case 4:
2435           llvm_unreachable("unexpected number of passes for mfma");
2436         case 8:
2437           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2438           break;
2439         case 16:
2440         default:
2441           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2442         }
2443       }
2444     }
2445     if (WaitStatesNeeded >= NeedWaitStates)
2446       continue;
2447 
2448     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2449     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2450 
2451     if (WaitStatesNeeded == MaxWaitStates)
2452       break;
2453   }
2454 
2455   // Pad neighboring MFMA with noops for better inter-wave performance.
2456   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2457 
2458   return WaitStatesNeeded;
2459 }
2460 
2461 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2462   // On gfx90a+, relevant hazards are checked in checkMAIVALUHazards().
2463   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2464     return 0;
2465 
2466   int WaitStatesNeeded = 0;
2467 
2468   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2469     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2470   };
2471 
2472   for (const MachineOperand &Op : MI->explicit_uses()) {
2473     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2474       continue;
2475 
2476     Register Reg = Op.getReg();
2477 
2478     const int AccVgprReadLdStWaitStates = 2;
2479     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2480     const int MaxWaitStates = 2;
2481 
2482     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2483       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2484     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2485 
2486     if (WaitStatesNeeded == MaxWaitStates)
2487       return WaitStatesNeeded; // Early exit.
2488 
2489     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2490       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2491           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2492         return false;
2493       auto IsVALUFn = [](const MachineInstr &MI) {
2494         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2495       };
2496       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2497              std::numeric_limits<int>::max();
2498     };
2499 
2500     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2501       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2502     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2503   }
2504 
2505   return WaitStatesNeeded;
2506 }
2507 
2508 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2509   // 2 pass -> 4
2510   // 4 pass -> 6
2511   // 8 pass -> 10
2512   // 16 pass -> 18
2513   return NumPasses + 2;
2514 }
2515 
2516 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2517   // 2 pass -> 5
2518   // 4 pass -> 7
2519   // 8 pass -> 11
2520   // 16 pass -> 19
2521   return NumPasses + 3;
2522 }
2523 
2524 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2525   // 2 pass -> 5
2526   // 4 pass -> 7
2527   // 8 pass -> 11
2528   // 16 pass -> 19
2529   return NumPasses + 3;
2530 }
2531 
2532 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2533   // 2 pass -> 4
2534   // 4 pass -> 6
2535   // 8 pass -> 10
2536   // 16 pass -> 18
2537   return NumPasses + 2;
2538 }
2539 
2540 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2541   if (!ST.hasGFX90AInsts())
2542     return 0;
2543 
2544   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2545     return isDGEMM(MI.getOpcode());
2546   };
2547 
2548   // This is checked in checkMAIHazards90A()
2549   if (SIInstrInfo::isMFMA(*MI))
2550     return 0;
2551 
2552   const MachineRegisterInfo &MRI = MF.getRegInfo();
2553 
2554   int WaitStatesNeeded = 0;
2555 
2556   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2557                SIInstrInfo::isFLAT(*MI) ||
2558                SIInstrInfo::isDS(*MI);
2559   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2560   bool IsVALU = SIInstrInfo::isVALU(*MI);
2561 
2562   const MachineInstr *MFMA = nullptr;
2563   unsigned Reg;
2564   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2565     if (!SIInstrInfo::isMFMA(MI) ||
2566         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2567       return false;
2568     MFMA = &MI;
2569     return true;
2570   };
2571 
2572   const MachineInstr *DOT = nullptr;
2573   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2574     if (!SIInstrInfo::isDOT(MI) ||
2575         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2576       return false;
2577     DOT = &MI;
2578     return true;
2579   };
2580 
2581   bool DGEMMAfterVALUWrite = false;
2582   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2583     // Found DGEMM on reverse traversal to def.
2584     if (isDGEMM(MI.getOpcode()))
2585       DGEMMAfterVALUWrite = true;
2586 
2587     // Only a hazard if the register is defined by a VALU and a DGEMM is
2588     // found after the def.
2589     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2590       return false;
2591 
2592     return true;
2593   };
2594 
2595   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2596                                            AMDGPU::OpName::src2);
2597 
2598   if (IsMemOrExport || IsVALU) {
2599     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2600     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2601     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2602     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2603     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2604     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2605     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2606     const int DotWriteSameDotReadSrcAB = 3;
2607     const int DotWriteDifferentVALURead = 3;
2608     const int DMFMABetweenVALUWriteVMEMRead = 2;
2609     const int MaxWaitStates = 19;
2610 
2611     for (const MachineOperand &Use : MI->explicit_uses()) {
2612       if (!Use.isReg())
2613         continue;
2614       Reg = Use.getReg();
2615 
2616       DOT = nullptr;
2617       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2618                                                      MaxWaitStates);
2619       if (DOT) {
2620         int NeedWaitStates = 0;
2621         if (DOT->getOpcode() == MI->getOpcode()) {
2622           if (&Use - &MI->getOperand(0) != SrcCIdx)
2623             NeedWaitStates = DotWriteSameDotReadSrcAB;
2624         } else {
2625           NeedWaitStates = DotWriteDifferentVALURead;
2626         }
2627 
2628         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2629         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2630       }
2631 
2632       // Workaround for a HW data hazard bug observed only in GFX90A. When there
2633       // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2634       // causes the SQ to incorrectly omit the two wait states between the two
2635       // instructions that are needed to avoid the data hazard.
2636       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2637         DGEMMAfterVALUWrite = false;
2638         if (TRI.isVectorRegister(MRI, Reg)) {
2639           int WaitStatesNeededForUse =
2640                 DMFMABetweenVALUWriteVMEMRead -
2641                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2642                                       DMFMABetweenVALUWriteVMEMRead);
2643 
2644           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2645         }
2646       }
2647 
2648       MFMA = nullptr;
2649       WaitStatesSinceDef =
2650           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2651       if (!MFMA)
2652         continue;
2653 
2654       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2655       int NumPasses = HazardDefLatency;
2656       int NeedWaitStates = MaxWaitStates;
2657 
2658       if (isDGEMM(MFMA->getOpcode())) {
2659         switch (HazardDefLatency) {
2660         case 4:
2661           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2662                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2663           break;
2664         case 8:
2665         case 16:
2666           NeedWaitStates = IsMemOrExport
2667                                ? DMFMA16x16WriteVgprMemExpReadWaitStates
2668                                : DMFMA16x16WriteVgprVALUReadWaitStates;
2669           break;
2670         default:
2671           llvm_unreachable("unexpected dgemm");
2672         }
2673       } else if (ST.hasGFX940Insts()) {
2674         NeedWaitStates =
2675             isXDL(ST, *MFMA)
2676                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2677                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2678                       NumPasses);
2679       } else {
2680         switch (HazardDefLatency) {
2681         case 2:
2682           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2683           break;
2684         case 8:
2685           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2686           break;
2687         case 16:
2688           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2689           break;
2690         default:
2691           llvm_unreachable("unexpected number of passes for mfma");
2692         }
2693       }
2694 
2695       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2696       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2697 
2698       if (WaitStatesNeeded == MaxWaitStates)
2699         break;
2700     }
2701   }
2702 
2703   unsigned Opc = MI->getOpcode();
2704   const int DMFMAToFMA64WaitStates = 2;
2705   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2706        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2707        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2708       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2709     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2710       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2711     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2712   }
2713 
2714   if (!IsVALU && !IsMemOrExport)
2715     return WaitStatesNeeded;
2716 
2717   for (const MachineOperand &Def : MI->defs()) {
2718     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2719     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2720     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2721     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2722     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2723     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2724     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2725     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2726     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2727     const int DotWriteDifferentVALUWrite = 3;
2728     const int MaxWaitStates = 19;
2729     const int MaxWarWaitStates = 15;
2730 
2731     Reg = Def.getReg();
2732 
2733     DOT = nullptr;
2734     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2735                                                    MaxWaitStates);
2736     if (DOT && DOT->getOpcode() != MI->getOpcode())
2737       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2738                                                     WaitStatesSinceDef);
2739 
2740     MFMA = nullptr;
2741     WaitStatesSinceDef =
2742         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2743     if (MFMA) {
2744       int NeedWaitStates = MaxWaitStates;
2745       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2746 
2747       if (isDGEMM(MFMA->getOpcode())) {
2748         switch (NumPasses) {
2749         case 4:
2750           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2751           break;
2752         case 8:
2753         case 16:
2754           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2755           break;
2756         default:
2757           llvm_unreachable("unexpected number of cycles for dgemm");
2758         }
2759       } else if (ST.hasGFX940Insts()) {
2760         NeedWaitStates =
2761             isXDL(ST, *MFMA)
2762                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2763                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2764       } else {
2765         switch (NumPasses) {
2766         case 2:
2767           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2768           break;
2769         case 8:
2770           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2771           break;
2772         case 16:
2773           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2774           break;
2775         default:
2776           llvm_unreachable("Unexpected number of passes for mfma");
2777         }
2778       }
2779 
2780       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2781       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2782 
2783       if (WaitStatesNeeded == MaxWaitStates)
2784         break;
2785     }
2786 
2787     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2788       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2789           !MI.readsRegister(Reg, &TRI))
2790         return false;
2791 
2792       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2793         return false;
2794 
2795       const MachineOperand *SrcC =
2796           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2797       assert(SrcC);
2798       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2799         return false;
2800 
2801       MFMA = &MI;
2802       return true;
2803     };
2804 
2805     MFMA = nullptr;
2806     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2807                                                 MaxWarWaitStates);
2808     if (!MFMA)
2809       continue;
2810 
2811     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2812     int NeedWaitStates = MaxWaitStates;
2813     switch (HazardDefLatency) {
2814     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2815              break;
2816     case 4:  assert(ST.hasGFX940Insts());
2817              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2818              break;
2819     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2820              break;
2821     case 16: [[fallthrough]];
2822     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2823              break;
2824     }
2825 
2826     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2827     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2828   }
2829 
2830   return WaitStatesNeeded;
2831 }
2832 
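     // Avoid scheduling an MFMA while a previously issued MFMA may still be in
     // flight, i.e. when the distance to it is less than its pipeline latency.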
2833 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2834   if (!SU->isInstr())
2835     return false;
2836 
2837   const MachineInstr *MAI = nullptr;
2838 
2839   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2840     MAI = nullptr;
2841     if (SIInstrInfo::isMFMA(MI))
2842       MAI = &MI;
2843     return MAI != nullptr;
2844   };
2845 
2846   MachineInstr *MI = SU->getInstr();
2847   if (IsMFMAFn(*MI)) {
2848     int W = getWaitStatesSince(IsMFMAFn, 16);
2849     if (MAI)
2850       return W < (int)TSchedModel.computeInstrLatency(MAI);
2851   }
2852 
2853   return false;
2854 }
2855 
2856 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2857 // insertion of a new instruction.
2858 static void updateGetPCBundle(MachineInstr *NewMI) {
2859   if (!NewMI->isBundled())
2860     return;
2861 
2862   // Find start of bundle.
2863   auto I = NewMI->getIterator();
2864   while (I->isBundledWithPred())
2865     I--;
2866   if (I->isBundle())
2867     I++;
2868 
2869   // Bail if this is not an S_GETPC bundle.
2870   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2871     return;
2872 
2873   // Update offsets of any references in the bundle.
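       // Each inserted S_WAITCNT_DEPCTR is 4 bytes long.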
2874   const unsigned NewBytes = 4;
2875   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2876          "Unexpected instruction insertion in bundle");
2877   auto NextMI = std::next(NewMI->getIterator());
2878   auto End = NewMI->getParent()->end();
2879   while (NextMI != End && NextMI->isBundledWithPred()) {
2880     for (auto &Operand : NextMI->operands()) {
2881       if (Operand.isGlobal())
2882         Operand.setOffset(Operand.getOffset() + NewBytes);
2883     }
2884     NextMI++;
2885   }
2886 }
2887 
2888 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2889   if (!ST.hasVALUMaskWriteHazard())
2890     return false;
2891   assert(!ST.hasExtendedWaitCounts());
2892 
2893   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2894     return false;
2895 
2896   // The hazard sequence is three instructions:
2897   //   1. VALU reads SGPR as mask
2898   //   2. SALU writes SGPR
2899   //   3. SALU reads SGPR
2900   // The hazard can expire if the distance between 2 and 3 is sufficient.
2901   // In practice this happens <10% of the time, hence this always assumes
2902   // the hazard exists if 1 and 2 are present to avoid searching.
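       //
       // A hypothetical sequence matching steps 1 and 2 (illustrative only):
       //   v_cndmask_b32 v0, v1, v2, s[0:1]   ; 1. VALU reads SGPR pair as mask
       //   s_mov_b64 s[0:1], exec             ; 2. SALU writes the SGPR pair
       //   s_and_b64 s[2:3], s[0:1], s[4:5]   ; 3. SALU reads the SGPR pair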
2903 
2904   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2905   if (!SDSTOp || !SDSTOp->isReg())
2906     return false;
2907 
2908   const Register HazardReg = SDSTOp->getReg();
2909   if (HazardReg == AMDGPU::EXEC ||
2910       HazardReg == AMDGPU::EXEC_LO ||
2911       HazardReg == AMDGPU::EXEC_HI ||
2912       HazardReg == AMDGPU::M0)
2913     return false;
2914 
2915   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2916     switch (I.getOpcode()) {
2917     case AMDGPU::V_ADDC_U32_e32:
2918     case AMDGPU::V_ADDC_U32_dpp:
2919     case AMDGPU::V_CNDMASK_B16_e32:
2920     case AMDGPU::V_CNDMASK_B16_dpp:
2921     case AMDGPU::V_CNDMASK_B32_e32:
2922     case AMDGPU::V_CNDMASK_B32_dpp:
2923     case AMDGPU::V_DIV_FMAS_F32_e64:
2924     case AMDGPU::V_DIV_FMAS_F64_e64:
2925     case AMDGPU::V_SUBB_U32_e32:
2926     case AMDGPU::V_SUBB_U32_dpp:
2927     case AMDGPU::V_SUBBREV_U32_e32:
2928     case AMDGPU::V_SUBBREV_U32_dpp:
2929       // These implicitly read VCC as mask source.
2930       return HazardReg == AMDGPU::VCC ||
2931              HazardReg == AMDGPU::VCC_LO ||
2932              HazardReg == AMDGPU::VCC_HI;
2933     case AMDGPU::V_ADDC_U32_e64:
2934     case AMDGPU::V_ADDC_U32_e64_dpp:
2935     case AMDGPU::V_CNDMASK_B16_e64:
2936     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2937     case AMDGPU::V_CNDMASK_B32_e64:
2938     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2939     case AMDGPU::V_SUBB_U32_e64:
2940     case AMDGPU::V_SUBB_U32_e64_dpp:
2941     case AMDGPU::V_SUBBREV_U32_e64:
2942     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2943       // Only check mask register overlaps.
2944       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2945       assert(SSRCOp);
2946       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2947     }
2948     default:
2949       return false;
2950     }
2951   };
2952 
2953   const MachineRegisterInfo &MRI = MF.getRegInfo();
2954   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2955     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2956     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2957         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2958       return true;
2959 
2960     // VALU access to any SGPR or literal constant other than HazardReg
2961     // mitigates hazard. No need to check HazardReg here as this will
2962     // only be called when !IsHazardFn.
2963     if (!SIInstrInfo::isVALU(I))
2964       return false;
2965     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2966       const MachineOperand &Op = I.getOperand(OpNo);
2967       if (Op.isReg()) {
2968         Register OpReg = Op.getReg();
2969         // Only consider uses
2970         if (!Op.isUse())
2971           continue;
2972         // Ignore EXEC
2973         if (OpReg == AMDGPU::EXEC ||
2974             OpReg == AMDGPU::EXEC_LO ||
2975             OpReg == AMDGPU::EXEC_HI)
2976           continue;
2977         // Ignore all implicit uses except VCC
2978         if (Op.isImplicit()) {
2979           if (OpReg == AMDGPU::VCC ||
2980               OpReg == AMDGPU::VCC_LO ||
2981               OpReg == AMDGPU::VCC_HI)
2982             return true;
2983           continue;
2984         }
2985         if (TRI.isSGPRReg(MRI, OpReg))
2986           return true;
2987       } else {
2988         const MCInstrDesc &InstDesc = I.getDesc();
2989         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2990         if (!TII.isInlineConstant(Op, OpInfo))
2991           return true;
2992       }
2993     }
2994     return false;
2995   };
2996 
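  // Illustrative sketch: a VALU such as "v_add_f32 v0, s10, v1" (reading any
  // SGPR, or a non-inline literal, other than HazardReg) would satisfy
  // IsExpiredFn and terminate the backward search below.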
2997   // Check for hazard
2998   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2999       std::numeric_limits<int>::max())
3000     return false;
3001 
3002   auto NextMI = std::next(MI->getIterator());
3003 
3004   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3005   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3006                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3007                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3008 
3009   // SALU write may be s_getpc in a bundle.
3010   updateGetPCBundle(NewMI);
3011 
3012   return true;
3013 }
3014 
3015 // Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3016 // e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
3017 static std::optional<unsigned> sgprPairNumber(Register Reg,
3018                                               const SIRegisterInfo &TRI) {
3019   switch (Reg) {
3020   case AMDGPU::M0:
3021   case AMDGPU::EXEC:
3022   case AMDGPU::EXEC_LO:
3023   case AMDGPU::EXEC_HI:
3024   case AMDGPU::SGPR_NULL:
3025   case AMDGPU::SGPR_NULL64:
3026     return {};
3027   default:
3028     break;
3029   }
3030   unsigned RegN = TRI.getEncodingValue(Reg);
3031   if (RegN > 127)
3032     return {};
3033   return (RegN >> 1) & 0x3f;
3034 }
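// For example (standard SGPR encodings): sgprPairNumber returns 1 for both
// SGPR2 and SGPR3, since encodings 2 and 3 share the pair SGPR2_SGPR3, so
// accesses to either half collapse onto a single pair ID.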
3035 
3036 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3037 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3038   assert(MMF == &MF);
3039 
3040   // Assume non-empty vector means it has already been computed.
3041   if (!VALUReadHazardSGPRs.empty())
3042     return;
3043 
3044   auto CallingConv = MF.getFunction().getCallingConv();
3045   bool IsCallFree =
3046       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3047 
3048   // Exhaustive search is only viable in functions which neither make calls
3049   // nor are callees, where VALUs will be exposed to the hazard recognizer.
3050   UseVALUReadHazardExhaustiveSearch =
3051       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3052       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3053 
3054   // Consider all SGPRs hazardous if the shader uses function calls or is a callee.
3055   bool UseVALUUseCache =
3056       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3057   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3058   if (!UseVALUUseCache)
3059     return;
3060 
3061   // Perform a post-order traversal, scanning each block's instructions in
3062   // reverse, to find VALUs which read an SGPR before a SALU write to the
3063   // same SGPR.  Compared to a linear scan, this reduces hazard insertion
3064   // when all VALU accesses to an SGPR occur after its last SALU write.
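  // Illustrative sketch (program order, hypothetical registers):
  //   s_mov_b32 s0, 0          ; last SALU write to s0
  //   v_add_f32 v0, s0, v1     ; VALU read of s0
  // The reverse scan visits the VALU read while SALUWriteSGPRs[pair(s0)] is
  // still clear, so the pair is not marked hazardous; marking happens only
  // when both a SALU write and a further read of the pair follow the VALU
  // read in program order, matching steps (2) and (3) of the hazard.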
3065   const MachineRegisterInfo &MRI = MF.getRegInfo();
3066   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3067   MachineCycleInfo CI;
3068   CI.compute(*MMF);
3069 
3070   for (auto *MBB : post_order(&MF)) {
3071     bool InCycle = CI.getCycle(MBB) != nullptr;
3072     for (auto &MI : reverse(MBB->instrs())) {
3073       bool IsVALU = SIInstrInfo::isVALU(MI);
3074       bool IsSALU = SIInstrInfo::isSALU(MI);
3075       if (!IsVALU && !IsSALU)
3076         continue;
3077 
3078       for (const MachineOperand &Op : MI.operands()) {
3079         if (!Op.isReg())
3080           continue;
3081         Register Reg = Op.getReg();
3082         assert(!Op.getSubReg());
3083         // Only consider implicit operands of VCC.
3084         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3085                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3086           continue;
3087         if (!TRI.isSGPRReg(MRI, Reg))
3088           continue;
3089         auto RegN = sgprPairNumber(Reg, TRI);
3090         if (!RegN)
3091           continue;
3092         if (IsVALU && Op.isUse()) {
3093           // Note: any access within a cycle must be considered a hazard.
3094           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3095             VALUReadHazardSGPRs.set(*RegN);
3096           ReadSGPRs.set(*RegN);
3097         } else if (IsSALU) {
3098           if (Op.isDef())
3099             SALUWriteSGPRs.set(*RegN);
3100           else
3101             ReadSGPRs.set(*RegN);
3102         }
3103       }
3104     }
3105   }
3106 }
3107 
3108 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3109   if (!ST.hasVALUReadSGPRHazard())
3110     return false;
3111 
3112   // The hazard sequence is fundamentally three instructions:
3113   //   1. VALU reads SGPR
3114   //   2. SALU writes SGPR
3115   //   3. VALU/SALU reads SGPR
3116   // Try to avoid searching for (1) because the expiry point of the hazard is
3117   // indeterminate; however, the hazard between (2) and (3) can expire if the
3118   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3119   // Note: SGPRs must be considered as 64-bit pairs, as the hazard exists
3120   // even if only one 32-bit half of a pair is accessed.
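  // Illustrative sketch (hypothetical registers):
  //   v_add_f32 v0, s4, v1     ; (1) VALU reads s4        (pair s[4:5])
  //   s_mov_b32 s5, 0          ; (2) SALU writes s5       (same pair)
  //   s_add_u32 s6, s5, 1      ; (3) SALU reads s5
  // Although (1) and (2) touch different halves of s[4:5], pairing makes
  // this a hazard; the fix inserts s_wait_alu sa_sdst(0) before (3).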
3121 
3122   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3123   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3124   if (!(MIIsSALU || MIIsVALU))
3125     return false;
3126 
3127   // Avoid the expensive search when compile time is the priority by
3128   // mitigating every SALU which writes an SGPR.
3129   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3130     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3131       return false;
3132 
3133     const MachineOperand *SDSTOp =
3134         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3135     if (!SDSTOp || !SDSTOp->isReg())
3136       return false;
3137 
3138     const Register HazardReg = SDSTOp->getReg();
3139     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3140         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3141       return false;
3142 
3143     // Add s_wait_alu sa_sdst(0) after SALU write.
3144     auto NextMI = std::next(MI->getIterator());
3145     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3146                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3147                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3148 
3149     // SALU write may be s_getpc in a bundle.
3150     updateGetPCBundle(NewMI);
3151 
3152     return true;
3153   }
3154 
3155   // Pre-compute set of SGPR pairs read by VALUs.
3156   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3157   computeVALUHazardSGPRs(MI->getMF());
3158 
3159   // If no VALU-read hazard SGPRs exist then there is nothing to do.
3160   if (VALUReadHazardSGPRs.none())
3161     return false;
3162 
3163   // All SGPR writes before a call/return must be flushed as the callee/caller
3164   // will not see the hazard chain, i.e. (2) to (3) described above.
3165   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3166                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3167                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3168 
3169   // Collect all SGPR sources for MI which are read by a VALU.
3170   const MachineRegisterInfo &MRI = MF.getRegInfo();
3171   SmallSet<Register, 4> SGPRsUsed;
3172 
3173   if (!IsSetPC) {
3174     for (const MachineOperand &Op : MI->all_uses()) {
3175       Register OpReg = Op.getReg();
3176 
3177       // Only consider VCC implicit uses on VALUs.
3178       // The only expected SALU implicit access is SCC which is no hazard.
3179       if (MIIsSALU && Op.isImplicit())
3180         continue;
3181 
3182       if (!TRI.isSGPRReg(MRI, OpReg))
3183         continue;
3184 
3185       auto RegN = sgprPairNumber(OpReg, TRI);
3186       if (!RegN)
3187         continue;
3188 
3189       if (!VALUReadHazardSGPRs[*RegN])
3190         continue;
3191 
3192       SGPRsUsed.insert(OpReg);
3193     }
3194 
3195     // No SGPRs -> nothing to do.
3196     if (SGPRsUsed.empty())
3197       return false;
3198   }
3199 
3200   // A hazard is any SALU which writes one of the SGPRs read by MI.
3201   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3202     if (!SIInstrInfo::isSALU(I))
3203       return false;
3204     // Ensure SGPR flush before call/return by conservatively assuming every
3205     // SALU writes an SGPR.
3206     if (IsSetPC && I.getNumDefs() > 0)
3207       return true;
3208     // Check for any register writes.
3209     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3210       return I.modifiesRegister(Reg, &TRI);
3211     });
3212   };
3213 
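  // Number of unrelated SALU wait states required for the hazard to expire;
  // one more when the consuming instruction is a VALU rather than a SALU.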
3214   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3215   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3216     if (Count >= SALUExpiryCount)
3217       return true;
3218     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3219     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3220         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3221       return true;
3222     return false;
3223   };
3224 
3225   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3226     // Only count true SALUs as wait states.
3227     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3228       return 0;
3229     // SALU must be unrelated to any hazard registers.
3230     if (any_of(SGPRsUsed,
3231                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3232       return 0;
3233     return 1;
3234   };
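  // Net effect: only true SALUs (not SOPP) which do not read the tracked
  // SGPRs count toward expiry; a SALU write to a tracked SGPR is the hazard
  // itself and is caught by IsHazardFn before expiry is reached.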
3235 
3236   // Check for the hazard.
3237   DenseSet<const MachineBasicBlock *> Visited;
3238   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3239                                         std::next(MI->getReverseIterator()), 0,
3240                                         IsExpiredFn, Visited, WaitStatesFn);
3241 
3242   if (WaitStates >= SALUExpiryCount)
3243     return false;
3244 
3245   // Validate hazard through an exhaustive search.
3246   if (UseVALUReadHazardExhaustiveSearch) {
3247     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3248     // This is searching for (1) in the hazard description.
3249     auto hazardPair = [this](Register Reg) {
3250       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3251         return Register(AMDGPU::VCC);
3252       auto RegN = sgprPairNumber(Reg, TRI);
3253       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3254     };
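    // Note: hazardPair assumes the 64-bit SGPR pair registers are laid out
    // consecutively in the generated register enum, so SGPR0_SGPR1 + N
    // selects pair N; all VCC aliases collapse to VCC.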
3255     auto SearchHazardFn = [this, hazardPair,
3256                            &SGPRsUsed](const MachineInstr &I) {
3257       if (!SIInstrInfo::isVALU(I))
3258         return false;
3259       // Check for any register reads.
3260       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3261         return I.readsRegister(hazardPair(Reg), &TRI);
3262       });
3263     };
3264     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3265       return false;
3266     };
3267     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3268         std::numeric_limits<int>::max())
3269       return false;
3270   }
3271 
3272   // Add s_wait_alu sa_sdst(0) before SALU read.
3273   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3274                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3275                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3276 
3277   // SALU read may be after s_getpc in a bundle.
3278   updateGetPCBundle(NewMI);
3279 
3280   return true;
3281 }
3282 
3283 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3284                                const SIInstrInfo &TII) {
3285   MachineBasicBlock &EntryMBB = MF->front();
3286   if (EntryMBB.begin() != EntryMBB.end()) {
3287     auto &EntryMI = *EntryMBB.begin();
3288     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3289         EntryMI.getOperand(0).getImm() >= Priority)
3290       return false;
3291   }
3292 
3293   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3294       .addImm(Priority);
3295   return true;
3296 }
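// Note: idempotent by design; an existing S_SETPRIO at function entry with an
// equal or higher priority suppresses insertion of a new one.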
3297 
3298 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3299   if (!ST.hasRequiredExportPriority())
3300     return false;
3301 
3302   // Assume the following shader types will never have exports,
3303   // and avoid adding or adjusting S_SETPRIO.
3304   MachineBasicBlock *MBB = MI->getParent();
3305   MachineFunction *MF = MBB->getParent();
3306   auto CC = MF->getFunction().getCallingConv();
3307   switch (CC) {
3308   case CallingConv::AMDGPU_CS:
3309   case CallingConv::AMDGPU_CS_Chain:
3310   case CallingConv::AMDGPU_CS_ChainPreserve:
3311   case CallingConv::AMDGPU_KERNEL:
3312     return false;
3313   default:
3314     break;
3315   }
3316 
3317   const int MaxPriority = 3;
3318   const int NormalPriority = 2;
3319   const int PostExportPriority = 0;
3320 
3321   auto It = MI->getIterator();
3322   switch (MI->getOpcode()) {
3323   case AMDGPU::S_ENDPGM:
3324   case AMDGPU::S_ENDPGM_SAVED:
3325   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3326   case AMDGPU::SI_RETURN_TO_EPILOG:
3327     // Ensure shader with calls raises priority at entry.
3328     // This ensures correct priority if exports exist in callee.
3329     if (MF->getFrameInfo().hasCalls())
3330       return ensureEntrySetPrio(MF, NormalPriority, TII);
3331     return false;
3332   case AMDGPU::S_SETPRIO: {
3333     // Raise minimum priority unless in workaround.
3334     auto &PrioOp = MI->getOperand(0);
3335     int Prio = PrioOp.getImm();
3336     bool InWA = (Prio == PostExportPriority) &&
3337                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3338     if (InWA || Prio >= NormalPriority)
3339       return false;
3340     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3341     return true;
3342   }
3343   default:
3344     if (!TII.isEXP(*MI))
3345       return false;
3346     break;
3347   }
3348 
3349   // Check entry priority at each export (as there will only be a few).
3350   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3351   bool Changed = false;
3352   if (CC != CallingConv::AMDGPU_Gfx)
3353     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3354 
3355   auto NextMI = std::next(It);
3356   bool EndOfShader = false;
3357   if (NextMI != MBB->end()) {
3358     // Only need WA at end of sequence of exports.
3359     if (TII.isEXP(*NextMI))
3360       return Changed;
3361     // Assume appropriate S_SETPRIO after export means WA already applied.
3362     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3363         NextMI->getOperand(0).getImm() == PostExportPriority)
3364       return Changed;
3365     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3366   }
3367 
3368   const DebugLoc &DL = MI->getDebugLoc();
3369 
3370   // Lower priority.
3371   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3372       .addImm(PostExportPriority);
3373 
3374   if (!EndOfShader) {
3375     // Wait for exports to complete.
3376     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3377         .addReg(AMDGPU::SGPR_NULL)
3378         .addImm(0);
3379   }
3380 
3381   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3382   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3383 
3384   if (!EndOfShader) {
3385     // Return to normal (higher) priority.
3386     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3387         .addImm(NormalPriority);
3388   }
3389 
3390   return true;
3391 }
3392