1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/ScheduleDAG.h"
21 #include "llvm/TargetParser/TargetParser.h"
22 
23 using namespace llvm;
24 
25 namespace {
26 
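// Parser for -amdgpu-mfma-padding-ratio: accepts an unsigned percentage and
// rejects values outside [0, 100].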
27 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29 
30   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31     if (Arg.getAsInteger(0, Value))
32       return O.error("'" + Arg + "' value invalid for uint argument!");
33 
34     if (Value > 100)
35       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36 
37     return false;
38   }
39 };
40 
41 } // end anonymous namespace
42 
43 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45                      cl::desc("Fill a percentage of the latency between "
46                               "neighboring MFMA with s_nops."));
47 
48 static cl::opt<unsigned> MaxExhaustiveHazardSearch(
49     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50     cl::desc("Maximum function size for exhaustive hazard search"));
51 
52 //===----------------------------------------------------------------------===//
53 // Hazard Recognizer Implementation
54 //===----------------------------------------------------------------------===//
55 
56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57                                                  const GCNSubtarget &ST);
58 
59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60     : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61       ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62       TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63       UseVALUReadHazardExhaustiveSearch(false),
64       ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
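  // Functions that use AGPRs (MFMA code) are subject to the longest hazards,
  // so keep a deeper history for them; 5 wait states of history are enough
  // otherwise.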
65   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67 }
68 
69 void GCNHazardRecognizer::Reset() {
70   EmittedInstrs.clear();
71 }
72 
73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74   EmitInstruction(SU->getInstr());
75 }
76 
77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78   CurrCycleInstr = MI;
79 }
80 
81 static bool isDivFMas(unsigned Opcode) {
82   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83 }
84 
85 static bool isSGetReg(unsigned Opcode) {
86   return Opcode == AMDGPU::S_GETREG_B32;
87 }
88 
89 static bool isSSetReg(unsigned Opcode) {
90   switch (Opcode) {
91   case AMDGPU::S_SETREG_B32:
92   case AMDGPU::S_SETREG_B32_mode:
93   case AMDGPU::S_SETREG_IMM32_B32:
94   case AMDGPU::S_SETREG_IMM32_B32_mode:
95     return true;
96   }
97   return false;
98 }
99 
100 static bool isRWLane(unsigned Opcode) {
101   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102 }
103 
104 static bool isRFE(unsigned Opcode) {
105   return Opcode == AMDGPU::S_RFE_B64;
106 }
107 
108 static bool isSMovRel(unsigned Opcode) {
109   switch (Opcode) {
110   case AMDGPU::S_MOVRELS_B32:
111   case AMDGPU::S_MOVRELS_B64:
112   case AMDGPU::S_MOVRELD_B32:
113   case AMDGPU::S_MOVRELD_B64:
114     return true;
115   default:
116     return false;
117   }
118 }
119 
120 static bool isDGEMM(unsigned Opcode) {
121   return AMDGPU::getMAIIsDGEMM(Opcode);
122 }
123 
124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125   unsigned Opcode = MI.getOpcode();
126 
127   if (!SIInstrInfo::isMAI(MI) ||
128       isDGEMM(Opcode) ||
129       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131     return false;
132 
133   if (!ST.hasGFX940Insts())
134     return true;
135 
136   return AMDGPU::getMAIIsGFX940XDL(Opcode);
137 }
138 
139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140                                     const MachineInstr &MI) {
141   if (TII.isAlwaysGDS(MI.getOpcode()))
142     return true;
143 
144   switch (MI.getOpcode()) {
145   case AMDGPU::S_SENDMSG:
146   case AMDGPU::S_SENDMSGHALT:
147   case AMDGPU::S_TTRACEDATA:
148     return true;
149   // These DS opcodes don't support GDS.
150   case AMDGPU::DS_NOP:
151   case AMDGPU::DS_PERMUTE_B32:
152   case AMDGPU::DS_BPERMUTE_B32:
153     return false;
154   default:
155     if (TII.isDS(MI.getOpcode())) {
156       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157                                            AMDGPU::OpName::gds);
158       if (MI.getOperand(GDS).getImm())
159         return true;
160     }
161     return false;
162   }
163 }
164 
165 static bool isPermlane(const MachineInstr &MI) {
166   unsigned Opcode = MI.getOpcode();
167   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168          Opcode == AMDGPU::V_PERMLANE64_B32 ||
169          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
172          Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
173          Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
174          Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
175          Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
176 }
177 
178 static bool isLdsDma(const MachineInstr &MI) {
179   return SIInstrInfo::isVALU(MI) &&
180          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
181 }
182 
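// Decode the hardware register id from the simm16 operand of an
// S_GETREG/S_SETREG instruction.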
183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185                                                      AMDGPU::OpName::simm16);
186   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187 }
188 
189 ScheduleHazardRecognizer::HazardType
190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
191   MachineInstr *MI = SU->getInstr();
192   // If we are not in "HazardRecognizerMode" and therefore not being run from
193   // the scheduler, track possible stalls from hazards but don't insert noops.
194   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195 
196   if (MI->isBundle())
197     return NoHazard;
198 
199   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
200     return HazardType;
201 
202   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
203     return HazardType;
204 
205   if (checkFPAtomicToDenormModeHazard(MI) > 0)
206     return HazardType;
207 
208   if (ST.hasNoDataDepHazard())
209     return NoHazard;
210 
211   // FIXME: Should flat be considered vmem?
212   if ((SIInstrInfo::isVMEM(*MI) ||
213        SIInstrInfo::isFLAT(*MI))
214       && checkVMEMHazards(MI) > 0)
215     return HazardType;
216 
217   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
218     return HazardType;
219 
220   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
221     return HazardType;
222 
223   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
224     return HazardType;
225 
226   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
227     return HazardType;
228 
229   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
230        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
231        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
232     return HazardType;
233 
234   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
235     return HazardType;
236 
237   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
238     return HazardType;
239 
240   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
241     return HazardType;
242 
243   if (((ST.hasReadM0MovRelInterpHazard() &&
244         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
245          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
246          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
247        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
248        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
249        (ST.hasReadM0LdsDirectHazard() &&
250         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
251       checkReadM0Hazards(MI) > 0)
252     return HazardType;
253 
254   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
255     return HazardType;
256 
257   if ((SIInstrInfo::isVMEM(*MI) ||
258        SIInstrInfo::isFLAT(*MI) ||
259        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
260     return HazardType;
261 
262   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
263     return HazardType;
264 
265   return NoHazard;
266 }
267 
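// Cover Quantity wait states with S_NOPs inside the bundle; a single S_NOP
// covers at most 8 wait states (its immediate encodes the count minus one).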
268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
269                                 unsigned Quantity) {
270   while (Quantity > 0) {
271     unsigned Arg = std::min(Quantity, 8u);
272     Quantity -= Arg;
273     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
274         .addImm(Arg - 1);
275   }
276 }
277 
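// The MFMA pipeline latency is taken from the scheduling model: the
// ReleaseAtCycle of the first WriteProcRes entry of MI's scheduling class.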
278 unsigned
279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
280   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
281   assert(TSchedModel.getWriteProcResBegin(SC) !=
282          TSchedModel.getWriteProcResEnd(SC));
283   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
284 }
285 
286 void GCNHazardRecognizer::processBundle() {
287   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
288   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
289   // Check bundled MachineInstr's for hazards.
290   for (; MI != E && MI->isInsideBundle(); ++MI) {
291     CurrCycleInstr = &*MI;
292     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
293 
294     if (IsHazardRecognizerMode) {
295       fixHazards(CurrCycleInstr);
296 
297       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
298     }
299 
300     // It's unnecessary to track more than MaxLookAhead instructions. Since we
301     // include the bundled MI directly after, only add a maximum of
302     // (MaxLookAhead - 1) noops to EmittedInstrs.
303     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
304       EmittedInstrs.push_front(nullptr);
305 
306     EmittedInstrs.push_front(CurrCycleInstr);
307     EmittedInstrs.resize(MaxLookAhead);
308   }
309   CurrCycleInstr = nullptr;
310 }
311 
312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
313   assert(IsHazardRecognizerMode);
314 
315   unsigned NumPreNoops = PreEmitNoops(MI);
316   EmitNoops(NumPreNoops);
317   if (MI->isInsideBundle())
318     insertNoopsInBundle(MI, TII, NumPreNoops);
319   else
320     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
321                     NumPreNoops);
322   EmitInstruction(MI);
323   AdvanceCycle();
324 }
325 
326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
327   IsHazardRecognizerMode = true;
328   CurrCycleInstr = MI;
329   unsigned W = PreEmitNoopsCommon(MI);
330   fixHazards(MI);
331   CurrCycleInstr = nullptr;
332   return W;
333 }
334 
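// Compute the worst-case number of wait states that must elapse before MI can
// issue without triggering a hazard.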
335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
336   if (MI->isBundle())
337     return 0;
338 
339   int WaitStates = 0;
340 
341   if (SIInstrInfo::isSMRD(*MI))
342     return std::max(WaitStates, checkSMRDHazards(MI));
343 
344   if (ST.hasNSAtoVMEMBug())
345     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
346 
347   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
348 
349   if (ST.hasNoDataDepHazard())
350     return WaitStates;
351 
352   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
353     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
354 
355   if (SIInstrInfo::isVALU(*MI))
356     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
357 
358   if (SIInstrInfo::isDPP(*MI))
359     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
360 
361   if (isDivFMas(MI->getOpcode()))
362     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
363 
364   if (isRWLane(MI->getOpcode()))
365     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
366 
367   if (SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
368       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
369       SIInstrInfo::isEXP(*MI))
370     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
371 
372   if (MI->isInlineAsm())
373     return std::max(WaitStates, checkInlineAsmHazards(MI));
374 
375   if (isSGetReg(MI->getOpcode()))
376     return std::max(WaitStates, checkGetRegHazards(MI));
377 
378   if (isSSetReg(MI->getOpcode()))
379     return std::max(WaitStates, checkSetRegHazards(MI));
380 
381   if (isRFE(MI->getOpcode()))
382     return std::max(WaitStates, checkRFEHazards(MI));
383 
384   if ((ST.hasReadM0MovRelInterpHazard() &&
385        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
386         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
387         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
388       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
389       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
390       (ST.hasReadM0LdsDirectHazard() &&
391        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
392     return std::max(WaitStates, checkReadM0Hazards(MI));
393 
394   if (SIInstrInfo::isMAI(*MI))
395     return std::max(WaitStates, checkMAIHazards(MI));
396 
397   if (SIInstrInfo::isVMEM(*MI) ||
398       SIInstrInfo::isFLAT(*MI) ||
399       SIInstrInfo::isDS(*MI))
400     return std::max(WaitStates, checkMAILdStHazards(MI));
401 
402   if (ST.hasGFX950Insts() && isPermlane(*MI))
403     return std::max(WaitStates, checkPermlaneHazards(MI));
404 
405   return WaitStates;
406 }
407 
408 void GCNHazardRecognizer::EmitNoop() {
409   EmittedInstrs.push_front(nullptr);
410 }
411 
412 void GCNHazardRecognizer::AdvanceCycle() {
413   // When the scheduler detects a stall, it will call AdvanceCycle() without
414   // emitting any instructions.
415   if (!CurrCycleInstr) {
416     EmittedInstrs.push_front(nullptr);
417     return;
418   }
419 
420   if (CurrCycleInstr->isBundle()) {
421     processBundle();
422     return;
423   }
424 
425   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426   if (!NumWaitStates) {
427     CurrCycleInstr = nullptr;
428     return;
429   }
430 
431   // Keep track of emitted instructions
432   EmittedInstrs.push_front(CurrCycleInstr);
433 
434   // Add a nullptr for each additional wait state after the first.  Make sure
435   // not to add more than getMaxLookAhead() items to the list, since we
436   // truncate the list to that size right after this loop.
437   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
438        i < e; ++i) {
439     EmittedInstrs.push_front(nullptr);
440   }
441 
442   // getMaxLookahead() is the largest number of wait states we will ever need
443   // to insert, so there is no point in keeping track of more than that many
444   // wait states.
445   EmittedInstrs.resize(getMaxLookAhead());
446 
447   CurrCycleInstr = nullptr;
448 }
449 
450 void GCNHazardRecognizer::RecedeCycle() {
451   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
452 }
453 
454 //===----------------------------------------------------------------------===//
455 // Helper Functions
456 //===----------------------------------------------------------------------===//
457 
458 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
459 
460 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
461 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
462 
463 // Search for a hazard in a block and its predecessors.
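// The state is passed by value so each predecessor path is searched with its
// own copy; IsHazard reports whether the hazard was found, has expired, or the
// search should continue.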
464 template <typename StateT>
465 static bool
466 hasHazard(StateT State,
467           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
468           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
469           const MachineBasicBlock *MBB,
470           MachineBasicBlock::const_reverse_instr_iterator I,
471           DenseSet<const MachineBasicBlock *> &Visited) {
472   for (auto E = MBB->instr_rend(); I != E; ++I) {
473     // No need to look at parent BUNDLE instructions.
474     if (I->isBundle())
475       continue;
476 
477     switch (IsHazard(State, *I)) {
478     case HazardFound:
479       return true;
480     case HazardExpired:
481       return false;
482     default:
483       // Continue search
484       break;
485     }
486 
487     if (I->isInlineAsm() || I->isMetaInstruction())
488       continue;
489 
490     UpdateState(State, *I);
491   }
492 
493   for (MachineBasicBlock *Pred : MBB->predecessors()) {
494     if (!Visited.insert(Pred).second)
495       continue;
496 
497     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
498                   Visited))
499       return true;
500   }
501 
502   return false;
503 }
504 
505 // Returns the minimum number of wait states since \p I, walking all predecessors.
506 // Only scans until \p IsExpired returns true.
507 // Can only be run in hazard recognizer mode.
508 static int getWaitStatesSince(
509     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
510     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
511     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
512     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
513   for (auto E = MBB->instr_rend(); I != E; ++I) {
514     // Don't add WaitStates for parent BUNDLE instructions.
515     if (I->isBundle())
516       continue;
517 
518     if (IsHazard(*I))
519       return WaitStates;
520 
521     if (I->isInlineAsm())
522       continue;
523 
524     WaitStates += GetNumWaitStates(*I);
525 
526     if (IsExpired(*I, WaitStates))
527       return std::numeric_limits<int>::max();
528   }
529 
530   int MinWaitStates = std::numeric_limits<int>::max();
531   for (MachineBasicBlock *Pred : MBB->predecessors()) {
532     if (!Visited.insert(Pred).second)
533       continue;
534 
535     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
536                                IsExpired, Visited, GetNumWaitStates);
537 
538     MinWaitStates = std::min(MinWaitStates, W);
539   }
540 
541   return MinWaitStates;
542 }
543 
544 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
545                               const MachineInstr *MI, IsExpiredFn IsExpired) {
546   DenseSet<const MachineBasicBlock *> Visited;
547   return getWaitStatesSince(IsHazard, MI->getParent(),
548                             std::next(MI->getReverseIterator()),
549                             0, IsExpired, Visited);
550 }
551 
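// In hazard recognizer mode the MIR is walked backwards (across predecessors);
// when driven by the scheduler, the EmittedInstrs history collected so far is
// consulted instead.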
552 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
553   if (IsHazardRecognizerMode) {
554     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
555       return WaitStates >= Limit;
556     };
557     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
558   }
559 
560   int WaitStates = 0;
561   for (MachineInstr *MI : EmittedInstrs) {
562     if (MI) {
563       if (IsHazard(*MI))
564         return WaitStates;
565 
566       if (MI->isInlineAsm())
567         continue;
568     }
569     ++WaitStates;
570 
571     if (WaitStates >= Limit)
572       break;
573   }
574   return std::numeric_limits<int>::max();
575 }
576 
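// Wait states since the most recent instruction that both satisfies
// IsHazardDef and writes Reg.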
577 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
578                                                IsHazardFn IsHazardDef,
579                                                int Limit) {
580   const SIRegisterInfo *TRI = ST.getRegisterInfo();
581 
582   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
583     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
584   };
585 
586   return getWaitStatesSince(IsHazardFn, Limit);
587 }
588 
589 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
590                                                   int Limit) {
591   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
592     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
593   };
594 
595   return getWaitStatesSince(IsHazardFn, Limit);
596 }
597 
598 //===----------------------------------------------------------------------===//
599 // No-op Hazard Detection
600 //===----------------------------------------------------------------------===//
601 
602 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
603                         MCRegister Reg) {
604   for (MCRegUnit Unit : TRI.regunits(Reg))
605     BV.set(Unit);
606 }
607 
608 static void addRegsToSet(const SIRegisterInfo &TRI,
609                          iterator_range<MachineInstr::const_mop_iterator> Ops,
610                          BitVector &DefSet, BitVector &UseSet) {
611   for (const MachineOperand &Op : Ops) {
612     if (Op.isReg())
613       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
614   }
615 }
616 
617 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
618   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
619 }
620 
621 static bool breaksSMEMSoftClause(MachineInstr *MI) {
622   return !SIInstrInfo::isSMRD(*MI);
623 }
624 
625 static bool breaksVMEMSoftClause(MachineInstr *MI) {
626   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
627 }
628 
629 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
630   // SMEM soft clauses are only present on VI+, and only matter if XNACK is
631   // enabled.
632   if (!ST.isXNACKEnabled())
633     return 0;
634 
635   bool IsSMRD = TII.isSMRD(*MEM);
636 
637   resetClause();
638 
639   // A soft-clause is any group of consecutive SMEM instructions.  The
640   // instructions in this group may return out of order and/or may be
641   // replayed (i.e. the same instruction issued more than once).
642   //
643   // In order to handle these situations correctly we need to make sure that
644   // when a clause has more than one instruction, no instruction in the clause
645   // writes to a register that is read by another instruction in the clause
646   // (including itself). If we encounter this situation, we need to break the
647   // clause by inserting a non SMEM instruction.
648 
649   for (MachineInstr *MI : EmittedInstrs) {
650     // When we hit a non-SMEM instruction then we have passed the start of the
651     // clause and we can stop.
652     if (!MI)
653       break;
654 
655     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
656       break;
657 
658     addClauseInst(*MI);
659   }
660 
661   if (ClauseDefs.none())
662     return 0;
663 
664   // We need to make sure not to put loads and stores in the same clause if they
665   // use the same address. For now, just start a new clause whenever we see a
666   // store.
667   if (MEM->mayStore())
668     return 1;
669 
670   addClauseInst(*MEM);
671 
672   // If the set of defs and uses intersect then we cannot add this instruction
673   // to the clause, so we have a hazard.
674   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
675 }
676 
677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
678   int WaitStatesNeeded = 0;
679 
680   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
681 
682   // This SMRD hazard only affects SI.
683   if (!ST.hasSMRDReadVALUDefHazard())
684     return WaitStatesNeeded;
685 
686   // A read of an SGPR by SMRD instruction requires 4 wait states when the
687   // SGPR was written by a VALU instruction.
688   int SmrdSgprWaitStates = 4;
689   auto IsHazardDefFn = [this](const MachineInstr &MI) {
690     return TII.isVALU(MI);
691   };
692   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
693     return TII.isSALU(MI);
694   };
695 
696   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
697 
698   for (const MachineOperand &Use : SMRD->uses()) {
699     if (!Use.isReg())
700       continue;
701     int WaitStatesNeededForUse =
702         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
703                                                    SmrdSgprWaitStates);
704     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
705 
706     // This fixes what appears to be undocumented hardware behavior in SI where
707     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
708     // need some number of nops in between. We don't know how many we need, but
709     // let's use 4. This wasn't discovered before probably because the only
710     // case when this happens is when we expand a 64-bit pointer into a full
711     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
712     // probably never encountered in closed-source land.
713     if (IsBufferSMRD) {
714       int WaitStatesNeededForUse =
715         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
716                                                    IsBufferHazardDefFn,
717                                                    SmrdSgprWaitStates);
718       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
719     }
720   }
721 
722   return WaitStatesNeeded;
723 }
724 
725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
726   if (!ST.hasVMEMReadSGPRVALUDefHazard())
727     return 0;
728 
729   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
730 
731   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
732   // SGPR was written by a VALU Instruction.
733   const int VmemSgprWaitStates = 5;
734   auto IsHazardDefFn = [this](const MachineInstr &MI) {
735     return TII.isVALU(MI);
736   };
737   for (const MachineOperand &Use : VMEM->uses()) {
738     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
739       continue;
740 
741     int WaitStatesNeededForUse =
742         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
743                                                    VmemSgprWaitStates);
744     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
745   }
746   return WaitStatesNeeded;
747 }
748 
749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
750   const SIRegisterInfo *TRI = ST.getRegisterInfo();
751   const SIInstrInfo *TII = ST.getInstrInfo();
752 
753   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
754   int DppVgprWaitStates = 2;
755   int DppExecWaitStates = 5;
756   int WaitStatesNeeded = 0;
757   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
758     return TII->isVALU(MI);
759   };
760 
761   for (const MachineOperand &Use : DPP->uses()) {
762     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
763       continue;
764     int WaitStatesNeededForUse =
765         DppVgprWaitStates - getWaitStatesSinceDef(
766                                 Use.getReg(),
767                                 [](const MachineInstr &) { return true; },
768                                 DppVgprWaitStates);
769     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
770   }
771 
772   WaitStatesNeeded = std::max(
773       WaitStatesNeeded,
774       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
775                                                 DppExecWaitStates));
776 
777   return WaitStatesNeeded;
778 }
779 
780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
781   const SIInstrInfo *TII = ST.getInstrInfo();
782 
783   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
784   // instruction.
785   const int DivFMasWaitStates = 4;
786   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
787     return TII->isVALU(MI);
788   };
789   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
790                                                DivFMasWaitStates);
791 
792   return DivFMasWaitStates - WaitStatesNeeded;
793 }
794 
795 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
796   const SIInstrInfo *TII = ST.getInstrInfo();
797   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
798 
799   const int GetRegWaitStates = 2;
800   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
801     return GetRegHWReg == getHWReg(TII, MI);
802   };
803   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
804 
805   return GetRegWaitStates - WaitStatesNeeded;
806 }
807 
808 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
809   const SIInstrInfo *TII = ST.getInstrInfo();
810   unsigned HWReg = getHWReg(TII, *SetRegInstr);
811 
812   const int SetRegWaitStates = ST.getSetRegWaitStates();
813   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
814     return HWReg == getHWReg(TII, MI);
815   };
816   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
817   return SetRegWaitStates - WaitStatesNeeded;
818 }
819 
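// If MI is a store whose data could be clobbered by a following VALU write
// (store data wider than 64 bits), return the index of that data operand;
// otherwise return -1.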
820 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
821   if (!MI.mayStore())
822     return -1;
823 
824   const SIInstrInfo *TII = ST.getInstrInfo();
825   unsigned Opcode = MI.getOpcode();
826   const MCInstrDesc &Desc = MI.getDesc();
827 
828   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
829   int VDataRCID = -1;
830   if (VDataIdx != -1)
831     VDataRCID = Desc.operands()[VDataIdx].RegClass;
832 
833   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
834     // There is no hazard if the instruction does not use vector regs
835     // (like wbinvl1)
836     if (VDataIdx == -1)
837       return -1;
838     // For MUBUF/MTBUF instructions this hazard only exists if the
839     // instruction is not using a register in the soffset field.
840     const MachineOperand *SOffset =
841         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
842     // If we have no soffset operand, then assume this field has been
843     // hardcoded to zero.
844     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
845         (!SOffset || !SOffset->isReg()))
846       return VDataIdx;
847   }
848 
849   // MIMG instructions create a hazard if they don't use a 256-bit T# and
850   // the store size is greater than 8 bytes and they have more than two bits
851   // of their dmask set.
852   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
853   if (TII->isMIMG(MI)) {
854     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
855     assert(SRsrcIdx != -1 &&
856            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
857     (void)SRsrcIdx;
858   }
859 
860   if (TII->isFLAT(MI)) {
861     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
862     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
863       return DataIdx;
864   }
865 
866   return -1;
867 }
868 
869 int
870 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
871                                             const MachineRegisterInfo &MRI) {
872   // Helper to check for the hazard where VMEM instructions that store more than
873   // 8 bytes can have their store data overwritten by the next instruction.
874   const SIRegisterInfo *TRI = ST.getRegisterInfo();
875 
876   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
877   int WaitStatesNeeded = 0;
878 
879   if (!TRI->isVectorRegister(MRI, Def.getReg()))
880     return WaitStatesNeeded;
881   Register Reg = Def.getReg();
882   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
883     int DataIdx = createsVALUHazard(MI);
884     return DataIdx >= 0 &&
885            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
886   };
887 
888   int WaitStatesNeededForDef =
889     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
890   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
891 
892   return WaitStatesNeeded;
893 }
894 
895 /// A dest sel forwarding issue occurs if additional logic is needed to swizzle /
896 /// pack the computed value into the correct bit position of the dest register.
897 /// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
898 /// dst_sel that is not aligned to the register. This function analyzes the \p
899 /// MI and \returns an operand with a dst forwarding issue, or nullptr if
900 /// none exists.
901 static const MachineOperand *
902 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
903   if (!SIInstrInfo::isVALU(MI))
904     return nullptr;
905 
906   const SIInstrInfo *TII = ST.getInstrInfo();
907 
908   unsigned Opcode = MI.getOpcode();
909 
910   // There are three different types of instructions which produce a
911   // forwarded dest:
912   //   1. SDWA with dst_sel != DWORD,
913   //   2. VOP3 which writes the hi bits (e.g. op_sel[3] == 1), and
914   //   3. FP8DstSelInst (instructions with dest byte sel, e.g. CVT_SR_BF8_F32)
915   //      with op_sel[3:2] != 0.
916   if (SIInstrInfo::isSDWA(MI)) {
917     // Type 1: SDWA with dst_sel != DWORD
918     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
919       if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
920         return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
921   }
922 
923   AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
924   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
925     // Type 2: VOP3 which write the hi bits
926     if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
927         SISrcMods::DST_OP_SEL)
928       return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
929 
930     // Type 3: FP8DstSelInst with op_sel[3:2] != 0
931     if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
932         (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
933          SISrcMods::OP_SEL_0))
934       return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
935   }
936 
937   // Special case: nop is required for all the opsel values for fp4 sr variant
938   // cvt scale instructions
939   if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
940     return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
941 
942   return nullptr;
943 }
944 
945 /// Checks whether the provided \p MI "consumes" the operand with a Dest sel
946 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
947 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
948 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
949                                             const MachineOperand *Dst,
950                                             const SIRegisterInfo *TRI) {
951   // We must consider implicit reads of the VALU. SDWA with dst_sel and
952   // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
953   // and we must account for that hazard.
954   // We also must account for WAW hazards. In particular, WAW with dest
955   // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
956   // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
957   // check for ECC. Without accounting for this hazard, the ECC will be
958   // wrong.
959   // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
960   // complete zeroesHigh16BitsOfDest)
961   for (auto &Operand : VALU->operands()) {
962     if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
963       return true;
964     }
965   }
966   return false;
967 }
968 
969 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
970   int WaitStatesNeeded = 0;
971 
972   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
973     const int TransDefWaitstates = 1;
974 
975     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
976       if (!SIInstrInfo::isTRANS(MI))
977         return false;
978       const SIRegisterInfo *TRI = ST.getRegisterInfo();
979       const SIInstrInfo *TII = ST.getInstrInfo();
980       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
981 
982       for (const MachineOperand &Use : VALU->explicit_uses()) {
983         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
984           return true;
985       }
986 
987       return false;
988     };
989 
990     int WaitStatesNeededForDef =
991         TransDefWaitstates -
992         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
993     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
994   }
995 
996   if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
997     const int Shift16DefWaitstates = 1;
998 
999     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1000       const SIRegisterInfo *TRI = ST.getRegisterInfo();
1001       const MachineOperand *ForwardedDst =
1002           getDstSelForwardingOperand(ProducerMI, ST);
1003       if (ForwardedDst) {
1004         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1005       }
1006 
1007       if (ProducerMI.isInlineAsm()) {
1008         // Assume inline asm has dst forwarding hazard
1009         for (auto &Def : ProducerMI.all_defs()) {
1010           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1011             return true;
1012         }
1013       }
1014 
1015       return false;
1016     };
1017 
1018     int WaitStatesNeededForDef =
1019         Shift16DefWaitstates -
1020         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1021     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1022   }
1023 
1024   if (ST.hasVDecCoExecHazard()) {
1025     const int VALUWriteSGPRVALUReadWaitstates = 2;
1026     const int VALUWriteEXECRWLane = 4;
1027     const int VALUWriteVGPRReadlaneRead = 1;
1028 
1029     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1030     const MachineRegisterInfo &MRI = MF.getRegInfo();
1031     Register UseReg;
1032     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1033       if (!SIInstrInfo::isVALU(MI))
1034         return false;
1035       return MI.modifiesRegister(UseReg, TRI);
1036     };
1037 
1038     for (const MachineOperand &Use : VALU->explicit_uses()) {
1039       if (!Use.isReg())
1040         continue;
1041 
1042       UseReg = Use.getReg();
1043       if (TRI->isSGPRReg(MRI, UseReg)) {
1044         int WaitStatesNeededForDef =
1045             VALUWriteSGPRVALUReadWaitstates -
1046             getWaitStatesSince(IsVALUDefSGPRFn,
1047                                VALUWriteSGPRVALUReadWaitstates);
1048         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1049       }
1050     }
1051 
1052     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1053       UseReg = AMDGPU::VCC;
1054       int WaitStatesNeededForDef =
1055           VALUWriteSGPRVALUReadWaitstates -
1056           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1057       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1058     }
1059 
1060     switch (VALU->getOpcode()) {
1061     case AMDGPU::V_READLANE_B32:
1062     case AMDGPU::V_READFIRSTLANE_B32: {
1063       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1064       UseReg = Src->getReg();
1065       int WaitStatesNeededForDef =
1066           VALUWriteVGPRReadlaneRead -
1067           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1068       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1069     }
1070       [[fallthrough]];
1071     case AMDGPU::V_WRITELANE_B32: {
1072       UseReg = AMDGPU::EXEC;
1073       int WaitStatesNeededForDef =
1074           VALUWriteEXECRWLane -
1075           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1076       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1077       break;
1078     }
1079     default:
1080       break;
1081     }
1082   }
1083 
1084   // This checks for the hazard where VMEM instructions that store more than
1085   // 8 bytes can have there store data over written by the next instruction.
1086   if (!ST.has12DWordStoreHazard())
1087     return WaitStatesNeeded;
1088 
1089   const MachineRegisterInfo &MRI = MF.getRegInfo();
1090 
1091   for (const MachineOperand &Def : VALU->defs()) {
1092     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1093   }
1094 
1095   return WaitStatesNeeded;
1096 }
1097 
1098 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1099   // This checks for hazards associated with inline asm statements.
1100   // Since inline asms can contain just about anything, we use this
1101   // to call/leverage other check*Hazard routines. Note that
1102   // this function doesn't attempt to address all possible inline asm
1103   // hazards (good luck), but is a collection of what has been
1104   // problematic thus far.
1105 
1106   // see checkVALUHazards()
1107   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1108       !ST.hasCvtScaleForwardingHazard())
1109     return 0;
1110 
1111   const MachineRegisterInfo &MRI = MF.getRegInfo();
1112   int WaitStatesNeeded = 0;
1113 
1114   for (const MachineOperand &Op :
1115        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1116     if (Op.isReg() && Op.isDef()) {
1117       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1118         continue;
1119 
1120       if (ST.has12DWordStoreHazard()) {
1121         WaitStatesNeeded =
1122             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1123       }
1124     }
1125   }
1126 
1127   if (ST.hasDstSelForwardingHazard()) {
1128     const int Shift16DefWaitstates = 1;
1129 
1130     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1131       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1132       // Assume inline asm reads the dst
1133       if (Dst)
1134         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1135                IA->readsRegister(Dst->getReg(), &TRI);
1136 
1137       if (ProducerMI.isInlineAsm()) {
1138         // If MI is inline asm, assume it has dst forwarding hazard
1139         for (auto &Def : ProducerMI.all_defs()) {
1140           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1141               IA->readsRegister(Def.getReg(), &TRI)) {
1142             return true;
1143           }
1144         }
1145       }
1146 
1147       return false;
1148     };
1149 
1150     int WaitStatesNeededForDef =
1151         Shift16DefWaitstates -
1152         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1153     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1154   }
1155 
1156   return WaitStatesNeeded;
1157 }
1158 
1159 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1160   const SIInstrInfo *TII = ST.getInstrInfo();
1161   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1162   const MachineRegisterInfo &MRI = MF.getRegInfo();
1163 
1164   const MachineOperand *LaneSelectOp =
1165       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1166 
1167   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1168     return 0;
1169 
1170   Register LaneSelectReg = LaneSelectOp->getReg();
1171   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1172 
1173   const int RWLaneWaitStates = 4;
1174   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1175                                               RWLaneWaitStates);
1176   return RWLaneWaitStates - WaitStatesSince;
1177 }
1178 
1179 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1180   if (!ST.hasRFEHazards())
1181     return 0;
1182 
1183   const SIInstrInfo *TII = ST.getInstrInfo();
1184 
1185   const int RFEWaitStates = 1;
1186 
1187   auto IsHazardFn = [TII](const MachineInstr &MI) {
1188     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1189   };
1190   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1191   return RFEWaitStates - WaitStatesNeeded;
1192 }
1193 
1194 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1195   const SIInstrInfo *TII = ST.getInstrInfo();
1196   const int ReadM0WaitStates = 1;
1197   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1198   return ReadM0WaitStates -
1199          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1200 }
1201 
1202 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1203   fixVMEMtoScalarWriteHazards(MI);
1204   fixVcmpxPermlaneHazards(MI);
1205   fixSMEMtoVectorWriteHazards(MI);
1206   fixVcmpxExecWARHazard(MI);
1207   fixLdsBranchVmemWARHazard(MI);
1208   if (ST.hasLdsDirect()) {
1209     fixLdsDirectVALUHazard(MI);
1210     fixLdsDirectVMEMHazard(MI);
1211   }
1212   fixVALUPartialForwardingHazard(MI);
1213   fixVALUTransUseHazard(MI);
1214   fixWMMAHazards(MI);
1215   fixShift64HighRegBug(MI);
1216   fixVALUMaskWriteHazard(MI);
1217   fixVALUReadSGPRHazard(MI);
1218   fixRequiredExportPriority(MI);
1219 }
1220 
1221 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1222                               const MachineInstr &MI) {
1223   return (TII.isVOPC(MI) ||
1224           (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1225          MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1226 }
1227 
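// A V_CMPX that writes EXEC followed by a V_PERMLANE* needs an intervening
// VALU; V_NOP does not qualify (the SQ discards it), so a v_mov_b32 of the
// permlane's src0 VGPR onto itself is inserted instead.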
1228 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1229   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1230     return false;
1231 
1232   const SIInstrInfo *TII = ST.getInstrInfo();
1233   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1234   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1235     return isVCmpXWritesExec(*TII, *TRI, MI);
1236   };
1237 
1238   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1239     unsigned Opc = MI.getOpcode();
1240     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1241            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1242   };
1243 
1244   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1245       std::numeric_limits<int>::max())
1246     return false;
1247 
1248   // V_NOP will be discarded by SQ.
1249   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1250   // which is always a VGPR and available.
1251   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1252   Register Reg = Src0->getReg();
1253   bool IsUndef = Src0->isUndef();
1254   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1255           TII->get(AMDGPU::V_MOV_B32_e32))
1256     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1257     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1258 
1259   return true;
1260 }
1261 
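// WAR hazard: a VMEM/DS/FLAT instruction may still be reading a register that
// this SALU/SMEM instruction is about to overwrite; insert
// s_waitcnt_depctr vm_vsrc(0) unless the hazard has already expired.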
1262 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1263   if (!ST.hasVMEMtoScalarWriteHazard())
1264     return false;
1265   assert(!ST.hasExtendedWaitCounts());
1266 
1267   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1268     return false;
1269 
1270   if (MI->getNumDefs() == 0)
1271     return false;
1272 
1273   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1274 
1275   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1276     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1277         !SIInstrInfo::isFLAT(I))
1278       return false;
1279 
1280     for (const MachineOperand &Def : MI->defs()) {
1281       const MachineOperand *Op =
1282           I.findRegisterUseOperand(Def.getReg(), TRI, false);
1283       if (!Op)
1284         continue;
1285       return true;
1286     }
1287     return false;
1288   };
1289 
1290   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1291     return SIInstrInfo::isVALU(MI) ||
1292            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1293             !MI.getOperand(0).getImm()) ||
1294            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1295             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1296   };
1297 
1298   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1299       std::numeric_limits<int>::max())
1300     return false;
1301 
1302   const SIInstrInfo *TII = ST.getInstrInfo();
1303   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1304           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1305       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1306   return true;
1307 }
1308 
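// WAR hazard: an earlier SMEM load may still be reading the SGPR this VALU
// writes; the hazard is resolved by lgkmcnt(0) or an intervening non-SOPP
// SALU, so insert an s_mov_b32 to SGPR_NULL to break it.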
1309 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1310   if (!ST.hasSMEMtoVectorWriteHazard())
1311     return false;
1312   assert(!ST.hasExtendedWaitCounts());
1313 
1314   if (!SIInstrInfo::isVALU(*MI))
1315     return false;
1316 
1317   unsigned SDSTName;
1318   switch (MI->getOpcode()) {
1319   case AMDGPU::V_READLANE_B32:
1320   case AMDGPU::V_READFIRSTLANE_B32:
1321     SDSTName = AMDGPU::OpName::vdst;
1322     break;
1323   default:
1324     SDSTName = AMDGPU::OpName::sdst;
1325     break;
1326   }
1327 
1328   const SIInstrInfo *TII = ST.getInstrInfo();
1329   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1330   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1331   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1332   if (!SDST) {
1333     for (const auto &MO : MI->implicit_operands()) {
1334       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1335         SDST = &MO;
1336         break;
1337       }
1338     }
1339   }
1340 
1341   if (!SDST)
1342     return false;
1343 
1344   const Register SDSTReg = SDST->getReg();
1345   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1346     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1347   };
1348 
1349   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1350     if (TII->isSALU(MI)) {
1351       switch (MI.getOpcode()) {
1352       case AMDGPU::S_SETVSKIP:
1353       case AMDGPU::S_VERSION:
1354       case AMDGPU::S_WAITCNT_VSCNT:
1355       case AMDGPU::S_WAITCNT_VMCNT:
1356       case AMDGPU::S_WAITCNT_EXPCNT:
1357         // These instructions cannot mitigate the hazard.
1358         return false;
1359       case AMDGPU::S_WAITCNT_LGKMCNT:
1360         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1361         return (MI.getOperand(1).getImm() == 0) &&
1362                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1363       case AMDGPU::S_WAITCNT: {
1364         const int64_t Imm = MI.getOperand(0).getImm();
1365         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1366         // DsCnt corresponds to LGKMCnt here.
1367         return (Decoded.DsCnt == 0);
1368       }
1369       default:
1370         // SOPP instructions cannot mitigate the hazard.
1371         if (TII->isSOPP(MI))
1372           return false;
1373         // At this point the SALU can be assumed to mitigate the hazard
1374         // because either:
1375         // (a) it is independent of the at risk SMEM (breaking chain),
1376         // or
1377         // (b) it is dependent on the SMEM, in which case an appropriate
1378         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1379         //     SMEM instruction.
1380         return true;
1381       }
1382     }
1383     return false;
1384   };
1385 
1386   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1387       std::numeric_limits<int>::max())
1388     return false;
1389 
1390   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1391           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1392       .addImm(0);
1393   return true;
1394 }
1395 
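// WAR hazard: a non-VALU read of EXEC followed by a VALU write of EXEC. It is
// resolved by an intervening VALU that writes an SGPR, or by
// s_waitcnt_depctr sa_sdst(0), which is what gets inserted here.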
1396 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1397   if (!ST.hasVcmpxExecWARHazard())
1398     return false;
1399   assert(!ST.hasExtendedWaitCounts());
1400 
1401   if (!SIInstrInfo::isVALU(*MI))
1402     return false;
1403 
1404   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1405   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1406     return false;
1407 
1408   auto IsHazardFn = [TRI](const MachineInstr &I) {
1409     if (SIInstrInfo::isVALU(I))
1410       return false;
1411     return I.readsRegister(AMDGPU::EXEC, TRI);
1412   };
1413 
1414   const SIInstrInfo *TII = ST.getInstrInfo();
1415   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1416     if (SIInstrInfo::isVALU(MI)) {
1417       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1418         return true;
1419       for (auto MO : MI.implicit_operands())
1420         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1421           return true;
1422     }
1423     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1424         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1425       return true;
1426     return false;
1427   };
1428 
1429   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1430       std::numeric_limits<int>::max())
1431     return false;
1432 
1433   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1434           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1435       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1436   return true;
1437 }
1438 
1439 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1440                                                  const GCNSubtarget &ST) {
1441   if (!ST.hasLdsBranchVmemWARHazard())
1442     return false;
1443 
1444   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1445   // instructions need to appear in the same function.
1446   bool HasLds = false;
1447   bool HasVmem = false;
1448   for (auto &MBB : MF) {
1449     for (auto &MI : MBB) {
1450       HasLds |= SIInstrInfo::isDS(MI);
1451       HasVmem |=
1452           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1453       if (HasLds && HasVmem)
1454         return true;
1455     }
1456   }
1457   return false;
1458 }
1459 
1460 static bool isStoreCountWaitZero(const MachineInstr &I) {
1461   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1462          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1463          !I.getOperand(1).getImm();
1464 }
1465 
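// Hazard shape: an LDS access and a VMEM access of the opposite kind on either
// side of a branch, with no s_waitcnt_vscnt null, 0 in between; insert that
// wait in front of MI.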
1466 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1467   if (!RunLdsBranchVmemWARHazardFixup)
1468     return false;
1469 
1470   assert(ST.hasLdsBranchVmemWARHazard());
1471   assert(!ST.hasExtendedWaitCounts());
1472 
1473   auto IsHazardInst = [](const MachineInstr &MI) {
1474     if (SIInstrInfo::isDS(MI))
1475       return 1;
1476     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1477       return 2;
1478     return 0;
1479   };
1480 
1481   auto InstType = IsHazardInst(*MI);
1482   if (!InstType)
1483     return false;
1484 
1485   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1486     return IsHazardInst(I) || isStoreCountWaitZero(I);
1487   };
1488 
1489   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1490     if (!I.isBranch())
1491       return false;
1492 
1493     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1494       auto InstType2 = IsHazardInst(I);
1495       return InstType2 && InstType != InstType2;
1496     };
1497 
1498     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1499       auto InstType2 = IsHazardInst(I);
1500       if (InstType == InstType2)
1501         return true;
1502 
1503       return isStoreCountWaitZero(I);
1504     };
1505 
1506     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1507            std::numeric_limits<int>::max();
1508   };
1509 
1510   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1511       std::numeric_limits<int>::max())
1512     return false;
1513 
1514   const SIInstrInfo *TII = ST.getInstrInfo();
1515   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1516           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1517     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1518     .addImm(0);
1519 
1520   return true;
1521 }
1522 
1523 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1524   if (!SIInstrInfo::isLDSDIR(*MI))
1525     return false;
1526 
1527   const int NoHazardWaitStates = 15;
1528   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1529   const Register VDSTReg = VDST->getReg();
1530 
1531   bool VisitedTrans = false;
1532   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1533     if (!SIInstrInfo::isVALU(I))
1534       return false;
1535     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1536     // Cover both WAR and WAW
1537     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1538   };
1539   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1540     if (WaitStates >= NoHazardWaitStates)
1541       return true;
1542     // Instructions which cause va_vdst==0 expire the hazard
1543     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1544            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1545   };
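       // Only VALU instructions contribute to the wait-state count used to set
       // waitvdst below.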
1546   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1547     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1548   };
1549 
1550   DenseSet<const MachineBasicBlock *> Visited;
1551   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1552                                     std::next(MI->getReverseIterator()), 0,
1553                                     IsExpiredFn, Visited, GetWaitStatesFn);
1554 
1555   // Transcendentals can execute in parallel to other VALUs.
1556   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1557   if (VisitedTrans)
1558     Count = 0;
1559 
1560   MachineOperand *WaitVdstOp =
1561       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1562   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1563 
1564   return true;
1565 }
1566 
1567 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1568   if (!SIInstrInfo::isLDSDIR(*MI))
1569     return false;
1570 
1571   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1572   const Register VDSTReg = VDST->getReg();
1573 
1574   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1575     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1576         !SIInstrInfo::isDS(I))
1577       return false;
1578     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1579   };
1580   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1581   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1582   // according to the type of VMEM instruction.
1583   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1584     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1585            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1586            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1587             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1588            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1589             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1590   };
1591 
1592   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1593       std::numeric_limits<int>::max())
1594     return false;
1595 
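       // Mitigate the hazard either by encoding the wait in the ldsdir itself
       // (waitvsrc = 0) when the subtarget supports it, or by inserting an
       // explicit s_waitcnt_depctr vm_vsrc(0) before MI.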
1596   if (LdsdirCanWait) {
1597     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1598   } else {
1599     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1600             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1601         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1602   }
1603 
1604   return true;
1605 }
1606 
1607 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1608   if (!ST.hasVALUPartialForwardingHazard())
1609     return false;
1610   assert(!ST.hasExtendedWaitCounts());
1611 
1612   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1613     return false;
1614 
1615   SmallSetVector<Register, 4> SrcVGPRs;
1616 
1617   for (const MachineOperand &Use : MI->explicit_uses()) {
1618     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1619       SrcVGPRs.insert(Use.getReg());
1620   }
1621 
1622   // Only applies with >= 2 unique VGPR sources
1623   if (SrcVGPRs.size() <= 1)
1624     return false;
1625 
1626   // Look for the following pattern:
1627   //   Va <- VALU [PreExecPos]
1628   //   intv1
1629   //   Exec <- SALU [ExecPos]
1630   //   intv2
1631   //   Vb <- VALU [PostExecPos]
1632   //   intv3
1633   //   MI Va, Vb (WaitState = 0)
1634   //
1635   // Where:
1636   // intv1 + intv2 <= 2 VALUs
1637   // intv3 <= 4 VALUs
1638   //
1639   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1640 
1641   const int Intv1plus2MaxVALUs = 2;
1642   const int Intv3MaxVALUs = 4;
1643   const int IntvMaxVALUs = 6;
1644   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1645 
1646   struct StateType {
1647     SmallDenseMap<Register, int, 4> DefPos;
1648     int ExecPos = std::numeric_limits<int>::max();
1649     int VALUs = 0;
1650   };
1651 
1652   StateType State;
1653 
1654   // This lambda folds the expiry test into the hazard detection itself.
1655   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1656     // Too many VALU states have passed
1657     if (State.VALUs > NoHazardVALUWaitStates)
1658       return HazardExpired;
1659 
1660     // Instructions which cause va_vdst==0 expire the hazard
1661     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1662         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1663         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1664          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1665       return HazardExpired;
1666 
1667     // Track register writes
1668     bool Changed = false;
1669     if (SIInstrInfo::isVALU(I)) {
1670       for (Register Src : SrcVGPRs) {
1671         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1672           State.DefPos[Src] = State.VALUs;
1673           Changed = true;
1674         }
1675       }
1676     } else if (SIInstrInfo::isSALU(I)) {
1677       if (State.ExecPos == std::numeric_limits<int>::max()) {
1678         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1679           State.ExecPos = State.VALUs;
1680           Changed = true;
1681         }
1682       }
1683     }
1684 
1685     // Early expiration: too many VALUs in intv3
1686     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1687       return HazardExpired;
1688 
1689     // Only evaluate state if something changed
1690     if (!Changed)
1691       return NoHazardFound;
1692 
1693     // Determine positions of VALUs pre/post exec change
1694     if (State.ExecPos == std::numeric_limits<int>::max())
1695       return NoHazardFound;
1696 
1697     int PreExecPos = std::numeric_limits<int>::max();
1698     int PostExecPos = std::numeric_limits<int>::max();
1699 
1700     for (auto Entry : State.DefPos) {
1701       int DefVALUs = Entry.second;
1702       if (DefVALUs != std::numeric_limits<int>::max()) {
1703         if (DefVALUs >= State.ExecPos)
1704           PreExecPos = std::min(PreExecPos, DefVALUs);
1705         else
1706           PostExecPos = std::min(PostExecPos, DefVALUs);
1707       }
1708     }
1709 
1710     // Need a VALU def after the exec change
1711     if (PostExecPos == std::numeric_limits<int>::max())
1712       return NoHazardFound;
1713 
1714     // Too many VALUs in intv3?
1715     int Intv3VALUs = PostExecPos;
1716     if (Intv3VALUs > Intv3MaxVALUs)
1717       return HazardExpired;
1718 
1719     // Too many VALUs in intv2?
1720     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1721     if (Intv2VALUs > Intv1plus2MaxVALUs)
1722       return HazardExpired;
1723 
1724     // Need a VALU def before the exec change
1725     if (PreExecPos == std::numeric_limits<int>::max())
1726       return NoHazardFound;
1727 
1728     // Too many VALUs in intv1?
1729     int Intv1VALUs = PreExecPos - State.ExecPos;
1730     if (Intv1VALUs > Intv1plus2MaxVALUs)
1731       return HazardExpired;
1732 
1733     // Too many VALUs in intv1 + intv2
1734     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1735       return HazardExpired;
1736 
1737     return HazardFound;
1738   };
1739   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1740     if (SIInstrInfo::isVALU(MI))
1741       State.VALUs += 1;
1742   };
1743 
1744   DenseSet<const MachineBasicBlock *> Visited;
1745   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1746                             std::next(MI->getReverseIterator()), Visited))
1747     return false;
1748 
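       // Hazard found: insert an s_waitcnt_depctr before MI. The 0x0fff immediate
       // is assumed to force va_vdst to 0 while leaving the remaining DEPCTR
       // fields at their no-wait encodings, matching the expiry check above which
       // treats an explicit va_vdst==0 wait as resolving this hazard.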
1749   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1750           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1751       .addImm(0x0fff);
1752 
1753   return true;
1754 }
1755 
1756 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1757   if (!ST.hasVALUTransUseHazard())
1758     return false;
1759   assert(!ST.hasExtendedWaitCounts());
1760 
1761   if (!SIInstrInfo::isVALU(*MI))
1762     return false;
1763 
1764   SmallSet<Register, 4> SrcVGPRs;
1765 
1766   for (const MachineOperand &Use : MI->explicit_uses()) {
1767     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1768       SrcVGPRs.insert(Use.getReg());
1769   }
1770 
1771   // Look for the following pattern:
1772   //   Va <- TRANS VALU
1773   //   intv
1774   //   MI Va (WaitState = 0)
1775   //
1776   // Where:
1777   // intv <= 5 VALUs / 1 TRANS
1778   //
1779   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1780 
1781   const int IntvMaxVALUs = 5;
1782   const int IntvMaxTRANS = 1;
1783 
1784   struct StateType {
1785     int VALUs = 0;
1786     int TRANS = 0;
1787   };
1788 
1789   StateType State;
1790 
1791   // This lambda folds the expiry test into the hazard detection itself.
1792   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1793     // Too many VALU states have passed
1794     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1795       return HazardExpired;
1796 
1797     // Instructions which cause va_vdst==0 expire the hazard
1798     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1799         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1800         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1801          I.getOperand(0).getImm() == 0x0fff))
1802       return HazardExpired;
1803 
1804     // Check for a TRANS write to one of MI's source VGPRs.
1805     if (SIInstrInfo::isTRANS(I)) {
1806       for (Register Src : SrcVGPRs) {
1807         if (I.modifiesRegister(Src, &TRI)) {
1808           return HazardFound;
1809         }
1810       }
1811     }
1812 
1813     return NoHazardFound;
1814   };
1815   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1816     if (SIInstrInfo::isVALU(MI))
1817       State.VALUs += 1;
1818     if (SIInstrInfo::isTRANS(MI))
1819       State.TRANS += 1;
1820   };
1821 
1822   DenseSet<const MachineBasicBlock *> Visited;
1823   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1824                             std::next(MI->getReverseIterator()), Visited))
1825     return false;
1826 
1827   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1828   // hazard is avoided.
1829   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1830           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1831       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1832 
1833   return true;
1834 }
1835 
1836 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1837   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1838     return false;
1839 
1840   const SIInstrInfo *TII = ST.getInstrInfo();
1841   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1842 
1843   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1844     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1845       return false;
1846 
1847     // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1848     // with the dest(matrix D) of the previous wmma.
1849     const Register CurSrc0Reg =
1850         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1851     const Register CurSrc1Reg =
1852         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1853 
1854     const Register PrevDstReg =
1855         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1856 
1857     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1858         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1859       return true;
1860     }
1861 
1862     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1863     // but Index can't overlap with PrevDstReg.
1864     if (AMDGPU::isGFX12Plus(ST)) {
1865       if (SIInstrInfo::isSWMMAC(*MI)) {
1866         const Register CurIndex =
1867             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1868         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1869           return true;
1870       }
1871       return false;
1872     }
1873 
1874     return false;
1875   };
1876 
1877   auto IsExpiredFn = [](const MachineInstr &I, int) {
1878     return SIInstrInfo::isVALU(I);
1879   };
1880 
1881   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1882       std::numeric_limits<int>::max())
1883     return false;
1884 
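       // Hazard found with no intervening VALU: separate the two WMMA/SWMMAC
       // instructions with a V_NOP.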
1885   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1886 
1887   return true;
1888 }
1889 
1890 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1891   if (!ST.hasShift64HighRegBug())
1892     return false;
1893   assert(!ST.hasExtendedWaitCounts());
1894 
1895   switch (MI->getOpcode()) {
1896   default:
1897     return false;
1898   case AMDGPU::V_LSHLREV_B64_e64:
1899   case AMDGPU::V_LSHRREV_B64_e64:
1900   case AMDGPU::V_ASHRREV_I64_e64:
1901     break;
1902   }
1903 
1904   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1905   if (!Amt->isReg())
1906     return false;
1907 
1908   Register AmtReg = Amt->getReg();
1909   const MachineRegisterInfo &MRI = MF.getRegInfo();
1910   // Check if this is the last VGPR in the allocation block.
1911   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1912     return false;
1913 
1914   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1915     return false;
1916 
1917   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1918   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1919   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1920   bool Overlapped = OverlappedSrc || OverlappedDst;
1921 
1922   assert(!OverlappedDst || !OverlappedSrc ||
1923          Src1->getReg() == MI->getOperand(0).getReg());
1924   assert(ST.needsAlignedVGPRs());
1925   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1926 
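       // Pick a scratch register that MI neither reads nor writes: an aligned
       // VGPR pair when the shift amount overlaps src1 or the destination,
       // otherwise a single VGPR. Values are exchanged with V_SWAP_B32 around MI
       // so nothing live is clobbered.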
1927   Register NewReg;
1928   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1929                                    : AMDGPU::VGPR_32RegClass) {
1930     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1931       NewReg = Reg;
1932       break;
1933     }
1934   }
1935 
1936   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1937                                : NewReg;
1938   Register NewAmtLo;
1939 
1940   if (Overlapped)
1941     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1942 
1943   DebugLoc DL = MI->getDebugLoc();
1944   MachineBasicBlock *MBB = MI->getParent();
1945   // Insert a full wait count because the found register might have a pending wait.
1946   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1947       .addImm(0);
1948 
1949   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1950   if (Overlapped)
1951     runOnInstruction(
1952         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1953             .addDef(AmtReg - 1)
1954             .addReg(AmtReg - 1, RegState::Undef)
1955             .addReg(NewAmtLo, RegState::Undef));
1956   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1957                        .addDef(AmtReg)
1958                        .addReg(AmtReg, RegState::Undef)
1959                        .addReg(NewAmt, RegState::Undef));
1960 
1961   // Instructions emitted after the current instruction will be processed by the
1962   // parent loop of the hazard recognizer in a natural way.
1963   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1964           AmtReg)
1965       .addDef(NewAmt)
1966       .addReg(NewAmt)
1967       .addReg(AmtReg);
1968   if (Overlapped)
1969     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1970             AmtReg - 1)
1971         .addDef(NewAmtLo)
1972         .addReg(NewAmtLo)
1973         .addReg(AmtReg - 1);
1974 
1975   // Re-running the hazard recognizer on the modified instruction is not
1976   // necessary: the inserted V_SWAP_B32s have both read and written the new
1977   // registers, so hazards involving these registers have already been handled.
1978   Amt->setReg(NewAmt);
1979   Amt->setIsKill(false);
1980   // We do not update liveness, so verifier may see it as undef.
1981   Amt->setIsUndef();
1982   if (OverlappedDst)
1983     MI->getOperand(0).setReg(NewReg);
1984   if (OverlappedSrc) {
1985     Src1->setReg(NewReg);
1986     Src1->setIsKill(false);
1987     Src1->setIsUndef();
1988   }
1989 
1990   return true;
1991 }
1992 
1993 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1994   int NSAtoVMEMWaitStates = 1;
1995 
1996   if (!ST.hasNSAtoVMEMBug())
1997     return 0;
1998 
1999   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
2000     return 0;
2001 
2002   const SIInstrInfo *TII = ST.getInstrInfo();
2003   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2004   if (!Offset || (Offset->getImm() & 6) == 0)
2005     return 0;
2006 
2007   auto IsHazardFn = [TII](const MachineInstr &I) {
2008     if (!SIInstrInfo::isMIMG(I))
2009       return false;
2010     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2011     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2012            TII->getInstSizeInBytes(I) >= 16;
2013   };
2014 
2015   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2016 }
2017 
2018 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2019   int FPAtomicToDenormModeWaitStates = 3;
2020 
2021   if (!ST.hasFPAtomicToDenormModeHazard())
2022     return 0;
2023   assert(!ST.hasExtendedWaitCounts());
2024 
2025   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2026     return 0;
2027 
2028   auto IsHazardFn = [](const MachineInstr &I) {
2029     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2030       return false;
2031     return SIInstrInfo::isFPAtomic(I);
2032   };
2033 
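       // The hazard expires after 3 wait states, after any VALU, or after any of
       // the wait-count instructions listed below.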
2034   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2035     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2036       return true;
2037 
2038     switch (MI.getOpcode()) {
2039     case AMDGPU::S_WAITCNT:
2040     case AMDGPU::S_WAITCNT_VSCNT:
2041     case AMDGPU::S_WAITCNT_VMCNT:
2042     case AMDGPU::S_WAITCNT_EXPCNT:
2043     case AMDGPU::S_WAITCNT_LGKMCNT:
2044     case AMDGPU::S_WAIT_IDLE:
2045       return true;
2046     default:
2047       break;
2048     }
2049 
2050     return false;
2051   };
2052 
2053   return FPAtomicToDenormModeWaitStates -
2054          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2055 }
2056 
2057 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2058   assert(SIInstrInfo::isMAI(*MI));
2059 
2060   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2061 }
2062 
2063 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2064   // Early exit if no padding is requested.
2065   if (MFMAPaddingRatio == 0)
2066     return 0;
2067 
2068   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2069   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2070     return 0;
2071 
2072   int NeighborMFMALatency = 0;
2073   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2074                             this](const MachineInstr &MI) {
2075     if (!SIInstrInfo::isMFMA(MI))
2076       return false;
2077 
2078     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2079     return true;
2080   };
2081 
2082   const int MaxMFMAPipelineWaitStates = 16;
2083   int WaitStatesSinceNeighborMFMA =
2084       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2085 
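       // Request enough padding so that MFMAPaddingRatio percent of the
       // neighboring MFMA's pipeline latency is covered by wait states.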
2086   int NeighborMFMAPaddingNeeded =
2087       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2088       WaitStatesSinceNeighborMFMA;
2089 
2090   return std::max(0, NeighborMFMAPaddingNeeded);
2091 }
2092 
2093 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2094   int WaitStatesNeeded = 0;
2095   unsigned Opc = MI->getOpcode();
2096 
2097   auto IsVALUFn = [](const MachineInstr &MI) {
2098     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2099   };
2100 
2101   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2102     const int LegacyVALUWritesVGPRWaitStates = 2;
2103     const int VALUWritesExecWaitStates = 4;
2104     const int MaxWaitStates = 4;
2105 
2106     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2107       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2108     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2109 
2110     if (WaitStatesNeeded < MaxWaitStates) {
2111       for (const MachineOperand &Use : MI->explicit_uses()) {
2112         const int MaxWaitStates = 2;
2113 
2114         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2115           continue;
2116 
2117         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2118           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2119         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2120 
2121         if (WaitStatesNeeded == MaxWaitStates)
2122           break;
2123       }
2124     }
2125   }
2126 
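       // For each AGPR operand (uses, plus the def when MI is v_accvgpr_write),
       // check the distance to an earlier overlapping MFMA write and to an
       // earlier v_accvgpr_write of the same register.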
2127   for (const MachineOperand &Op : MI->explicit_operands()) {
2128     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2129       continue;
2130 
2131     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2132       continue;
2133 
2134     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2135     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2136     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2137     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2138     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2139     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2140     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2141     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2142     const int MaxWaitStates = 18;
2143     Register Reg = Op.getReg();
2144     unsigned HazardDefLatency = 0;
2145 
2146     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2147                                this](const MachineInstr &MI) {
2148       if (!SIInstrInfo::isMFMA(MI))
2149         return false;
2150       Register DstReg = MI.getOperand(0).getReg();
2151       if (DstReg == Reg)
2152         return false;
2153       HazardDefLatency =
2154           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2155       return TRI.regsOverlap(DstReg, Reg);
2156     };
2157 
2158     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2159                                                    MaxWaitStates);
2160     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2161     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2162     int OpNo = Op.getOperandNo();
2163     if (OpNo == SrcCIdx) {
2164       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2165     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2166       switch (HazardDefLatency) {
2167       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2168                break;
2169       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2170                break;
2171       case 16: [[fallthrough]];
2172       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2173                break;
2174       }
2175     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2176       switch (HazardDefLatency) {
2177       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2178                break;
2179       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2180                break;
2181       case 16: [[fallthrough]];
2182       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2183                break;
2184       }
2185     }
2186 
2187     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2188     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2189 
2190     if (WaitStatesNeeded == MaxWaitStates)
2191       return WaitStatesNeeded; // Early exit.
2192 
2193     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2194       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2195         return false;
2196       Register DstReg = MI.getOperand(0).getReg();
2197       return TRI.regsOverlap(Reg, DstReg);
2198     };
2199 
2200     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2201     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2202     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2203     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2204     if (OpNo == SrcCIdx)
2205       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2206     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2207       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2208 
2209     WaitStatesNeededForUse = NeedWaitStates -
2210       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2211     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2212 
2213     if (WaitStatesNeeded == MaxWaitStates)
2214       return WaitStatesNeeded; // Early exit.
2215   }
2216 
2217   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2218     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2219     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2220     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2221     const int MaxWaitStates = 13;
2222     Register DstReg = MI->getOperand(0).getReg();
2223     unsigned HazardDefLatency = 0;
2224 
2225     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2226                          this](const MachineInstr &MI) {
2227       if (!SIInstrInfo::isMFMA(MI))
2228         return false;
2229       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2230       HazardDefLatency =
2231           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2232       return TRI.regsOverlap(Reg, DstReg);
2233     };
2234 
2235     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2236     int NeedWaitStates;
2237     switch (HazardDefLatency) {
2238     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2239              break;
2240     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2241              break;
2242     case 16: [[fallthrough]];
2243     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2244              break;
2245     }
2246 
2247     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2248     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2249   }
2250 
2251   // Pad neighboring MFMA with noops for better inter-wave performance.
2252   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2253 
2254   return WaitStatesNeeded;
2255 }
2256 
2257 static int
2258 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2259                                                               bool IsGFX950) {
2260   // xdl def cycles | gfx940 | gfx950
2261   // 2 pass         |  3        4
2262   // 4 pass         |  5        6
2263   // 8 pass         |  9        10
2264   // 16 pass        |  17       18
2265   return NumPasses + 1 + IsGFX950;
2266 }
2267 
2268 static int
2269 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2270                                                               bool IsGFX950) {
2271   // xdl def cycles | gfx940 | gfx950
2272   // 2 pass         |  3        3
2273   // 4 pass         |  5        6
2274   // 8 pass         |  9        10
2275   // 16 pass        |  17       18
2276   return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2277 }
2278 
2279 static int
2280 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2281   // 2 pass -> 2
2282   // 4 pass -> 4
2283   // 8 pass -> 8
2284   // 16 pass -> 16
2285   return NumPasses;
2286 }
2287 
2288 static int
2289 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2290   // 2 pass -> 4
2291   // 4 pass -> 6
2292   // 8 pass -> 10
2293   // 16 pass -> 18
2294   return NumPasses + 2;
2295 }
2296 
2297 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2298   // 2 pass -> 5
2299   // 4 pass -> 7
2300   // 8 pass -> 11
2301   // 16 pass -> 19
2302   return NumPasses + 3;
2303 }
2304 
2305 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2306   int WaitStatesNeeded = 0;
2307   unsigned Opc = MI->getOpcode();
2308 
2309   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2310     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2311   };
2312 
2313   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2314     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2315            !SIInstrInfo::isDOT(MI);
2316   };
2317 
2318   if (!SIInstrInfo::isMFMA(*MI))
2319     return WaitStatesNeeded;
2320 
2321   const int VALUWritesExecWaitStates = 4;
2322   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2323     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2324                           VALUWritesExecWaitStates);
2325   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2326 
2327   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2328 
2329   // Loop for both DGEMM and S/HGEMM 2nd instruction.
2330   for (const MachineOperand &Use : MI->explicit_uses()) {
2331     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2332     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2333     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2334     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2335     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2336     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2337     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2338     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2339     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2340     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2341     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2342     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2343     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2344     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2345     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2346     const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2347     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2348     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2349     const int MaxWaitStates = 19;
2350 
2351     if (!Use.isReg())
2352       continue;
2353     Register Reg = Use.getReg();
2354     bool FullReg;
2355     const MachineInstr *MI1;
2356 
2357     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2358                                this](const MachineInstr &MI) {
2359       if (!SIInstrInfo::isMFMA(MI))
2360         return false;
2361       Register DstReg = MI.getOperand(0).getReg();
2362       FullReg = (DstReg == Reg);
2363       MI1 = &MI;
2364       return TRI.regsOverlap(DstReg, Reg);
2365     };
2366 
2367     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2368       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2369     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2370 
2371     int NumWaitStates =
2372         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2373     if (NumWaitStates == std::numeric_limits<int>::max())
2374       continue;
2375 
2376     int OpNo = Use.getOperandNo();
2377     unsigned Opc1 = MI1->getOpcode();
2378     int NeedWaitStates = 0;
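         // The required distance depends on whether the use is SrcC (the
         // accumulator) or SrcA/SrcB, and on the pass count and XDL/DGEMM
         // classification of the producing MFMA.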
2379     if (OpNo == SrcCIdx) {
2380       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2381         NeedWaitStates = 0;
2382       } else if (FullReg) {
2383         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2384              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2385             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2386              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2387           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2388         else if (ST.hasGFX940Insts() &&
2389                  TSchedModel.computeInstrLatency(MI1) == 2)
2390           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2391       } else {
2392         switch (Opc1) {
2393         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2394         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2395         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2396         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2397           if (!isXDL(ST, *MI))
2398             NeedWaitStates =
2399                 ST.hasGFX950Insts()
2400                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2401                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2402           break;
2403         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2404         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2405           if (!isXDL(ST, *MI))
2406             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2407           break;
2408         default:
2409           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2410           if (ST.hasGFX940Insts()) {
2411             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2412               break;
2413 
2414             NeedWaitStates =
2415                 isXDL(ST, *MI1)
2416                     ? (isXDL(ST, *MI)
2417                            ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2418                                  NumPasses, ST.hasGFX950Insts())
2419                            : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2420                                  NumPasses, ST.hasGFX950Insts()))
2421                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2422                           NumPasses);
2423             break;
2424           }
2425 
2426           switch (NumPasses) {
2427           case 2:
2428             NeedWaitStates =
2429                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2430                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2431             break;
2432           case 8:
2433             NeedWaitStates =
2434                 isDGEMM(Opc)
2435                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2436                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2437             break;
2438           case 16:
2439             NeedWaitStates =
2440                 isDGEMM(Opc)
2441                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2442                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2443             break;
2444           default:
2445             llvm_unreachable("unexpected number of passes");
2446           }
2447         }
2448       }
2449     } else {
2450       switch (Opc1) {
2451       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2452       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2453       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2454       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2455         NeedWaitStates =
2456             ST.hasGFX950Insts()
2457                 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2458                 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2459         break;
2460       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2461       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2462         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2463         break;
2464       default:
2465         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2466 
2467         if (ST.hasGFX940Insts()) {
2468           NeedWaitStates =
2469               isXDL(ST, *MI1)
2470                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2471                         NumPasses)
2472                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2473                         NumPasses);
2474           break;
2475         }
2476 
2477         switch (NumPasses) {
2478         case 2:
2479           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2480           break;
2481         case 4:
2482           llvm_unreachable("unexpected number of passes for mfma");
2483         case 8:
2484           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2485           break;
2486         case 16:
2487         default:
2488           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2489         }
2490       }
2491     }
2492     if (WaitStatesNeeded >= NeedWaitStates)
2493       continue;
2494 
2495     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2496     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2497 
2498     if (WaitStatesNeeded == MaxWaitStates)
2499       break;
2500   }
2501 
2502   // Pad neighboring MFMA with noops for better inter-wave performance.
2503   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2504 
2505   return WaitStatesNeeded;
2506 }
2507 
2508 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2509   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2510   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2511     return 0;
2512 
2513   int WaitStatesNeeded = 0;
2514 
2515   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2516     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2517   };
2518 
2519   for (const MachineOperand &Op : MI->explicit_uses()) {
2520     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2521       continue;
2522 
2523     Register Reg = Op.getReg();
2524 
2525     const int AccVgprReadLdStWaitStates = 2;
2526     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2527     const int MaxWaitStates = 2;
2528 
2529     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2530       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2531     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2532 
2533     if (WaitStatesNeeded == MaxWaitStates)
2534       return WaitStatesNeeded; // Early exit.
2535 
2536     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2537       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2538           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2539         return false;
2540       auto IsVALUFn = [](const MachineInstr &MI) {
2541         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2542       };
2543       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2544              std::numeric_limits<int>::max();
2545     };
2546 
2547     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2548       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2549     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2550   }
2551 
2552   return WaitStatesNeeded;
2553 }
2554 
2555 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2556   assert(!ST.hasVcmpxPermlaneHazard() &&
2557          "this is a different vcmpx+permlane hazard");
2558   const SIRegisterInfo *TRI = ST.getRegisterInfo();
2559   const SIInstrInfo *TII = ST.getInstrInfo();
2560 
2561   auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2562     return isVCmpXWritesExec(*TII, *TRI, MI);
2563   };
2564 
2565   auto IsVALUFn = [](const MachineInstr &MI) {
2566     return SIInstrInfo::isVALU(MI);
2567   };
2568 
2569   const int VCmpXWritesExecWaitStates = 4;
2570   const int VALUWritesVDstWaitStates = 2;
2571   int WaitStatesNeeded = 0;
2572 
2573   for (const MachineOperand &Op : MI->explicit_uses()) {
2574     if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2575       continue;
2576     Register Reg = Op.getReg();
2577 
2578     int WaitStatesSinceDef =
2579         VALUWritesVDstWaitStates -
2580         getWaitStatesSinceDef(Reg, IsVALUFn,
2581                               /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2582     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2583     if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2584       break;
2585   }
2586 
2587   int VCmpXHazardWaits =
2588       VCmpXWritesExecWaitStates -
2589       getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2590 
2591   WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2592   return WaitStatesNeeded;
2593 }
2594 
2595 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2596   // 2 pass -> 4
2597   // 4 pass -> 6
2598   // 8 pass -> 10
2599   // 16 pass -> 18
2600   return NumPasses + 2;
2601 }
2602 
2603 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2604   // 2 pass -> 5
2605   // 4 pass -> 7
2606   // 8 pass -> 11
2607   // 16 pass -> 19
2608   return NumPasses + 3;
2609 }
2610 
2611 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2612   // 2 pass -> 5
2613   // 4 pass -> 7
2614   // 8 pass -> 11
2615   // 16 pass -> 19
2616   return NumPasses + 3;
2617 }
2618 
2619 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2620   // 2 pass -> 4
2621   // 4 pass -> 6
2622   // 8 pass -> 10
2623   // 16 pass -> 18
2624   return NumPasses + 2;
2625 }
2626 
2627 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2628   if (!ST.hasGFX90AInsts())
2629     return 0;
2630 
2631   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2632     return isDGEMM(MI.getOpcode());
2633   };
2634 
2635   // This is checked in checkMAIHazards90A()
2636   if (SIInstrInfo::isMFMA(*MI))
2637     return 0;
2638 
2639   const MachineRegisterInfo &MRI = MF.getRegInfo();
2640 
2641   int WaitStatesNeeded = 0;
2642 
2643   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2644                SIInstrInfo::isFLAT(*MI) ||
2645                SIInstrInfo::isDS(*MI);
2646   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2647   bool IsVALU = SIInstrInfo::isVALU(*MI);
2648 
2649   const MachineInstr *MFMA = nullptr;
2650   unsigned Reg;
2651   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2652     if (!SIInstrInfo::isMFMA(MI) ||
2653         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2654       return false;
2655     MFMA = &MI;
2656     return true;
2657   };
2658 
2659   const MachineInstr *DOT = nullptr;
2660   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2661     if (!SIInstrInfo::isDOT(MI) ||
2662         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2663       return false;
2664     DOT = &MI;
2665     return true;
2666   };
2667 
2668   bool DGEMMAfterVALUWrite = false;
2669   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2670     // Found DGEMM on reverse traversal to def.
2671     if (isDGEMM(MI.getOpcode()))
2672       DGEMMAfterVALUWrite = true;
2673 
2674     // Only a hazard if the register is defined by a VALU and a DGEMM is found
2675     // after the def.
2676     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2677       return false;
2678 
2679     return true;
2680   };
2681 
2682   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2683                                            AMDGPU::OpName::src2);
2684 
2685   if (IsMemOrExport || IsVALU) {
2686     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2687     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2688     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2689     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2690     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2691     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2692     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2693     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2694     const int DotWriteSameDotReadSrcAB = 3;
2695     const int DotWriteDifferentVALURead = 3;
2696     const int DMFMABetweenVALUWriteVMEMRead = 2;
2697     const int MaxWaitStates = 19;
2698 
2699     for (const MachineOperand &Use : MI->explicit_uses()) {
2700       if (!Use.isReg())
2701         continue;
2702       Reg = Use.getReg();
2703 
2704       DOT = nullptr;
2705       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2706                                                      MaxWaitStates);
2707       if (DOT) {
2708         int NeedWaitStates = 0;
2709         if (DOT->getOpcode() == MI->getOpcode()) {
2710           if (&Use - &MI->getOperand(0) != SrcCIdx)
2711             NeedWaitStates = DotWriteSameDotReadSrcAB;
2712         } else {
2713           NeedWaitStates = DotWriteDifferentVALURead;
2714         }
2715 
2716         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2717         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2718       }
2719 
2720       // Workaround for a HW data hazard bug observed only on GFX90A. When a
2721       // DGEMM instruction sits between a VALU that writes a register and a VMEM
2722       // instruction that reads it, the SQ incorrectly fails to insert the two
2723       // wait states needed between the two instructions to avoid the data hazard.
2724       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2725         DGEMMAfterVALUWrite = false;
2726         if (TRI.isVectorRegister(MRI, Reg)) {
2727           int WaitStatesNeededForUse =
2728                 DMFMABetweenVALUWriteVMEMRead -
2729                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2730                                       DMFMABetweenVALUWriteVMEMRead);
2731 
2732           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2733         }
2734       }
2735 
2736       MFMA = nullptr;
2737       WaitStatesSinceDef =
2738           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2739       if (!MFMA)
2740         continue;
2741 
2742       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2743       int NumPasses = HazardDefLatency;
2744       int NeedWaitStates = MaxWaitStates;
2745 
2746       if (isDGEMM(MFMA->getOpcode())) {
2747         switch (HazardDefLatency) {
2748         case 4:
2749           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2750                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2751           break;
2752         case 8:
2753         case 16:
2754           NeedWaitStates =
2755               IsMemOrExport
2756                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2757                   : (ST.hasGFX950Insts()
2758                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2759                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2760           break;
2761         default:
2762           llvm_unreachable("unexpected dgemm");
2763         }
2764       } else if (ST.hasGFX940Insts()) {
2765         NeedWaitStates =
2766             isXDL(ST, *MFMA)
2767                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2768                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2769                       NumPasses);
2770       } else {
2771         switch (HazardDefLatency) {
2772         case 2:
2773           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2774           break;
2775         case 8:
2776           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2777           break;
2778         case 16:
2779           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2780           break;
2781         default:
2782           llvm_unreachable("unexpected number of passes for mfma");
2783         }
2784       }
2785 
2786       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2787       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2788 
2789       if (WaitStatesNeeded == MaxWaitStates)
2790         break;
2791     }
2792   }
2793 
2794   unsigned Opc = MI->getOpcode();
2795   const int DMFMAToFMA64WaitStates = 2;
2796   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2797        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2798        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2799       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2800     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2801       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2802     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2803   }
2804 
2805   if (!IsVALU && !IsMemOrExport)
2806     return WaitStatesNeeded;
2807 
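       // Also check MI's definitions: WAW against an earlier MFMA write of the
       // same register and WAR against an earlier MFMA that reads it as SrcC.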
2808   for (const MachineOperand &Def : MI->defs()) {
2809     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2810     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2811     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2812     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2813     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2814     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2815     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2816     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2817     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2818     const int DotWriteDifferentVALUWrite = 3;
2819     const int MaxWaitStates = 19;
2820     const int MaxWarWaitStates = 15;
2821 
2822     Reg = Def.getReg();
2823 
2824     DOT = nullptr;
2825     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2826                                                    MaxWaitStates);
2827     if (DOT && DOT->getOpcode() != MI->getOpcode())
2828       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2829                                                     WaitStatesSinceDef);
2830 
2831     MFMA = nullptr;
2832     WaitStatesSinceDef =
2833         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2834     if (MFMA) {
2835       int NeedWaitStates = MaxWaitStates;
2836       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2837 
2838       if (isDGEMM(MFMA->getOpcode())) {
2839         switch (NumPasses) {
2840         case 4:
2841           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2842           break;
2843         case 8:
2844         case 16:
2845           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2846           break;
2847         default:
2848           llvm_unreachable("unexpected number of cycles for dgemm");
2849         }
2850       } else if (ST.hasGFX940Insts()) {
2851         NeedWaitStates =
2852             isXDL(ST, *MFMA)
2853                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2854                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2855       } else {
2856         switch (NumPasses) {
2857         case 2:
2858           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2859           break;
2860         case 8:
2861           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2862           break;
2863         case 16:
2864           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2865           break;
2866         default:
2867           llvm_unreachable("Unexpected number of passes for mfma");
2868         }
2869       }
2870 
2871       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2872       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2873 
2874       if (WaitStatesNeeded == MaxWaitStates)
2875         break;
2876     }
2877 
2878     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2879       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2880           !MI.readsRegister(Reg, &TRI))
2881         return false;
2882 
2883       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2884         return false;
2885 
2886       const MachineOperand *SrcC =
2887           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2888       assert(SrcC);
2889       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2890         return false;
2891 
2892       MFMA = &MI;
2893       return true;
2894     };
2895 
2896     MFMA = nullptr;
2897     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2898                                                 MaxWarWaitStates);
2899     if (!MFMA)
2900       continue;
2901 
2902     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2903     int NeedWaitStates = MaxWaitStates;
2904     switch (HazardDefLatency) {
2905     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2906              break;
2907     case 4:  assert(ST.hasGFX940Insts());
2908              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2909              break;
2910     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2911              break;
2912     case 16: [[fallthrough]];
2913     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2914              break;
2915     }
2916 
2917     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2918     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2919   }
2920 
2921   return WaitStatesNeeded;
2922 }
2923 
2924 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2925   if (!SU->isInstr())
2926     return false;
2927 
2928   const MachineInstr *MAI = nullptr;
2929 
2930   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2931     MAI = nullptr;
2932     if (SIInstrInfo::isMFMA(MI))
2933       MAI = &MI;
2934     return MAI != nullptr;
2935   };
2936 
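       // If SU is an MFMA and the previous MFMA issued fewer wait states ago than
       // that MFMA's latency, ask the scheduler to prefer a different candidate.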
2937   MachineInstr *MI = SU->getInstr();
2938   if (IsMFMAFn(*MI)) {
2939     int W = getWaitStatesSince(IsMFMAFn, 16);
2940     if (MAI)
2941       return W < (int)TSchedModel.computeInstrLatency(MAI);
2942   }
2943 
2944   return false;
2945 }
2946 
2947 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2948 // insertion of a new instruction.
2949 static void updateGetPCBundle(MachineInstr *NewMI) {
2950   if (!NewMI->isBundled())
2951     return;
2952 
2953   // Find start of bundle.
2954   auto I = NewMI->getIterator();
2955   while (I->isBundledWithPred())
2956     I--;
2957   if (I->isBundle())
2958     I++;
2959 
2960   // Bail if this is not an S_GETPC bundle.
2961   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2962     return;
2963 
2964   // Update offsets of any references in the bundle.
2965   const unsigned NewBytes = 4;
2966   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2967          "Unexpected instruction insertion in bundle");
2968   auto NextMI = std::next(NewMI->getIterator());
2969   auto End = NewMI->getParent()->end();
2970   while (NextMI != End && NextMI->isBundledWithPred()) {
2971     for (auto &Operand : NextMI->operands()) {
2972       if (Operand.isGlobal())
2973         Operand.setOffset(Operand.getOffset() + NewBytes);
2974     }
2975     NextMI++;
2976   }
2977 }
2978 
2979 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2980   if (!ST.hasVALUMaskWriteHazard())
2981     return false;
2982   assert(!ST.hasExtendedWaitCounts());
2983 
2984   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2985     return false;
2986 
2987   // The hazard sequence is three instructions:
2988   //   1. VALU reads SGPR as mask
2989   //   2. SALU writes SGPR
2990   //   3. SALU reads SGPR
2991   // The hazard can expire if the distance between 2 and 3 is sufficient.
2992   // In practice this happens <10% of the time, hence, to avoid searching,
2993   // this code always assumes the hazard exists if 1 and 2 are present.
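  //
  // For illustration only (the opcodes shown are examples):
  //   v_cndmask_b32 v0, v1, v2, vcc     ; (1) VALU reads VCC as mask
  //   s_cselect_b64 vcc, -1, 0          ; (2) SALU writes VCC
  //   s_and_b64     s[0:1], vcc, s[2:3] ; (3) SALU reads VCC
  // The fix adds "s_waitcnt_depctr sa_sdst(0)" after (2).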
2994 
2995   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2996   if (!SDSTOp || !SDSTOp->isReg())
2997     return false;
2998 
2999   const Register HazardReg = SDSTOp->getReg();
3000   if (HazardReg == AMDGPU::EXEC ||
3001       HazardReg == AMDGPU::EXEC_LO ||
3002       HazardReg == AMDGPU::EXEC_HI ||
3003       HazardReg == AMDGPU::M0)
3004     return false;
3005 
3006   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
3007     switch (I.getOpcode()) {
3008     case AMDGPU::V_ADDC_U32_e32:
3009     case AMDGPU::V_ADDC_U32_dpp:
3010     case AMDGPU::V_CNDMASK_B16_e32:
3011     case AMDGPU::V_CNDMASK_B16_dpp:
3012     case AMDGPU::V_CNDMASK_B32_e32:
3013     case AMDGPU::V_CNDMASK_B32_dpp:
3014     case AMDGPU::V_DIV_FMAS_F32_e64:
3015     case AMDGPU::V_DIV_FMAS_F64_e64:
3016     case AMDGPU::V_SUBB_U32_e32:
3017     case AMDGPU::V_SUBB_U32_dpp:
3018     case AMDGPU::V_SUBBREV_U32_e32:
3019     case AMDGPU::V_SUBBREV_U32_dpp:
3020       // These implicitly read VCC as mask source.
3021       return HazardReg == AMDGPU::VCC ||
3022              HazardReg == AMDGPU::VCC_LO ||
3023              HazardReg == AMDGPU::VCC_HI;
3024     case AMDGPU::V_ADDC_U32_e64:
3025     case AMDGPU::V_ADDC_U32_e64_dpp:
3026     case AMDGPU::V_CNDMASK_B16_e64:
3027     case AMDGPU::V_CNDMASK_B16_e64_dpp:
3028     case AMDGPU::V_CNDMASK_B32_e64:
3029     case AMDGPU::V_CNDMASK_B32_e64_dpp:
3030     case AMDGPU::V_SUBB_U32_e64:
3031     case AMDGPU::V_SUBB_U32_e64_dpp:
3032     case AMDGPU::V_SUBBREV_U32_e64:
3033     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3034       // Only check mask register overlaps.
3035       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3036       assert(SSRCOp);
3037       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3038     }
3039     default:
3040       return false;
3041     }
3042   };
3043 
3044   const MachineRegisterInfo &MRI = MF.getRegInfo();
3045   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3046     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3047     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3048         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3049       return true;
3050 
3051     // VALU access to any SGPR or literal constant other than HazardReg
3052     // mitigates the hazard. No need to check HazardReg here as this is
3053     // only called when IsHazardFn has already returned false.
3054     if (!SIInstrInfo::isVALU(I))
3055       return false;
3056     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3057       const MachineOperand &Op = I.getOperand(OpNo);
3058       if (Op.isReg()) {
3059         Register OpReg = Op.getReg();
3060         // Only consider uses
3061         if (!Op.isUse())
3062           continue;
3063         // Ignore EXEC
3064         if (OpReg == AMDGPU::EXEC ||
3065             OpReg == AMDGPU::EXEC_LO ||
3066             OpReg == AMDGPU::EXEC_HI)
3067           continue;
3068         // Ignore all implicit uses except VCC
3069         if (Op.isImplicit()) {
3070           if (OpReg == AMDGPU::VCC ||
3071               OpReg == AMDGPU::VCC_LO ||
3072               OpReg == AMDGPU::VCC_HI)
3073             return true;
3074           continue;
3075         }
3076         if (TRI.isSGPRReg(MRI, OpReg))
3077           return true;
3078       } else {
3079         const MCInstrDesc &InstDesc = I.getDesc();
3080         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3081         if (!TII.isInlineConstant(Op, OpInfo))
3082           return true;
3083       }
3084     }
3085     return false;
3086   };
3087 
3088   // Check for hazard
3089   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3090       std::numeric_limits<int>::max())
3091     return false;
3092 
3093   auto NextMI = std::next(MI->getIterator());
3094 
3095   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3096   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3097                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3098                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3099 
3100   // SALU write may be s_getpc in a bundle.
3101   updateGetPCBundle(NewMI);
3102 
3103   return true;
3104 }
3105 
3106 // Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3107 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
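// For example (illustrative): an SGPR whose hardware encoding value is 37,
// i.e. SGPR37, maps to pair (37 >> 1) & 0x3f = 18, the SGPR36_SGPR37 pair.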
3108 static std::optional<unsigned> sgprPairNumber(Register Reg,
3109                                               const SIRegisterInfo &TRI) {
3110   switch (Reg) {
3111   case AMDGPU::M0:
3112   case AMDGPU::EXEC:
3113   case AMDGPU::EXEC_LO:
3114   case AMDGPU::EXEC_HI:
3115   case AMDGPU::SGPR_NULL:
3116   case AMDGPU::SGPR_NULL64:
3117     return {};
3118   default:
3119     break;
3120   }
3121   unsigned RegN = TRI.getEncodingValue(Reg);
3122   if (RegN > 127)
3123     return {};
3124   return (RegN >> 1) & 0x3f;
3125 }
3126 
3127 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3128 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3129   assert(MMF == &MF);
3130 
3131   // Assume non-empty vector means it has already been computed.
3132   if (!VALUReadHazardSGPRs.empty())
3133     return;
3134 
3135   auto CallingConv = MF.getFunction().getCallingConv();
3136   bool IsCallFree =
3137       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3138 
3139   // Exhaustive search is only viable in functions which neither make calls
3140   // nor are callees, where all VALUs will be exposed to the hazard recognizer.
3141   UseVALUReadHazardExhaustiveSearch =
3142       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3143       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3144 
3145   // Treat all SGPRs as hazards if the shader uses function calls or is a callee.
3146   bool UseVALUUseCache =
3147       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3148   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3149   if (!UseVALUUseCache)
3150     return;
3151 
3152   // Perform a post-order reverse scan to find VALUs which read an SGPR
3153   // before a SALU write to the same SGPR.  Compared to a linear scan, this
3154   // reduces hazard insertion when all VALU accesses to an SGPR occur after
3155   // its last SALU write.
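  //
  // For illustration (opcodes are examples only), in
  //   s_mov_b32    s4, 1         ; SALU write to the s[4:5] pair
  //   v_add_nc_u32 v0, s4, v1    ; VALU read of the s[4:5] pair
  // the reverse scan visits the VALU read first, finds no later SALU write
  // recorded for the pair, and therefore does not mark it as a hazard.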
3156   const MachineRegisterInfo &MRI = MF.getRegInfo();
3157   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3158   MachineCycleInfo CI;
3159   CI.compute(*MMF);
3160 
3161   for (auto *MBB : post_order(&MF)) {
3162     bool InCycle = CI.getCycle(MBB) != nullptr;
3163     for (auto &MI : reverse(MBB->instrs())) {
3164       bool IsVALU = SIInstrInfo::isVALU(MI);
3165       bool IsSALU = SIInstrInfo::isSALU(MI);
3166       if (!IsVALU && !IsSALU)
3167         continue;
3168 
3169       for (const MachineOperand &Op : MI.operands()) {
3170         if (!Op.isReg())
3171           continue;
3172         Register Reg = Op.getReg();
3173         assert(!Op.getSubReg());
3174         // For implicit operands, only consider VCC.
3175         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3176                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3177           continue;
3178         if (!TRI.isSGPRReg(MRI, Reg))
3179           continue;
3180         auto RegN = sgprPairNumber(Reg, TRI);
3181         if (!RegN)
3182           continue;
3183         if (IsVALU && Op.isUse()) {
3184           // Note: any access within a cycle must be considered a hazard.
3185           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3186             VALUReadHazardSGPRs.set(*RegN);
3187           ReadSGPRs.set(*RegN);
3188         } else if (IsSALU) {
3189           if (Op.isDef())
3190             SALUWriteSGPRs.set(*RegN);
3191           else
3192             ReadSGPRs.set(*RegN);
3193         }
3194       }
3195     }
3196   }
3197 }
3198 
3199 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3200   if (!ST.hasVALUReadSGPRHazard())
3201     return false;
3202 
3203   // The hazard sequence is fundamentally three instructions:
3204   //   1. VALU reads SGPR
3205   //   2. SALU writes SGPR
3206   //   3. VALU/SALU reads SGPR
3207   // Try to avoid searching for (1) because the expiry point of the hazard is
3208   // indeterminate; however, the hazard between (2) and (3) can expire if the
3209   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3210   // Note: SGPRs must be considered as 64-bit pairs, as the hazard exists
3211   // even if only individual SGPRs are accessed.
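  //
  // For illustration only (the opcodes shown are examples):
  //   v_cndmask_b32 v0, v1, v2, s[4:5]   ; (1) VALU reads s[4:5]
  //   s_mov_b32     s4, 0                ; (2) SALU writes into the pair
  //   s_add_u32     s6, s4, s8           ; (3) SALU reads the pair
  // The fix adds "s_wait_alu sa_sdst(0)" before (3).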
3212 
3213   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3214   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3215   if (!(MIIsSALU || MIIsVALU))
3216     return false;
3217 
3218   // Avoid the expensive search when compile time is the priority by
3219   // mitigating every SALU which writes an SGPR.
3220   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3221     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3222       return false;
3223 
3224     const MachineOperand *SDSTOp =
3225         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3226     if (!SDSTOp || !SDSTOp->isReg())
3227       return false;
3228 
3229     const Register HazardReg = SDSTOp->getReg();
3230     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3231         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3232       return false;
3233 
3234     // Add s_wait_alu sa_sdst(0) after SALU write.
3235     auto NextMI = std::next(MI->getIterator());
3236     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3237                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3238                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3239 
3240     // SALU write may be s_getpc in a bundle.
3241     updateGetPCBundle(NewMI);
3242 
3243     return true;
3244   }
3245 
3246   // Pre-compute set of SGPR pairs read by VALUs.
3247   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3248   computeVALUHazardSGPRs(MI->getMF());
3249 
3250   // If no VALU hazard SGPRs exist then there is nothing to do.
3251   if (VALUReadHazardSGPRs.none())
3252     return false;
3253 
3254   // All SGPR writes before a call/return must be flushed as the callee/caller
3255   // will not see the hazard chain, i.e. (2) to (3) described above.
3256   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3257                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3258                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3259 
3260   // Collect all SGPR sources for MI which are read by a VALU.
3261   const MachineRegisterInfo &MRI = MF.getRegInfo();
3262   SmallSet<Register, 4> SGPRsUsed;
3263 
3264   if (!IsSetPC) {
3265     for (const MachineOperand &Op : MI->all_uses()) {
3266       Register OpReg = Op.getReg();
3267 
3268       // Only consider VCC implicit uses on VALUs.
3269       // The only expected SALU implicit access is SCC which is no hazard.
3270       if (MIIsSALU && Op.isImplicit())
3271         continue;
3272 
3273       if (!TRI.isSGPRReg(MRI, OpReg))
3274         continue;
3275 
3276       auto RegN = sgprPairNumber(OpReg, TRI);
3277       if (!RegN)
3278         continue;
3279 
3280       if (!VALUReadHazardSGPRs[*RegN])
3281         continue;
3282 
3283       SGPRsUsed.insert(OpReg);
3284     }
3285 
3286     // No SGPRs -> nothing to do.
3287     if (SGPRsUsed.empty())
3288       return false;
3289   }
3290 
3291   // A hazard is any SALU which writes one of the SGPRs read by MI.
3292   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3293     if (!SIInstrInfo::isSALU(I))
3294       return false;
3295     // Ensure SGPR flush before call/return by conservatively assuming every
3296     // SALU writes an SGPR.
3297     if (IsSetPC && I.getNumDefs() > 0)
3298       return true;
3299     // Check for any register writes.
3300     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3301       return I.modifiesRegister(Reg, &TRI);
3302     });
3303   };
3304 
3305   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3306   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3307     if (Count >= SALUExpiryCount)
3308       return true;
3309     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3310     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3311         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3312       return true;
3313     return false;
3314   };
3315 
3316   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3317     // Only count true SALUs as wait states.
3318     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3319       return 0;
3320     // SALU must be unrelated to any hazard registers.
3321     if (any_of(SGPRsUsed,
3322                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3323       return 0;
3324     return 1;
3325   };
3326 
3327   // Check for the hazard.
3328   DenseSet<const MachineBasicBlock *> Visited;
3329   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3330                                         std::next(MI->getReverseIterator()), 0,
3331                                         IsExpiredFn, Visited, WaitStatesFn);
3332 
3333   if (WaitStates >= SALUExpiryCount)
3334     return false;
3335 
3336   // Validate hazard through an exhaustive search.
3337   if (UseVALUReadHazardExhaustiveSearch) {
3338     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3339     // This is searching for (1) in the hazard description.
3340     auto hazardPair = [this](Register Reg) {
3341       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3342         return Register(AMDGPU::VCC);
3343       auto RegN = sgprPairNumber(Reg, TRI);
3344       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3345     };
3346     auto SearchHazardFn = [this, hazardPair,
3347                            &SGPRsUsed](const MachineInstr &I) {
3348       if (!SIInstrInfo::isVALU(I))
3349         return false;
3350       // Check for any register reads.
3351       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3352         return I.readsRegister(hazardPair(Reg), &TRI);
3353       });
3354     };
3355     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3356       return false;
3357     };
3358     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3359         std::numeric_limits<int>::max())
3360       return false;
3361   }
3362 
3363   // Add s_wait_alu sa_sdst(0) before SALU read.
3364   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3365                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3366                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3367 
3368   // SALU read may be after s_getpc in a bundle.
3369   updateGetPCBundle(NewMI);
3370 
3371   return true;
3372 }
3373 
3374 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3375                                const SIInstrInfo &TII) {
3376   MachineBasicBlock &EntryMBB = MF->front();
3377   if (EntryMBB.begin() != EntryMBB.end()) {
3378     auto &EntryMI = *EntryMBB.begin();
3379     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3380         EntryMI.getOperand(0).getImm() >= Priority)
3381       return false;
3382   }
3383 
3384   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3385       .addImm(Priority);
3386   return true;
3387 }
3388 
3389 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3390   if (!ST.hasRequiredExportPriority())
3391     return false;
3392 
3393   // Assume the following shader types will never have exports,
3394   // and avoid adding or adjusting S_SETPRIO.
3395   MachineBasicBlock *MBB = MI->getParent();
3396   MachineFunction *MF = MBB->getParent();
3397   auto CC = MF->getFunction().getCallingConv();
3398   switch (CC) {
3399   case CallingConv::AMDGPU_CS:
3400   case CallingConv::AMDGPU_CS_Chain:
3401   case CallingConv::AMDGPU_CS_ChainPreserve:
3402   case CallingConv::AMDGPU_KERNEL:
3403     return false;
3404   default:
3405     break;
3406   }
3407 
3408   const int MaxPriority = 3;
3409   const int NormalPriority = 2;
3410   const int PostExportPriority = 0;
3411 
3412   auto It = MI->getIterator();
3413   switch (MI->getOpcode()) {
3414   case AMDGPU::S_ENDPGM:
3415   case AMDGPU::S_ENDPGM_SAVED:
3416   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3417   case AMDGPU::SI_RETURN_TO_EPILOG:
3418     // Ensure shader with calls raises priority at entry.
3419     // This ensures correct priority if exports exist in callee.
3420     if (MF->getFrameInfo().hasCalls())
3421       return ensureEntrySetPrio(MF, NormalPriority, TII);
3422     return false;
3423   case AMDGPU::S_SETPRIO: {
3424     // Raise minimum priority unless in workaround.
3425     auto &PrioOp = MI->getOperand(0);
3426     int Prio = PrioOp.getImm();
3427     bool InWA = (Prio == PostExportPriority) &&
3428                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3429     if (InWA || Prio >= NormalPriority)
3430       return false;
3431     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3432     return true;
3433   }
3434   default:
3435     if (!TII.isEXP(*MI))
3436       return false;
3437     break;
3438   }
3439 
3440   // Check entry priority at each export (as there will only be a few).
3441   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3442   bool Changed = false;
3443   if (CC != CallingConv::AMDGPU_Gfx)
3444     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3445 
3446   auto NextMI = std::next(It);
3447   bool EndOfShader = false;
3448   if (NextMI != MBB->end()) {
3449     // Only need WA at end of sequence of exports.
3450     if (TII.isEXP(*NextMI))
3451       return Changed;
3452     // Assume appropriate S_SETPRIO after export means WA already applied.
3453     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3454         NextMI->getOperand(0).getImm() == PostExportPriority)
3455       return Changed;
3456     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3457   }
3458 
3459   const DebugLoc &DL = MI->getDebugLoc();
3460 
3461   // Lower priority.
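  // The sequence emitted below is, in approximate assembly:
  //   s_setprio 0
  //   s_waitcnt_expcnt null, 0x0   ; skipped at the end of the shader
  //   s_nop 0
  //   s_nop 0
  //   s_setprio 2                  ; skipped at the end of the shader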
3462   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3463       .addImm(PostExportPriority);
3464 
3465   if (!EndOfShader) {
3466     // Wait for exports to complete.
3467     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3468         .addReg(AMDGPU::SGPR_NULL)
3469         .addImm(0);
3470   }
3471 
3472   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3473   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3474 
3475   if (!EndOfShader) {
3476     // Return to normal (higher) priority.
3477     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3478         .addImm(NormalPriority);
3479   }
3480 
3481   return true;
3482 }
3483