//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
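// For example, -amdgpu-mfma-padding-ratio=50 asks for roughly half of the
// latency between neighboring MFMAs to be covered with s_nops; the parser
// above rejects values outside [0, 100].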

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhaustive hazard search"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
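  // A larger lookahead window is needed when AGPRs are in use, since
  // MFMA-related hazards can require far more wait states than the common
  // case. The constants (19 vs. 5) presumably reflect the worst-case wait
  // state counts for the respective hazard sets.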
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
}

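// LDS DMA instructions (loads from buffer/global memory directly into LDS)
// carry both the VALU and the MUBUF/FLAT flags, which is the combination this
// predicate keys on.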
static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) &&
      checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

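// Emit S_NOPs covering \p Quantity wait states. A single S_NOP covers at most
// 8 wait states (its immediate encodes the count minus one), so larger counts
// are split into a chain of S_NOPs.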
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

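// Process a bundle one instruction at a time: check each bundled MI for
// hazards, materialize any required noops inside the bundle (in hazard
// recognizer mode), and mirror the instructions into EmittedInstrs so later
// hazard queries can see into the bundle.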
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
      SIInstrInfo::isEXP(*MI))
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first.  Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
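// Performs a depth-first search over the CFG, with \p Visited guarding
// against rescanning a block. \p IsHazard inspects each instruction against
// the caller's \p State; \p UpdateState folds every ordinary instruction into
// that state (bundles, inline asm and meta instructions are skipped).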
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search.
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

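// In hazard recognizer mode, walk the MIR backwards from the current
// instruction (and across predecessors); otherwise replay the EmittedInstrs
// queue maintained by AdvanceCycle(), where each nullptr entry represents a
// single wait state.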
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

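// Most of the check*Hazards routines below share the same arithmetic: for a
// hazard requiring N wait states, the number still outstanding is computed
// as, e.g.:
//
//   int WaitStatesNeeded = N - getWaitStatesSinceDef(Reg, IsHazardDefFn, N);
//
// A result <= 0 means enough instructions already separate the pair (the
// search saturates to INT_MAX when no hazard is found within the limit,
// which drives the subtraction safely negative).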
//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions.  The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI
    // where an s_mov writing a descriptor and an s_buffer_load_dword reading
    // the descriptor need some number of nops in between. We don't know how
    // many we need, but let's use 4. This wasn't discovered before probably
    // because the only case when this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in closed-source
    // land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes \p MI
/// and \returns an operand with a dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce forwarded
  // dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which write hi bits
  // (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with dest byte
  // sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
        return nullptr;
  } else {
    // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
    // with op_sel[3:2] != 0)
    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
              SISrcMods::DST_OP_SEL ||
          (AMDGPU::isFP8DstSelInst(Opcode) &&
           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
            SISrcMods::OP_SEL_0))))
      return nullptr;
  }

  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

/// Checks whether the provided \p VALU "consumes" the operand with a dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst.
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

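// Run each of the fix* mitigations; every routine scans backwards for its
// hazard and, when found, inserts a mitigating instruction (or patches a wait
// operand) in front of \p MI.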
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixVALUReadSGPRHazard(MI);
  fixRequiredExportPriority(MI);
}

static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

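// WAR hazard: an SALU or SMEM write to a register that an in-flight
// VMEM/DS/FLAT access still reads. If no VALU or sufficient wait intervenes,
// an s_waitcnt_depctr with vm_vsrc(0) is inserted to drain the outstanding
// vector source reads.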
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

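// Hazard between an in-flight SMEM load and a later VALU write to its SGPR
// destination. When nothing mitigating is found, an independent SALU
// (s_mov_b32 to null) is inserted to break the chain, per the expiry rules
// encoded below.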
1300 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1301   if (!ST.hasSMEMtoVectorWriteHazard())
1302     return false;
1303   assert(!ST.hasExtendedWaitCounts());
1304 
1305   if (!SIInstrInfo::isVALU(*MI))
1306     return false;
1307 
1308   unsigned SDSTName;
1309   switch (MI->getOpcode()) {
1310   case AMDGPU::V_READLANE_B32:
1311   case AMDGPU::V_READFIRSTLANE_B32:
1312     SDSTName = AMDGPU::OpName::vdst;
1313     break;
1314   default:
1315     SDSTName = AMDGPU::OpName::sdst;
1316     break;
1317   }
1318 
1319   const SIInstrInfo *TII = ST.getInstrInfo();
1320   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1321   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1322   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1323   if (!SDST) {
1324     for (const auto &MO : MI->implicit_operands()) {
1325       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1326         SDST = &MO;
1327         break;
1328       }
1329     }
1330   }
1331 
1332   if (!SDST)
1333     return false;
1334 
1335   const Register SDSTReg = SDST->getReg();
1336   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1337     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1338   };
1339 
1340   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1341     if (TII->isSALU(MI)) {
1342       switch (MI.getOpcode()) {
1343       case AMDGPU::S_SETVSKIP:
1344       case AMDGPU::S_VERSION:
1345       case AMDGPU::S_WAITCNT_VSCNT:
1346       case AMDGPU::S_WAITCNT_VMCNT:
1347       case AMDGPU::S_WAITCNT_EXPCNT:
1348         // These instructions cannot not mitigate the hazard.
1349         return false;
1350       case AMDGPU::S_WAITCNT_LGKMCNT:
1351         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1352         return (MI.getOperand(1).getImm() == 0) &&
1353                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1354       case AMDGPU::S_WAITCNT: {
1355         const int64_t Imm = MI.getOperand(0).getImm();
1356         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1357         // DsCnt corresponds to LGKMCnt here.
1358         return (Decoded.DsCnt == 0);
1359       }
1360       default:
1361         // SOPP instructions cannot mitigate the hazard.
1362         if (TII->isSOPP(MI))
1363           return false;
1364         // At this point the SALU can be assumed to mitigate the hazard
1365         // because either:
1366         // (a) it is independent of the at risk SMEM (breaking chain),
1367         // or
1368         // (b) it is dependent on the SMEM, in which case an appropriate
1369         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1370         //     SMEM instruction.
1371         return true;
1372       }
1373     }
1374     return false;
1375   };
1376 
1377   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1378       std::numeric_limits<int>::max())
1379     return false;
1380 
1381   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1382           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1383       .addImm(0);
1384   return true;
1385 }
1386 
1387 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1388   if (!ST.hasVcmpxExecWARHazard())
1389     return false;
1390   assert(!ST.hasExtendedWaitCounts());
1391 
1392   if (!SIInstrInfo::isVALU(*MI))
1393     return false;
1394 
1395   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1396   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1397     return false;
1398 
1399   auto IsHazardFn = [TRI](const MachineInstr &I) {
1400     if (SIInstrInfo::isVALU(I))
1401       return false;
1402     return I.readsRegister(AMDGPU::EXEC, TRI);
1403   };
1404 
1405   const SIInstrInfo *TII = ST.getInstrInfo();
1406   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1407     if (SIInstrInfo::isVALU(MI)) {
1408       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1409         return true;
1410       for (auto MO : MI.implicit_operands())
1411         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1412           return true;
1413     }
1414     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1415         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1416       return true;
1417     return false;
1418   };
1419 
1420   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1421       std::numeric_limits<int>::max())
1422     return false;
1423 
1424   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1425           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1426       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1427   return true;
1428 }
1429 
1430 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1431                                                  const GCNSubtarget &ST) {
1432   if (!ST.hasLdsBranchVmemWARHazard())
1433     return false;
1434 
1435   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1436   // instructions need to appear in the same function.
1437   bool HasLds = false;
1438   bool HasVmem = false;
1439   for (auto &MBB : MF) {
1440     for (auto &MI : MBB) {
1441       HasLds |= SIInstrInfo::isDS(MI);
1442       HasVmem |=
1443           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1444       if (HasLds && HasVmem)
1445         return true;
1446     }
1447   }
1448   return false;
1449 }
1450 
1451 static bool isStoreCountWaitZero(const MachineInstr &I) {
1452   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1453          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1454          !I.getOperand(1).getImm();
1455 }
1456 
1457 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1458   if (!RunLdsBranchVmemWARHazardFixup)
1459     return false;
1460 
1461   assert(ST.hasLdsBranchVmemWARHazard());
1462   assert(!ST.hasExtendedWaitCounts());
1463 
1464   auto IsHazardInst = [](const MachineInstr &MI) {
1465     if (SIInstrInfo::isDS(MI))
1466       return 1;
1467     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1468       return 2;
1469     return 0;
1470   };
1471 
1472   auto InstType = IsHazardInst(*MI);
1473   if (!InstType)
1474     return false;
1475 
1476   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1477     return IsHazardInst(I) || isStoreCountWaitZero(I);
1478   };
1479 
1480   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1481     if (!I.isBranch())
1482       return false;
1483 
1484     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1485       auto InstType2 = IsHazardInst(I);
1486       return InstType2 && InstType != InstType2;
1487     };
1488 
1489     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1490       auto InstType2 = IsHazardInst(I);
1491       if (InstType == InstType2)
1492         return true;
1493 
1494       return isStoreCountWaitZero(I);
1495     };
1496 
1497     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1498            std::numeric_limits<int>::max();
1499   };
1500 
1501   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1502       std::numeric_limits<int>::max())
1503     return false;
1504 
1505   const SIInstrInfo *TII = ST.getInstrInfo();
1506   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1507           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1508     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1509     .addImm(0);
1510 
1511   return true;
1512 }
1513 
1514 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1515   if (!SIInstrInfo::isLDSDIR(*MI))
1516     return false;
1517 
1518   const int NoHazardWaitStates = 15;
1519   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1520   const Register VDSTReg = VDST->getReg();
1521 
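  // Count VALUs executed since the last VALU access of VDSTReg and encode that
  // distance (clamped to 15, meaning no wait) into the waitvdst operand below.
  // Illustrative (assumed) outcome:
  //   v_add_f32 v1, ...             ; VALU writes v1
  //   v_mov_b32 v5, ...             ; unrelated VALU
  //   lds_direct_load v1 wait_vdst:1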
1522   bool VisitedTrans = false;
1523   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1524     if (!SIInstrInfo::isVALU(I))
1525       return false;
1526     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1527     // Cover both WAR and WAW
1528     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1529   };
1530   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1531     if (WaitStates >= NoHazardWaitStates)
1532       return true;
1533     // Instructions which cause va_vdst==0 expire the hazard
1534     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1535            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1536   };
1537   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1538     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1539   };
1540 
1541   DenseSet<const MachineBasicBlock *> Visited;
1542   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1543                                     std::next(MI->getReverseIterator()), 0,
1544                                     IsExpiredFn, Visited, GetWaitStatesFn);
1545 
1546   // Transcendentals can execute in parallel with other VALUs.
1547   // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1548   if (VisitedTrans)
1549     Count = 0;
1550 
1551   MachineOperand *WaitVdstOp =
1552       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1553   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1554 
1555   return true;
1556 }
1557 
1558 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1559   if (!SIInstrInfo::isLDSDIR(*MI))
1560     return false;
1561 
1562   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1563   const Register VDSTReg = VDST->getReg();
1564 
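  // Illustrative (assumed) hazard: a VMEM/FLAT/DS operation still using
  // VDSTReg when the LDSDIR write lands, e.g.:
  //   buffer_store_dword v1, ...    ; VMEM reads v1
  //   lds_direct_load v1            ; would overwrite v1 too early
  // Mitigated below by setting wait_vsrc:0 on the LDSDIR where supported,
  // otherwise by an S_WAITCNT_DEPCTR with vm_vsrc(0).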
1565   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1566     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1567         !SIInstrInfo::isDS(I))
1568       return false;
1569     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1570   };
1571   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1572   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1573   // according to the type of VMEM instruction.
1574   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1575     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1576            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1577            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1578             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1579            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1580             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1581   };
1582 
1583   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1584       std::numeric_limits<int>::max())
1585     return false;
1586 
1587   if (LdsdirCanWait) {
1588     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1589   } else {
1590     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1591             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1592         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1593   }
1594 
1595   return true;
1596 }
1597 
1598 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1599   if (!ST.hasVALUPartialForwardingHazard())
1600     return false;
1601   assert(!ST.hasExtendedWaitCounts());
1602 
1603   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1604     return false;
1605 
1606   SmallSetVector<Register, 4> SrcVGPRs;
1607 
1608   for (const MachineOperand &Use : MI->explicit_uses()) {
1609     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1610       SrcVGPRs.insert(Use.getReg());
1611   }
1612 
1613   // Only applies with >= 2 unique VGPR sources
1614   if (SrcVGPRs.size() <= 1)
1615     return false;
1616 
1617   // Look for the following pattern:
1618   //   Va <- VALU [PreExecPos]
1619   //   intv1
1620   //   Exec <- SALU [ExecPos]
1621   //   intv2
1622   //   Vb <- VALU [PostExecPos]
1623   //   intv3
1624   //   MI Va, Vb (WaitState = 0)
1625   //
1626   // Where:
1627   // intv1 + intv2 <= 2 VALUs
1628   // intv3 <= 4 VALUs
1629   //
1630   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1631 
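  // Illustrative (assumed) instance of the pattern above:
  //   v_mov_b32 v0, ...             ; Va <- VALU
  //   s_mov_b64 exec, -1            ; Exec <- SALU
  //   v_mov_b32 v1, ...             ; Vb <- VALU
  //   v_add_f32 v2, v0, v1          ; MI reads both Va and Vb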
1632   const int Intv1plus2MaxVALUs = 2;
1633   const int Intv3MaxVALUs = 4;
1634   const int IntvMaxVALUs = 6;
1635   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1636 
1637   struct StateType {
1638     SmallDenseMap<Register, int, 4> DefPos;
1639     int ExecPos = std::numeric_limits<int>::max();
1640     int VALUs = 0;
1641   };
1642 
1643   StateType State;
1644 
1645   // This combines expiry testing with all of the hazard detection
1646   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1647     // Too many VALU states have passed
1648     if (State.VALUs > NoHazardVALUWaitStates)
1649       return HazardExpired;
1650 
1651     // Instructions which cause va_vdst==0 expire the hazard
1652     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1653         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1654         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1655          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1656       return HazardExpired;
1657 
1658     // Track register writes
1659     bool Changed = false;
1660     if (SIInstrInfo::isVALU(I)) {
1661       for (Register Src : SrcVGPRs) {
1662         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1663           State.DefPos[Src] = State.VALUs;
1664           Changed = true;
1665         }
1666       }
1667     } else if (SIInstrInfo::isSALU(I)) {
1668       if (State.ExecPos == std::numeric_limits<int>::max()) {
1669         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1670           State.ExecPos = State.VALUs;
1671           Changed = true;
1672         }
1673       }
1674     }
1675 
1676     // Early expiration: too many VALUs in intv3
1677     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1678       return HazardExpired;
1679 
1680     // Only evaluate state if something changed
1681     if (!Changed)
1682       return NoHazardFound;
1683 
1684     // Determine positions of VALUs pre/post exec change
1685     if (State.ExecPos == std::numeric_limits<int>::max())
1686       return NoHazardFound;
1687 
1688     int PreExecPos = std::numeric_limits<int>::max();
1689     int PostExecPos = std::numeric_limits<int>::max();
1690 
1691     for (const auto &Entry : State.DefPos) {
1692       int DefVALUs = Entry.second;
1693       if (DefVALUs != std::numeric_limits<int>::max()) {
1694         if (DefVALUs >= State.ExecPos)
1695           PreExecPos = std::min(PreExecPos, DefVALUs);
1696         else
1697           PostExecPos = std::min(PostExecPos, DefVALUs);
1698       }
1699     }
1700 
1701     // Need a VALU def after the exec change
1702     if (PostExecPos == std::numeric_limits<int>::max())
1703       return NoHazardFound;
1704 
1705     // Too many VALUs in intv3?
1706     int Intv3VALUs = PostExecPos;
1707     if (Intv3VALUs > Intv3MaxVALUs)
1708       return HazardExpired;
1709 
1710     // Too many VALUs in intv2?
1711     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1712     if (Intv2VALUs > Intv1plus2MaxVALUs)
1713       return HazardExpired;
1714 
1715     // Need a VALU def before the exec change
1716     if (PreExecPos == std::numeric_limits<int>::max())
1717       return NoHazardFound;
1718 
1719     // Too many VALUs in intv1?
1720     int Intv1VALUs = PreExecPos - State.ExecPos;
1721     if (Intv1VALUs > Intv1plus2MaxVALUs)
1722       return HazardExpired;
1723 
1724     // Too many VALUs in intv1 + intv2?
1725     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1726       return HazardExpired;
1727 
1728     return HazardFound;
1729   };
1730   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1731     if (SIInstrInfo::isVALU(MI))
1732       State.VALUs += 1;
1733   };
1734 
1735   DenseSet<const MachineBasicBlock *> Visited;
1736   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1737                             std::next(MI->getReverseIterator()), Visited))
1738     return false;
1739 
1740   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1741           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1742       .addImm(0x0fff);
1743 
1744   return true;
1745 }
1746 
1747 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1748   if (!ST.hasVALUTransUseHazard())
1749     return false;
1750   assert(!ST.hasExtendedWaitCounts());
1751 
1752   if (!SIInstrInfo::isVALU(*MI))
1753     return false;
1754 
1755   SmallSet<Register, 4> SrcVGPRs;
1756 
1757   for (const MachineOperand &Use : MI->explicit_uses()) {
1758     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1759       SrcVGPRs.insert(Use.getReg());
1760   }
1761 
1762   // Look for the following pattern:
1763   //   Va <- TRANS VALU
1764   //   intv
1765   //   MI Va (WaitState = 0)
1766   //
1767   // Where:
1768   // intv <= 5 VALUs / 1 TRANS
1769   //
1770   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1771 
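  // Illustrative (assumed) instance of the pattern above:
  //   v_exp_f32 v0, v1              ; Va <- TRANS
  //   v_add_f32 v2, v0, v3          ; MI reads Va within the interval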
1772   const int IntvMaxVALUs = 5;
1773   const int IntvMaxTRANS = 1;
1774 
1775   struct StateType {
1776     int VALUs = 0;
1777     int TRANS = 0;
1778   };
1779 
1780   StateType State;
1781 
1782   // This overloads expiry testing with all the hazard detection
1783   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1784     // Too many VALU states have passed
1785     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1786       return HazardExpired;
1787 
1788     // Instructions which cause va_vdst==0 expire the hazard
1789     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1790         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1791         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1792          I.getOperand(0).getImm() == 0x0fff))
1793       return HazardExpired;
1794 
1795     // Track register writes
1796     if (SIInstrInfo::isTRANS(I)) {
1797       for (Register Src : SrcVGPRs) {
1798         if (I.modifiesRegister(Src, &TRI)) {
1799           return HazardFound;
1800         }
1801       }
1802     }
1803 
1804     return NoHazardFound;
1805   };
1806   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1807     if (SIInstrInfo::isVALU(MI))
1808       State.VALUs += 1;
1809     if (SIInstrInfo::isTRANS(MI))
1810       State.TRANS += 1;
1811   };
1812 
1813   DenseSet<const MachineBasicBlock *> Visited;
1814   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1815                             std::next(MI->getReverseIterator()), Visited))
1816     return false;
1817 
1818   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1819   // hazard is avoided.
1820   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1821           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1822       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1823 
1824   return true;
1825 }
1826 
1827 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1828   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1829     return false;
1830 
1831   const SIInstrInfo *TII = ST.getInstrInfo();
1832   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1833 
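  // Illustrative (assumed) back-to-back hazard:
  //   v_wmma_f32_16x16x16_f16 v[0:7], ...         ; writes matrix D
  //   v_wmma_f32_16x16x16_f16 ..., v[0:7], ...    ; reads it as matrix A or B
  // A single V_NOP between the two, as inserted below, breaks the hazard.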
1834   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1835     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1836       return false;
1837 
1838     // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1839     // overlaps with the dest (matrix D) of the previous WMMA.
1840     const Register CurSrc0Reg =
1841         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1842     const Register CurSrc1Reg =
1843         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1844 
1845     const Register PrevDstReg =
1846         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1847 
1848     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1849         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1850       return true;
1851     }
1852 
1853     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1854     // but Index can't overlap with PrevDstReg.
1855     if (AMDGPU::isGFX12Plus(ST)) {
1856       if (SIInstrInfo::isSWMMAC(*MI)) {
1857         const Register CurIndex =
1858             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1859         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1860           return true;
1861       }
1862       return false;
1863     }
1864 
1865     return false;
1866   };
1867 
1868   auto IsExpiredFn = [](const MachineInstr &I, int) {
1869     return SIInstrInfo::isVALU(I);
1870   };
1871 
1872   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1873       std::numeric_limits<int>::max())
1874     return false;
1875 
1876   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1877 
1878   return true;
1879 }
1880 
1881 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1882   if (!ST.hasShift64HighRegBug())
1883     return false;
1884   assert(!ST.hasExtendedWaitCounts());
1885 
1886   switch (MI->getOpcode()) {
1887   default:
1888     return false;
1889   case AMDGPU::V_LSHLREV_B64_e64:
1890   case AMDGPU::V_LSHRREV_B64_e64:
1891   case AMDGPU::V_ASHRREV_I64_e64:
1892     break;
1893   }
1894 
1895   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1896   if (!Amt->isReg())
1897     return false;
1898 
1899   Register AmtReg = Amt->getReg();
1900   const MachineRegisterInfo &MRI = MF.getRegInfo();
1901   // Check if this is the last VGPR in the allocation block.
1902   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1903     return false;
1904 
1905   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1906     return false;
1907 
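  // Illustrative (assumed) trigger: the shift amount lives in v7, the last
  // register of an 8-VGPR allocation block, with v8 unused:
  //   v_lshlrev_b64 v[0:1], v7, v[2:3]
  // The code below swaps v7 with a hazard-free VGPR around the shift.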
1908   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1909   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1910   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1911   bool Overlapped = OverlappedSrc || OverlappedDst;
1912 
1913   assert(!OverlappedDst || !OverlappedSrc ||
1914          Src1->getReg() == MI->getOperand(0).getReg());
1915   assert(ST.needsAlignedVGPRs());
1916   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1917 
1918   Register NewReg;
1919   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1920                                    : AMDGPU::VGPR_32RegClass) {
1921     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1922       NewReg = Reg;
1923       break;
1924     }
1925   }
1926 
1927   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1928                                : NewReg;
1929   Register NewAmtLo;
1930 
1931   if (Overlapped)
1932     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1933 
1934   DebugLoc DL = MI->getDebugLoc();
1935   MachineBasicBlock *MBB = MI->getParent();
1936   // Insert a full wait count because the found register might have a pending wait.
1937   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1938       .addImm(0);
1939 
1940   // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
1941   if (Overlapped)
1942     runOnInstruction(
1943         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1944             .addDef(AmtReg - 1)
1945             .addReg(AmtReg - 1, RegState::Undef)
1946             .addReg(NewAmtLo, RegState::Undef));
1947   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1948                        .addDef(AmtReg)
1949                        .addReg(AmtReg, RegState::Undef)
1950                        .addReg(NewAmt, RegState::Undef));
1951 
1952   // Instructions emitted after the current instruction will be processed by the
1953   // parent loop of the hazard recognizer in a natural way.
1954   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1955           AmtReg)
1956       .addDef(NewAmt)
1957       .addReg(NewAmt)
1958       .addReg(AmtReg);
1959   if (Overlapped)
1960     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1961             AmtReg - 1)
1962         .addDef(NewAmtLo)
1963         .addReg(NewAmtLo)
1964         .addReg(AmtReg - 1);
1965 
1966   // Re-running the hazard recognizer on the modified instruction is not needed:
1967   // the inserted V_SWAP_B32 has already both read and written the new registers,
1968   // so hazards related to these registers have already been handled.
1969   Amt->setReg(NewAmt);
1970   Amt->setIsKill(false);
1971   // We do not update liveness, so the verifier may see it as undef.
1972   Amt->setIsUndef();
1973   if (OverlappedDst)
1974     MI->getOperand(0).setReg(NewReg);
1975   if (OverlappedSrc) {
1976     Src1->setReg(NewReg);
1977     Src1->setIsKill(false);
1978     Src1->setIsUndef();
1979   }
1980 
1981   return true;
1982 }
1983 
1984 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1985   int NSAtoVMEMWaitStates = 1;
1986 
1987   if (!ST.hasNSAtoVMEMBug())
1988     return 0;
1989 
1990   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1991     return 0;
1992 
1993   const SIInstrInfo *TII = ST.getInstrInfo();
1994   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1995   if (!Offset || (Offset->getImm() & 6) == 0)
1996     return 0;
1997 
1998   auto IsHazardFn = [TII](const MachineInstr &I) {
1999     if (!SIInstrInfo::isMIMG(I))
2000       return false;
2001     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2002     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2003            TII->getInstSizeInBytes(I) >= 16;
2004   };
2005 
2006   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2007 }
2008 
2009 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2010   int FPAtomicToDenormModeWaitStates = 3;
2011 
2012   if (!ST.hasFPAtomicToDenormModeHazard())
2013     return 0;
2014   assert(!ST.hasExtendedWaitCounts());
2015 
2016   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2017     return 0;
2018 
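  // Illustrative (assumed) hazard sequence:
  //   global_atomic_add_f32 ...     ; FP atomic still in flight
  //   s_denorm_mode ...             ; needs up to 3 wait states after it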
2019   auto IsHazardFn = [](const MachineInstr &I) {
2020     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2021       return false;
2022     return SIInstrInfo::isFPAtomic(I);
2023   };
2024 
2025   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2026     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2027       return true;
2028 
2029     switch (MI.getOpcode()) {
2030     case AMDGPU::S_WAITCNT:
2031     case AMDGPU::S_WAITCNT_VSCNT:
2032     case AMDGPU::S_WAITCNT_VMCNT:
2033     case AMDGPU::S_WAITCNT_EXPCNT:
2034     case AMDGPU::S_WAITCNT_LGKMCNT:
2035     case AMDGPU::S_WAIT_IDLE:
2036       return true;
2037     default:
2038       break;
2039     }
2040 
2041     return false;
2042   };
2043 
2044   return FPAtomicToDenormModeWaitStates -
2045          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2046 }
2047 
2048 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2049   assert(SIInstrInfo::isMAI(*MI));
2050 
2051   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2052 }
2053 
2054 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2055   // Early exit if no padding is requested.
2056   if (MFMAPaddingRatio == 0)
2057     return 0;
2058 
2059   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2060   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2061     return 0;
2062 
2063   int NeighborMFMALatency = 0;
2064   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2065                             this](const MachineInstr &MI) {
2066     if (!SIInstrInfo::isMFMA(MI))
2067       return false;
2068 
2069     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2070     return true;
2071   };
2072 
2073   const int MaxMFMAPipelineWaitStates = 16;
2074   int WaitStatesSinceNeighborMFMA =
2075       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2076 
2077   int NeighborMFMAPaddingNeeded =
2078       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2079       WaitStatesSinceNeighborMFMA;
2080 
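  // Worked example (illustrative): with -amdgpu-mfma-padding-ratio=50 and a
  // neighboring MFMA of 16-cycle latency, the target padding is
  // 16 * 50 / 100 = 8 wait states; if 3 have already elapsed, 5 remain.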
2081   return std::max(0, NeighborMFMAPaddingNeeded);
2082 }
2083 
2084 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2085   int WaitStatesNeeded = 0;
2086   unsigned Opc = MI->getOpcode();
2087 
2088   auto IsVALUFn = [](const MachineInstr &MI) {
2089     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2090   };
2091 
2092   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2093     const int LegacyVALUWritesVGPRWaitStates = 2;
2094     const int VALUWritesExecWaitStates = 4;
2095     const int MaxWaitStates = 4;
2096 
2097     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2098       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2099     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2100 
2101     if (WaitStatesNeeded < MaxWaitStates) {
2102       for (const MachineOperand &Use : MI->explicit_uses()) {
2103         const int MaxWaitStates = 2;
2104 
2105         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2106           continue;
2107 
2108         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2109           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2110         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2111 
2112         if (WaitStatesNeeded == MaxWaitStates)
2113           break;
2114       }
2115     }
2116   }
2117 
2118   for (const MachineOperand &Op : MI->explicit_operands()) {
2119     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2120       continue;
2121 
2122     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2123       continue;
2124 
2125     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2126     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2127     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2128     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2129     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2130     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2131     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2132     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2133     const int MaxWaitStates = 18;
2134     Register Reg = Op.getReg();
2135     unsigned HazardDefLatency = 0;
2136 
2137     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2138                                this](const MachineInstr &MI) {
2139       if (!SIInstrInfo::isMFMA(MI))
2140         return false;
2141       Register DstReg = MI.getOperand(0).getReg();
2142       if (DstReg == Reg)
2143         return false;
2144       HazardDefLatency =
2145           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2146       return TRI.regsOverlap(DstReg, Reg);
2147     };
2148 
2149     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2150                                                    MaxWaitStates);
2151     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2152     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2153     int OpNo = Op.getOperandNo();
2154     if (OpNo == SrcCIdx) {
2155       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2156     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2157       switch (HazardDefLatency) {
2158       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2159                break;
2160       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2161                break;
2162       case 16: [[fallthrough]];
2163       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2164                break;
2165       }
2166     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2167       switch (HazardDefLatency) {
2168       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2169                break;
2170       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2171                break;
2172       case 16: [[fallthrough]];
2173       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2174                break;
2175       }
2176     }
2177 
2178     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2179     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2180 
2181     if (WaitStatesNeeded == MaxWaitStates)
2182       return WaitStatesNeeded; // Early exit.
2183 
2184     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2185       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2186         return false;
2187       Register DstReg = MI.getOperand(0).getReg();
2188       return TRI.regsOverlap(Reg, DstReg);
2189     };
2190 
2191     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2192     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2193     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2194     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2195     if (OpNo == SrcCIdx)
2196       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2197     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2198       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2199 
2200     WaitStatesNeededForUse = NeedWaitStates -
2201       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2202     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2203 
2204     if (WaitStatesNeeded == MaxWaitStates)
2205       return WaitStatesNeeded; // Early exit.
2206   }
2207 
2208   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2209     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2210     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2211     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2212     const int MaxWaitStates = 13;
2213     Register DstReg = MI->getOperand(0).getReg();
2214     unsigned HazardDefLatency = 0;
2215 
2216     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2217                          this](const MachineInstr &MI) {
2218       if (!SIInstrInfo::isMFMA(MI))
2219         return false;
2220       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2221       HazardDefLatency =
2222           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2223       return TRI.regsOverlap(Reg, DstReg);
2224     };
2225 
2226     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2227     int NeedWaitStates;
2228     switch (HazardDefLatency) {
2229     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2230              break;
2231     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2232              break;
2233     case 16: [[fallthrough]];
2234     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2235              break;
2236     }
2237 
2238     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2239     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2240   }
2241 
2242   // Pad neighboring MFMA with noops for better inter-wave performance.
2243   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2244 
2245   return WaitStatesNeeded;
2246 }
2247 
2248 static int
2249 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2250                                                               bool IsGFX950) {
2251   // xdl def cycles | gfx940 | gfx950
2252   // 2 pass         |  3        4
2253   // 4 pass         |  5        6
2254   // 8 pass         |  9        10
2255   // 16 pass        |  17       18
2256   return NumPasses + 1 + IsGFX950;
2257 }
2258 
2259 static int
2260 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2261                                                               bool IsGFX950) {
2262   // xdl def cycles | gfx940 | gfx950
2263   // 2 pass         |  3        3
2264   // 4 pass         |  5        6
2265   // 8 pass         |  9        10
2266   // 16 pass        |  17       18
2267   return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2268 }
2269 
2270 static int
2271 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2272   // 2 pass -> 2
2273   // 4 pass -> 4
2274   // 8 pass -> 8
2275   // 16 pass -> 16
2276   return NumPasses;
2277 }
2278 
2279 static int
2280 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2281   // 2 pass -> 4
2282   // 4 pass -> 6
2283   // 8 pass -> 10
2284   // 16 pass -> 18
2285   return NumPasses + 2;
2286 }
2287 
2288 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2289   // 2 pass -> 5
2290   // 4 pass -> 7
2291   // 8 pass -> 11
2292   // 16 pass -> 19
2293   return NumPasses + 3;
2294 }
2295 
2296 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2297   int WaitStatesNeeded = 0;
2298   unsigned Opc = MI->getOpcode();
2299 
2300   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2301     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2302   };
2303 
2304   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2305     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2306            !SIInstrInfo::isDOT(MI);
2307   };
2308 
2309   if (!SIInstrInfo::isMFMA(*MI))
2310     return WaitStatesNeeded;
2311 
2312   const int VALUWritesExecWaitStates = 4;
2313   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2314     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2315                           VALUWritesExecWaitStates);
2316   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2317 
2318   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2319 
2320   // Loop over the uses, covering both DGEMM and S/HGEMM as the second instruction.
2321   for (const MachineOperand &Use : MI->explicit_uses()) {
2322     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2323     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2324     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2325     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2326     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2327     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2328     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2329     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2330     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2331     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2332     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2333     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2334     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2335     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2336     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2337     const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2338     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2339     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2340     const int MaxWaitStates = 19;
2341 
2342     if (!Use.isReg())
2343       continue;
2344     Register Reg = Use.getReg();
2345     bool FullReg;
2346     const MachineInstr *MI1;
2347 
2348     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2349                                this](const MachineInstr &MI) {
2350       if (!SIInstrInfo::isMFMA(MI))
2351         return false;
2352       Register DstReg = MI.getOperand(0).getReg();
2353       FullReg = (DstReg == Reg);
2354       MI1 = &MI;
2355       return TRI.regsOverlap(DstReg, Reg);
2356     };
2357 
2358     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2359       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2360     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2361 
2362     int NumWaitStates =
2363         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2364     if (NumWaitStates == std::numeric_limits<int>::max())
2365       continue;
2366 
2367     int OpNo = Use.getOperandNo();
2368     unsigned Opc1 = MI1->getOpcode();
2369     int NeedWaitStates = 0;
2370     if (OpNo == SrcCIdx) {
2371       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2372         NeedWaitStates = 0;
2373       } else if (FullReg) {
2374         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2375              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2376             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2377              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2378           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2379         else if (ST.hasGFX940Insts() &&
2380                  TSchedModel.computeInstrLatency(MI1) == 2)
2381           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2382       } else {
2383         switch (Opc1) {
2384         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2385         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2386         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2387         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2388           if (!isXDL(ST, *MI))
2389             NeedWaitStates =
2390                 ST.hasGFX950Insts()
2391                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2392                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2393           break;
2394         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2395         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2396           if (!isXDL(ST, *MI))
2397             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2398           break;
2399         default:
2400           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2401           if (ST.hasGFX940Insts()) {
2402             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2403               break;
2404 
2405             NeedWaitStates =
2406                 isXDL(ST, *MI1)
2407                     ? (isXDL(ST, *MI)
2408                            ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2409                                  NumPasses, ST.hasGFX950Insts())
2410                            : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2411                                  NumPasses, ST.hasGFX950Insts()))
2412                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2413                           NumPasses);
2414             break;
2415           }
2416 
2417           switch (NumPasses) {
2418           case 2:
2419             NeedWaitStates =
2420                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2421                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2422             break;
2423           case 8:
2424             NeedWaitStates =
2425                 isDGEMM(Opc)
2426                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2427                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2428             break;
2429           case 16:
2430             NeedWaitStates =
2431                 isDGEMM(Opc)
2432                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2433                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2434             break;
2435           default:
2436             llvm_unreachable("unexpected number of passes");
2437           }
2438         }
2439       }
2440     } else {
2441       switch (Opc1) {
2442       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2443       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2444       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2445       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2446         NeedWaitStates =
2447             ST.hasGFX950Insts()
2448                 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2449                 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2450         break;
2451       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2452       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2453         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2454         break;
2455       default:
2456         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2457 
2458         if (ST.hasGFX940Insts()) {
2459           NeedWaitStates =
2460               isXDL(ST, *MI1)
2461                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2462                         NumPasses)
2463                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2464                         NumPasses);
2465           break;
2466         }
2467 
2468         switch (NumPasses) {
2469         case 2:
2470           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2471           break;
2472         case 4:
2473           llvm_unreachable("unexpected number of passes for mfma");
2474         case 8:
2475           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2476           break;
2477         case 16:
2478         default:
2479           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2480         }
2481       }
2482     }
2483     if (WaitStatesNeeded >= NeedWaitStates)
2484       continue;
2485 
2486     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2487     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2488 
2489     if (WaitStatesNeeded == MaxWaitStates)
2490       break;
2491   }
2492 
2493   // Pad neighboring MFMA with noops for better inter-wave performance.
2494   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2495 
2496   return WaitStatesNeeded;
2497 }
2498 
2499 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2500   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2501   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2502     return 0;
2503 
2504   int WaitStatesNeeded = 0;
2505 
2506   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2507     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2508   };
2509 
2510   for (const MachineOperand &Op : MI->explicit_uses()) {
2511     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2512       continue;
2513 
2514     Register Reg = Op.getReg();
2515 
2516     const int AccVgprReadLdStWaitStates = 2;
2517     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2518     const int MaxWaitStates = 2;
2519 
2520     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2521       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2522     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2523 
2524     if (WaitStatesNeeded == MaxWaitStates)
2525       return WaitStatesNeeded; // Early exit.
2526 
2527     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2528       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2529           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2530         return false;
2531       auto IsVALUFn = [](const MachineInstr &MI) {
2532         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2533       };
2534       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2535              std::numeric_limits<int>::max();
2536     };
2537 
2538     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2539       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2540     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2541   }
2542 
2543   return WaitStatesNeeded;
2544 }
2545 
2546 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2547   assert(!ST.hasVcmpxPermlaneHazard() &&
2548          "this is a different vcmpx+permlane hazard");
2549   const SIRegisterInfo *TRI = ST.getRegisterInfo();
2550   const SIInstrInfo *TII = ST.getInstrInfo();
2551 
2552   auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2553     return isVCmpXWritesExec(*TII, *TRI, MI);
2554   };
2555 
2556   auto IsVALUFn = [](const MachineInstr &MI) {
2557     return SIInstrInfo::isVALU(MI);
2558   };
2559 
2560   const int VCmpXWritesExecWaitStates = 4;
2561   const int VALUWritesVDstWaitStates = 2;
2562   int WaitStatesNeeded = 0;
2563 
2564   for (const MachineOperand &Op : MI->explicit_uses()) {
2565     if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2566       continue;
2567     Register Reg = Op.getReg();
2568 
2569     int WaitStatesSinceDef =
2570         VALUWritesVDstWaitStates -
2571         getWaitStatesSinceDef(Reg, IsVALUFn,
2572                               /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2573     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2574     if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2575       break;
2576   }
2577 
2578   int VCmpXHazardWaits =
2579       VCmpXWritesExecWaitStates -
2580       getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2581 
2582   WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2583   return WaitStatesNeeded;
2584 }
2585 
2586 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2587   // 2 pass -> 4
2588   // 4 pass -> 6
2589   // 8 pass -> 10
2590   // 16 pass -> 18
2591   return NumPasses + 2;
2592 }
2593 
2594 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2595   // 2 pass -> 5
2596   // 4 pass -> 7
2597   // 8 pass -> 11
2598   // 16 pass -> 19
2599   return NumPasses + 3;
2600 }
2601 
2602 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2603   // 2 pass -> 5
2604   // 4 pass -> 7
2605   // 8 pass -> 11
2606   // 16 pass -> 19
2607   return NumPasses + 3;
2608 }
2609 
2610 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2611   // 2 pass -> 4
2612   // 4 pass -> 6
2613   // 8 pass -> 10
2614   // 16 pass -> 18
2615   return NumPasses + 2;
2616 }
2617 
2618 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2619   if (!ST.hasGFX90AInsts())
2620     return 0;
2621 
2622   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2623     return isDGEMM(MI.getOpcode());
2624   };
2625 
2626   // This is checked in checkMAIHazards90A()
2627   if (SIInstrInfo::isMFMA(*MI))
2628     return 0;
2629 
2630   const MachineRegisterInfo &MRI = MF.getRegInfo();
2631 
2632   int WaitStatesNeeded = 0;
2633 
2634   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2635                SIInstrInfo::isFLAT(*MI) ||
2636                SIInstrInfo::isDS(*MI);
2637   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2638   bool IsVALU = SIInstrInfo::isVALU(*MI);
2639 
2640   const MachineInstr *MFMA = nullptr;
2641   unsigned Reg;
2642   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2643     if (!SIInstrInfo::isMFMA(MI) ||
2644         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2645       return false;
2646     MFMA = &MI;
2647     return true;
2648   };
2649 
2650   const MachineInstr *DOT = nullptr;
2651   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2652     if (!SIInstrInfo::isDOT(MI) ||
2653         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2654       return false;
2655     DOT = &MI;
2656     return true;
2657   };
2658 
2659   bool DGEMMAfterVALUWrite = false;
2660   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2661     // Found DGEMM on reverse traversal to def.
2662     if (isDGEMM(MI.getOpcode()))
2663       DGEMMAfterVALUWrite = true;
2664 
2665     // Only a hazard if the register is defined by a VALU and a DGEMM is found
2666     // after the def.
2667     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2668       return false;
2669 
2670     return true;
2671   };
2672 
2673   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2674                                            AMDGPU::OpName::src2);
2675 
2676   if (IsMemOrExport || IsVALU) {
2677     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2678     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2679     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2680     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2681     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2682     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2683     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2684     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2685     const int DotWriteSameDotReadSrcAB = 3;
2686     const int DotWriteDifferentVALURead = 3;
2687     const int DMFMABetweenVALUWriteVMEMRead = 2;
2688     const int MaxWaitStates = 19;
2689 
2690     for (const MachineOperand &Use : MI->explicit_uses()) {
2691       if (!Use.isReg())
2692         continue;
2693       Reg = Use.getReg();
2694 
2695       DOT = nullptr;
2696       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2697                                                      MaxWaitStates);
2698       if (DOT) {
2699         int NeedWaitStates = 0;
2700         if (DOT->getOpcode() == MI->getOpcode()) {
2701           if (&Use - &MI->getOperand(0) != SrcCIdx)
2702             NeedWaitStates = DotWriteSameDotReadSrcAB;
2703         } else {
2704           NeedWaitStates = DotWriteDifferentVALURead;
2705         }
2706 
2707         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2708         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2709       }
2710 
2711       // Workaround for a HW data hazard bug observed only on GFX90A. When a
2712       // DGEMM instruction sits in-between a VALU and a VMEM instruction, it
2713       // causes the SQ to incorrectly omit the two wait states between the two
2714       // instructions that are needed to avoid the data hazard.
2715       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2716         DGEMMAfterVALUWrite = false;
2717         if (TRI.isVectorRegister(MRI, Reg)) {
2718           int WaitStatesNeededForUse =
2719                 DMFMABetweenVALUWriteVMEMRead -
2720                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2721                                       DMFMABetweenVALUWriteVMEMRead);
2722 
2723           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2724         }
2725       }
2726 
2727       MFMA = nullptr;
2728       WaitStatesSinceDef =
2729           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2730       if (!MFMA)
2731         continue;
2732 
2733       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2734       int NumPasses = HazardDefLatency;
2735       int NeedWaitStates = MaxWaitStates;
2736 
2737       if (isDGEMM(MFMA->getOpcode())) {
2738         switch (HazardDefLatency) {
2739         case 4:
2740           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2741                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2742           break;
2743         case 8:
2744         case 16:
2745           NeedWaitStates =
2746               IsMemOrExport
2747                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2748                   : (ST.hasGFX950Insts()
2749                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2750                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2751           break;
2752         default:
2753           llvm_unreachable("unexpected dgemm");
2754         }
2755       } else if (ST.hasGFX940Insts()) {
2756         NeedWaitStates =
2757             isXDL(ST, *MFMA)
2758                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2759                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2760                       NumPasses);
2761       } else {
2762         switch (HazardDefLatency) {
2763         case 2:
2764           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2765           break;
2766         case 8:
2767           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2768           break;
2769         case 16:
2770           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2771           break;
2772         default:
2773           llvm_unreachable("unexpected number of passes for mfma");
2774         }
2775       }
2776 
2777       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2778       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2779 
2780       if (WaitStatesNeeded == MaxWaitStates)
2781         break;
2782     }
2783   }
2784 
2785   unsigned Opc = MI->getOpcode();
2786   const int DMFMAToFMA64WaitStates = 2;
2787   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2788        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2789        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2790       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2791     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2792       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2793     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2794   }
2795 
2796   if (!IsVALU && !IsMemOrExport)
2797     return WaitStatesNeeded;
2798 
2799   for (const MachineOperand &Def : MI->defs()) {
2800     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2801     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2802     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2803     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2804     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2805     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2806     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2807     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2808     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2809     const int DotWriteDifferentVALUWrite = 3;
2810     const int MaxWaitStates = 19;
2811     const int MaxWarWaitStates = 15;
2812 
2813     Reg = Def.getReg();
2814 
2815     DOT = nullptr;
2816     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2817                                                    MaxWaitStates);
2818     if (DOT && DOT->getOpcode() != MI->getOpcode())
2819       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2820                                                     WaitStatesSinceDef);
2821 
2822     MFMA = nullptr;
2823     WaitStatesSinceDef =
2824         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2825     if (MFMA) {
2826       int NeedWaitStates = MaxWaitStates;
2827       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2828 
2829       if (isDGEMM(MFMA->getOpcode())) {
2830         switch (NumPasses) {
2831         case 4:
2832           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2833           break;
2834         case 8:
2835         case 16:
2836           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2837           break;
2838         default:
2839           llvm_unreachable("unexpected number of cycles for dgemm");
2840         }
2841       } else if (ST.hasGFX940Insts()) {
2842         NeedWaitStates =
2843             isXDL(ST, *MFMA)
2844                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2845                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2846       } else {
2847         switch (NumPasses) {
2848         case 2:
2849           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2850           break;
2851         case 8:
2852           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2853           break;
2854         case 16:
2855           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2856           break;
2857         default:
2858           llvm_unreachable("Unexpected number of passes for mfma");
2859         }
2860       }
2861 
2862       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2863       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2864 
2865       if (WaitStatesNeeded == MaxWaitStates)
2866         break;
2867     }
2868 
2869     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2870       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2871           !MI.readsRegister(Reg, &TRI))
2872         return false;
2873 
2874       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2875         return false;
2876 
2877       const MachineOperand *SrcC =
2878           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2879       assert(SrcC);
2880       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2881         return false;
2882 
2883       MFMA = &MI;
2884       return true;
2885     };
2886 
2887     MFMA = nullptr;
2888     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2889                                                 MaxWarWaitStates);
2890     if (!MFMA)
2891       continue;
2892 
2893     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2894     int NeedWaitStates = MaxWaitStates;
2895     switch (HazardDefLatency) {
2896     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2897              break;
2898     case 4:  assert(ST.hasGFX940Insts());
2899              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2900              break;
2901     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2902              break;
2903     case 16: [[fallthrough]];
2904     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2905              break;
2906     }
2907 
2908     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2909     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2910   }
2911 
2912   return WaitStatesNeeded;
2913 }
2914 
2915 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2916   if (!SU->isInstr())
2917     return false;
2918 
2919   const MachineInstr *MAI = nullptr;
2920 
2921   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2922     MAI = nullptr;
2923     if (SIInstrInfo::isMFMA(MI))
2924       MAI = &MI;
2925     return MAI != nullptr;
2926   };
2927 
2928   MachineInstr *MI = SU->getInstr();
2929   if (IsMFMAFn(*MI)) {
2930     int W = getWaitStatesSince(IsMFMAFn, 16);
2931     if (MAI)
2932       return W < (int)TSchedModel.computeInstrLatency(MAI);
2933   }
2934 
2935   return false;
2936 }
2937 
2938 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2939 // insertion of a new instruction.
2940 static void updateGetPCBundle(MachineInstr *NewMI) {
2941   if (!NewMI->isBundled())
2942     return;
2943 
2944   // Find start of bundle.
2945   auto I = NewMI->getIterator();
2946   while (I->isBundledWithPred())
2947     I--;
2948   if (I->isBundle())
2949     I++;
2950 
2951   // Bail if this is not an S_GETPC bundle.
2952   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2953     return;
2954 
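  // Worked example (illustrative): in a bundle such as
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, sym@rel32@lo+4
  //   s_addc_u32 s1, s1, sym@rel32@hi+12
  // a 4-byte s_waitcnt_depctr inserted after s_getpc_b64 shifts the following
  // code, so each global-operand offset must grow by 4 bytes.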
2955   // Update offsets of any references in the bundle.
2956   const unsigned NewBytes = 4;
2957   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2958          "Unexpected instruction insertion in bundle");
2959   auto NextMI = std::next(NewMI->getIterator());
2960   auto End = NewMI->getParent()->end();
2961   while (NextMI != End && NextMI->isBundledWithPred()) {
2962     for (auto &Operand : NextMI->operands()) {
2963       if (Operand.isGlobal())
2964         Operand.setOffset(Operand.getOffset() + NewBytes);
2965     }
2966     NextMI++;
2967   }
2968 }
2969 
2970 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2971   if (!ST.hasVALUMaskWriteHazard())
2972     return false;
2973   assert(!ST.hasExtendedWaitCounts());
2974 
2975   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2976     return false;
2977 
2978   // The hazard sequence is three instructions:
2979   //   1. VALU reads SGPR as mask
2980   //   2. SALU writes SGPR
2981   //   3. SALU reads SGPR
2982   // The hazard can expire if the distance between 2 and 3 is sufficient.
2983   // In practice this happens <10% of the time, hence this always assumes
2984   // In practice it expires less than 10% of the time, so to avoid searching
2985   // this conservatively assumes the hazard exists whenever 1 and 2 are present.
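       // A hypothetical wave64 sequence exhibiting the hazard:
       //   v_cndmask_b32 v0, v1, v2, s[2:3]  ; (1) VALU reads s[2:3] as mask
       //   s_mov_b64 s[2:3], exec            ; (2) SALU writes s[2:3]
       //   s_cmp_lg_u64 s[2:3], 0            ; (3) SALU reads s[2:3]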
2986   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2987   if (!SDSTOp || !SDSTOp->isReg())
2988     return false;
2989 
2990   const Register HazardReg = SDSTOp->getReg();
2991   if (HazardReg == AMDGPU::EXEC ||
2992       HazardReg == AMDGPU::EXEC_LO ||
2993       HazardReg == AMDGPU::EXEC_HI ||
2994       HazardReg == AMDGPU::M0)
2995     return false;
2996 
2997   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2998     switch (I.getOpcode()) {
2999     case AMDGPU::V_ADDC_U32_e32:
3000     case AMDGPU::V_ADDC_U32_dpp:
3001     case AMDGPU::V_CNDMASK_B16_e32:
3002     case AMDGPU::V_CNDMASK_B16_dpp:
3003     case AMDGPU::V_CNDMASK_B32_e32:
3004     case AMDGPU::V_CNDMASK_B32_dpp:
3005     case AMDGPU::V_DIV_FMAS_F32_e64:
3006     case AMDGPU::V_DIV_FMAS_F64_e64:
3007     case AMDGPU::V_SUBB_U32_e32:
3008     case AMDGPU::V_SUBB_U32_dpp:
3009     case AMDGPU::V_SUBBREV_U32_e32:
3010     case AMDGPU::V_SUBBREV_U32_dpp:
3011       // These implicitly read VCC as mask source.
3012       return HazardReg == AMDGPU::VCC ||
3013              HazardReg == AMDGPU::VCC_LO ||
3014              HazardReg == AMDGPU::VCC_HI;
3015     case AMDGPU::V_ADDC_U32_e64:
3016     case AMDGPU::V_ADDC_U32_e64_dpp:
3017     case AMDGPU::V_CNDMASK_B16_e64:
3018     case AMDGPU::V_CNDMASK_B16_e64_dpp:
3019     case AMDGPU::V_CNDMASK_B32_e64:
3020     case AMDGPU::V_CNDMASK_B32_e64_dpp:
3021     case AMDGPU::V_SUBB_U32_e64:
3022     case AMDGPU::V_SUBB_U32_e64_dpp:
3023     case AMDGPU::V_SUBBREV_U32_e64:
3024     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3025       // Only check mask register overlaps.
3026       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3027       assert(SSRCOp);
3028       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3029     }
3030     default:
3031       return false;
3032     }
3033   };
3034 
3035   const MachineRegisterInfo &MRI = MF.getRegInfo();
3036   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3037     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3038     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3039         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3040       return true;
3041 
3042     // A VALU access to any SGPR or literal constant other than HazardReg
3043     // mitigates the hazard. There is no need to check HazardReg here, as this
3044     // predicate is only invoked after IsHazardFn has returned false.
3045     if (!SIInstrInfo::isVALU(I))
3046       return false;
3047     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3048       const MachineOperand &Op = I.getOperand(OpNo);
3049       if (Op.isReg()) {
3050         Register OpReg = Op.getReg();
3051         // Only consider uses
3052         if (!Op.isUse())
3053           continue;
3054         // Ignore EXEC
3055         if (OpReg == AMDGPU::EXEC ||
3056             OpReg == AMDGPU::EXEC_LO ||
3057             OpReg == AMDGPU::EXEC_HI)
3058           continue;
3059         // Ignore all implicit uses except VCC
3060         if (Op.isImplicit()) {
3061           if (OpReg == AMDGPU::VCC ||
3062               OpReg == AMDGPU::VCC_LO ||
3063               OpReg == AMDGPU::VCC_HI)
3064             return true;
3065           continue;
3066         }
3067         if (TRI.isSGPRReg(MRI, OpReg))
3068           return true;
3069       } else {
3070         const MCInstrDesc &InstDesc = I.getDesc();
3071         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3072         if (!TII.isInlineConstant(Op, OpInfo))
3073           return true;
3074       }
3075     }
3076     return false;
3077   };
3078 
3079   // Check for hazard
3080   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3081       std::numeric_limits<int>::max())
3082     return false;
3083 
3084   auto NextMI = std::next(MI->getIterator());
3085 
3086   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3087   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3088                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3089                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3090 
3091   // SALU write may be s_getpc in a bundle.
3092   updateGetPCBundle(NewMI);
3093 
3094   return true;
3095 }
3096 
3097 // Return the numeric ID 0-63 of the 64-bit SGPR pair containing a given SGPR,
3098 // e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
3099 static std::optional<unsigned> sgprPairNumber(Register Reg,
3100                                               const SIRegisterInfo &TRI) {
3101   switch (Reg) {
3102   case AMDGPU::M0:
3103   case AMDGPU::EXEC:
3104   case AMDGPU::EXEC_LO:
3105   case AMDGPU::EXEC_HI:
3106   case AMDGPU::SGPR_NULL:
3107   case AMDGPU::SGPR_NULL64:
3108     return {};
3109   default:
3110     break;
3111   }
3112   unsigned RegN = TRI.getEncodingValue(Reg);
3113   if (RegN > 127)
3114     return {};
3115   return (RegN >> 1) & 0x3f;
3116 }
3117 
3118 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3119 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3120   assert(MMF == &MF);
3121 
3122   // A non-empty vector means the set has already been computed.
3123   if (!VALUReadHazardSGPRs.empty())
3124     return;
3125 
3126   auto CallingConv = MF.getFunction().getCallingConv();
3127   bool IsCallFree =
3128       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3129 
3130   // Exhaustive search is only viable in functions that neither make calls
3131   // nor are callees, where all VALUs are exposed to the hazard recognizer.
3132   UseVALUReadHazardExhaustiveSearch =
3133       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3134       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3135 
3136   // Consider all SGPRs as hazards if the shader uses function calls or is a callee.
3137   bool UseVALUUseCache =
3138       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3139   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3140   if (!UseVALUUseCache)
3141     return;
3142 
3143   // Scan blocks in post order, walking each block's instructions in reverse,
3144   // to find VALUs which read an SGPR before a SALU write to the same SGPR.
3145   // Compared to a linear scan, this reduces hazard insertion when all VALU
3146   // accesses to an SGPR occur after its last SALU write.
3147   const MachineRegisterInfo &MRI = MF.getRegInfo();
3148   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3149   MachineCycleInfo CI;
3150   CI.compute(*MMF);
3151 
3152   for (auto *MBB : post_order(&MF)) {
3153     bool InCycle = CI.getCycle(MBB) != nullptr;
3154     for (auto &MI : reverse(MBB->instrs())) {
3155       bool IsVALU = SIInstrInfo::isVALU(MI);
3156       bool IsSALU = SIInstrInfo::isSALU(MI);
3157       if (!IsVALU && !IsSALU)
3158         continue;
3159 
3160       for (const MachineOperand &Op : MI.operands()) {
3161         if (!Op.isReg())
3162           continue;
3163         Register Reg = Op.getReg();
3164         assert(!Op.getSubReg());
3165         // For implicit operands, only consider VCC.
3166         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3167                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3168           continue;
3169         if (!TRI.isSGPRReg(MRI, Reg))
3170           continue;
3171         auto RegN = sgprPairNumber(Reg, TRI);
3172         if (!RegN)
3173           continue;
3174         if (IsVALU && Op.isUse()) {
3175           // Note: any access within a cycle must be considered a hazard.
3176           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3177             VALUReadHazardSGPRs.set(*RegN);
3178           ReadSGPRs.set(*RegN);
3179         } else if (IsSALU) {
3180           if (Op.isDef())
3181             SALUWriteSGPRs.set(*RegN);
3182           else
3183             ReadSGPRs.set(*RegN);
3184         }
3185       }
3186     }
3187   }
3188 }
3189 
3190 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3191   if (!ST.hasVALUReadSGPRHazard())
3192     return false;
3193 
3194   // The hazard sequence is fundamentally three instructions:
3195   //   1. VALU reads SGPR
3196   //   2. SALU writes SGPR
3197   //   3. VALU/SALU reads SGPR
3198   // Try to avoid searching for (1) because the expiry point of the hazard is
3199   // indeterminate; however, the hazard between (2) and (3) can expire if the
3200   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3201   // Note: SGPRs must be tracked as 64-bit pairs, since the hazard exists
3202   // even when only one SGPR of a pair is accessed.
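       // For instance (illustrative): a VALU read of s4 followed by a SALU
       // write of s5 still forms a hazard, because s4 and s5 share s[4:5].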
3203 
3204   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3205   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3206   if (!(MIIsSALU || MIIsVALU))
3207     return false;
3208 
3209   // When compile time is the priority, avoid the expensive search by simply
3210   // mitigating every SALU which writes an SGPR.
3211   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3212     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3213       return false;
3214 
3215     const MachineOperand *SDSTOp =
3216         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3217     if (!SDSTOp || !SDSTOp->isReg())
3218       return false;
3219 
3220     const Register HazardReg = SDSTOp->getReg();
3221     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3222         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3223       return false;
3224 
3225     // Add s_wait_alu sa_sdst(0) after SALU write.
3226     auto NextMI = std::next(MI->getIterator());
3227     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3228                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3229                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3230 
3231     // SALU write may be s_getpc in a bundle.
3232     updateGetPCBundle(NewMI);
3233 
3234     return true;
3235   }
3236 
3237   // Pre-compute set of SGPR pairs read by VALUs.
3238   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3239   computeVALUHazardSGPRs(MI->getMF());
3240 
3241   // If no VALU hazard SGPRs exist then there is nothing to do.
3242   if (VALUReadHazardSGPRs.none())
3243     return false;
3244 
3245   // All SGPR writes before a call/return must be flushed, as the callee/caller
3246   // will not see the hazard chain, i.e. (2) to (3) described above.
3247   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3248                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3249                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3250 
3251   // Collect all SGPR sources for MI which are read by a VALU.
3252   const MachineRegisterInfo &MRI = MF.getRegInfo();
3253   SmallSet<Register, 4> SGPRsUsed;
3254 
3255   if (!IsSetPC) {
3256     for (const MachineOperand &Op : MI->all_uses()) {
3257       Register OpReg = Op.getReg();
3258 
3259       // Only consider VCC implicit uses on VALUs. The only expected SALU
3260       // implicit access is SCC, which is not a hazard.
3261       if (MIIsSALU && Op.isImplicit())
3262         continue;
3263 
3264       if (!TRI.isSGPRReg(MRI, OpReg))
3265         continue;
3266 
3267       auto RegN = sgprPairNumber(OpReg, TRI);
3268       if (!RegN)
3269         continue;
3270 
3271       if (!VALUReadHazardSGPRs[*RegN])
3272         continue;
3273 
3274       SGPRsUsed.insert(OpReg);
3275     }
3276 
3277     // No SGPRs -> nothing to do.
3278     if (SGPRsUsed.empty())
3279       return false;
3280   }
3281 
3282   // A hazard is any SALU which writes one of the SGPRs read by MI.
3283   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3284     if (!SIInstrInfo::isSALU(I))
3285       return false;
3286     // Ensure SGPR flush before call/return by conservatively assuming every
3287     // SALU writes an SGPR.
3288     if (IsSetPC && I.getNumDefs() > 0)
3289       return true;
3290     // Check for any register writes.
3291     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3292       return I.modifiesRegister(Reg, &TRI);
3293     });
3294   };
3295 
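       // The hazard between (2) and (3) expires after enough unrelated SALU
       // instructions: 10 when MI is a SALU reader, 11 when it is a VALU.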
3296   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3297   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3298     if (Count >= SALUExpiryCount)
3299       return true;
3300     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3301     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3302         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3303       return true;
3304     return false;
3305   };
3306 
3307   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3308     // Only count true SALUs as wait states.
3309     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3310       return 0;
3311     // SALU must be unrelated to any hazard registers.
3312     if (any_of(SGPRsUsed,
3313                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3314       return 0;
3315     return 1;
3316   };
3317 
3318   // Check for the hazard.
3319   DenseSet<const MachineBasicBlock *> Visited;
3320   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3321                                         std::next(MI->getReverseIterator()), 0,
3322                                         IsExpiredFn, Visited, WaitStatesFn);
3323 
3324   if (WaitStates >= SALUExpiryCount)
3325     return false;
3326 
3327   // Validate hazard through an exhaustive search.
3328   if (UseVALUReadHazardExhaustiveSearch) {
3329     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3330     // This is searching for (1) in the hazard description.
3331     auto hazardPair = [this](Register Reg) {
3332       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3333         return Register(AMDGPU::VCC);
3334       auto RegN = sgprPairNumber(Reg, TRI);
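           // 64-bit SGPR pair registers are enumerated consecutively, so the
           // pair can be indexed directly from SGPR0_SGPR1.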
3335       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3336     };
3337     auto SearchHazardFn = [this, hazardPair,
3338                            &SGPRsUsed](const MachineInstr &I) {
3339       if (!SIInstrInfo::isVALU(I))
3340         return false;
3341       // Check for any register reads.
3342       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3343         return I.readsRegister(hazardPair(Reg), &TRI);
3344       });
3345     };
3346     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3347       return false;
3348     };
3349     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3350         std::numeric_limits<int>::max())
3351       return false;
3352   }
3353 
3354   // Add s_wait_alu sa_sdst(0) before SALU read.
3355   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3356                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3357                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3358 
3359   // SALU read may be after s_getpc in a bundle.
3360   updateGetPCBundle(NewMI);
3361 
3362   return true;
3363 }
3364 
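     // Ensure the entry block begins with an S_SETPRIO of at least Priority.
     // Returns true if a new S_SETPRIO instruction was inserted.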
3365 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3366                                const SIInstrInfo &TII) {
3367   MachineBasicBlock &EntryMBB = MF->front();
3368   if (EntryMBB.begin() != EntryMBB.end()) {
3369     auto &EntryMI = *EntryMBB.begin();
3370     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3371         EntryMI.getOperand(0).getImm() >= Priority)
3372       return false;
3373   }
3374 
3375   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3376       .addImm(Priority);
3377   return true;
3378 }
3379 
3380 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3381   if (!ST.hasRequiredExportPriority())
3382     return false;
3383 
3384   // Assume the following shader types will never have exports,
3385   // and avoid adding or adjusting S_SETPRIO.
3386   MachineBasicBlock *MBB = MI->getParent();
3387   MachineFunction *MF = MBB->getParent();
3388   auto CC = MF->getFunction().getCallingConv();
3389   switch (CC) {
3390   case CallingConv::AMDGPU_CS:
3391   case CallingConv::AMDGPU_CS_Chain:
3392   case CallingConv::AMDGPU_CS_ChainPreserve:
3393   case CallingConv::AMDGPU_KERNEL:
3394     return false;
3395   default:
3396     break;
3397   }
3398 
3399   const int MaxPriority = 3;
3400   const int NormalPriority = 2;
3401   const int PostExportPriority = 0;
3402 
3403   auto It = MI->getIterator();
3404   switch (MI->getOpcode()) {
3405   case AMDGPU::S_ENDPGM:
3406   case AMDGPU::S_ENDPGM_SAVED:
3407   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3408   case AMDGPU::SI_RETURN_TO_EPILOG:
3409     // Ensure a shader with calls raises priority at entry, so that the
3410     // priority is correct if exports exist in a callee.
3411     if (MF->getFrameInfo().hasCalls())
3412       return ensureEntrySetPrio(MF, NormalPriority, TII);
3413     return false;
3414   case AMDGPU::S_SETPRIO: {
3415     // Raise priority to the required minimum unless part of the workaround.
3416     auto &PrioOp = MI->getOperand(0);
3417     int Prio = PrioOp.getImm();
3418     bool InWA = (Prio == PostExportPriority) &&
3419                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3420     if (InWA || Prio >= NormalPriority)
3421       return false;
3422     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3423     return true;
3424   }
3425   default:
3426     if (!TII.isEXP(*MI))
3427       return false;
3428     break;
3429   }
3430 
3431   // Check entry priority at each export (as there will only be a few).
3432   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3433   bool Changed = false;
3434   if (CC != CallingConv::AMDGPU_Gfx)
3435     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3436 
3437   auto NextMI = std::next(It);
3438   bool EndOfShader = false;
3439   if (NextMI != MBB->end()) {
3440     // The workaround is only needed at the end of a sequence of exports.
3441     if (TII.isEXP(*NextMI))
3442       return Changed;
3443     // An appropriate S_SETPRIO after the export implies the workaround was applied.
3444     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3445         NextMI->getOperand(0).getImm() == PostExportPriority)
3446       return Changed;
3447     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3448   }
3449 
3450   const DebugLoc &DL = MI->getDebugLoc();
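       // Emit the workaround: lower priority, wait for exports to complete,
       // pad with two s_nops, then restore normal priority. The wait and the
       // restore are skipped at the very end of the shader.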
3451 
3452   // Lower priority.
3453   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3454       .addImm(PostExportPriority);
3455 
3456   if (!EndOfShader) {
3457     // Wait for exports to complete.
3458     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3459         .addReg(AMDGPU::SGPR_NULL)
3460         .addImm(0);
3461   }
3462 
3463   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3464   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3465 
3466   if (!EndOfShader) {
3467     // Return to normal (higher) priority.
3468     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3469         .addImm(NormalPriority);
3470   }
3471 
3472   return true;
3473 }
3474