xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision c3fe5ad6be9eb58d5043de9a5940ef3c397631b2)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/ScheduleDAG.h"
21 #include "llvm/TargetParser/TargetParser.h"
22 
23 using namespace llvm;
24 
25 namespace {
26 
27 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29 
30   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31     if (Arg.getAsInteger(0, Value))
32       return O.error("'" + Arg + "' value invalid for uint argument!");
33 
34     if (Value > 100)
35       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36 
37     return false;
38   }
39 };
40 
41 } // end anonymous namespace
42 
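// An illustrative reading of this option (not a precise specification): a
// value of 50 asks for roughly half of a producer MFMA's latency to be covered
// by s_nops before a dependent neighboring MFMA issues.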
43 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45                      cl::desc("Fill a percentage of the latency between "
46                               "neighboring MFMA with s_nops."));
47 
48 static cl::opt<unsigned> MaxExhaustiveHazardSearch(
49     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50     cl::desc("Maximum function size for exhaustive hazard search"));
51 
52 //===----------------------------------------------------------------------===//
53 // Hazard Recognizer Implementation
54 //===----------------------------------------------------------------------===//
55 
56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57                                                  const GCNSubtarget &ST);
58 
59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60     : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61       ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62       TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63       UseVALUReadHazardExhaustiveSearch(false),
64       ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
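  // If AGPRs are used, MFMA/MAI hazards may be present, which need a much
  // deeper lookahead window than the ordinary hazards (19 vs. 5 wait states).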
65   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67 }
68 
69 void GCNHazardRecognizer::Reset() {
70   EmittedInstrs.clear();
71 }
72 
73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74   EmitInstruction(SU->getInstr());
75 }
76 
77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78   CurrCycleInstr = MI;
79 }
80 
81 static bool isDivFMas(unsigned Opcode) {
82   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83 }
84 
85 static bool isSGetReg(unsigned Opcode) {
86   return Opcode == AMDGPU::S_GETREG_B32;
87 }
88 
89 static bool isSSetReg(unsigned Opcode) {
90   switch (Opcode) {
91   case AMDGPU::S_SETREG_B32:
92   case AMDGPU::S_SETREG_B32_mode:
93   case AMDGPU::S_SETREG_IMM32_B32:
94   case AMDGPU::S_SETREG_IMM32_B32_mode:
95     return true;
96   }
97   return false;
98 }
99 
100 static bool isRWLane(unsigned Opcode) {
101   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102 }
103 
104 static bool isRFE(unsigned Opcode) {
105   return Opcode == AMDGPU::S_RFE_B64;
106 }
107 
108 static bool isSMovRel(unsigned Opcode) {
109   switch (Opcode) {
110   case AMDGPU::S_MOVRELS_B32:
111   case AMDGPU::S_MOVRELS_B64:
112   case AMDGPU::S_MOVRELD_B32:
113   case AMDGPU::S_MOVRELD_B64:
114     return true;
115   default:
116     return false;
117   }
118 }
119 
120 static bool isDGEMM(unsigned Opcode) {
121   return AMDGPU::getMAIIsDGEMM(Opcode);
122 }
123 
124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125   unsigned Opcode = MI.getOpcode();
126 
127   if (!SIInstrInfo::isMAI(MI) ||
128       isDGEMM(Opcode) ||
129       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131     return false;
132 
133   if (!ST.hasGFX940Insts())
134     return true;
135 
136   return AMDGPU::getMAIIsGFX940XDL(Opcode);
137 }
138 
139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140                                     const MachineInstr &MI) {
141   if (TII.isAlwaysGDS(MI.getOpcode()))
142     return true;
143 
144   switch (MI.getOpcode()) {
145   case AMDGPU::S_SENDMSG:
146   case AMDGPU::S_SENDMSGHALT:
147   case AMDGPU::S_TTRACEDATA:
148     return true;
149   // These DS opcodes don't support GDS.
150   case AMDGPU::DS_NOP:
151   case AMDGPU::DS_PERMUTE_B32:
152   case AMDGPU::DS_BPERMUTE_B32:
153     return false;
154   default:
155     if (TII.isDS(MI.getOpcode())) {
156       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157                                            AMDGPU::OpName::gds);
158       if (MI.getOperand(GDS).getImm())
159         return true;
160     }
161     return false;
162   }
163 }
164 
165 static bool isPermlane(const MachineInstr &MI) {
166   unsigned Opcode = MI.getOpcode();
167   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168          Opcode == AMDGPU::V_PERMLANE64_B32 ||
169          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
172          Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
173          Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
174          Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
175          Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
176 }
177 
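// LDS DMA (VMEM-to-LDS) operations are MUBUF/FLAT instructions that also carry
// the VALU flag; that combination is what this predicate matches.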
178 static bool isLdsDma(const MachineInstr &MI) {
179   return SIInstrInfo::isVALU(MI) &&
180          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
181 }
182 
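// Decode the simm16 operand of an s_getreg/s_setreg and return the hardware
// register id (the offset and width fields are ignored).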
183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185                                                      AMDGPU::OpName::simm16);
186   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187 }
188 
189 ScheduleHazardRecognizer::HazardType
190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
191   MachineInstr *MI = SU->getInstr();
192   // If we are not in "HazardRecognizerMode", we are being run from the
193   // scheduler; track possible stalls from hazards but don't insert noops.
194   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195 
196   if (MI->isBundle())
197     return NoHazard;
198 
199   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
200     return HazardType;
201 
202   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
203     return HazardType;
204 
205   if (checkFPAtomicToDenormModeHazard(MI) > 0)
206     return HazardType;
207 
208   if (ST.hasNoDataDepHazard())
209     return NoHazard;
210 
211   // FIXME: Should flat be considered vmem?
212   if ((SIInstrInfo::isVMEM(*MI) ||
213        SIInstrInfo::isFLAT(*MI))
214       && checkVMEMHazards(MI) > 0)
215     return HazardType;
216 
217   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
218     return HazardType;
219 
220   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
221     return HazardType;
222 
223   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
224     return HazardType;
225 
226   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
227     return HazardType;
228 
229   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
230        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
231        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
232     return HazardType;
233 
234   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
235     return HazardType;
236 
237   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
238     return HazardType;
239 
240   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
241     return HazardType;
242 
243   if (((ST.hasReadM0MovRelInterpHazard() &&
244         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
245          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
246          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
247        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
248        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
249        (ST.hasReadM0LdsDirectHazard() &&
250         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
251       checkReadM0Hazards(MI) > 0)
252     return HazardType;
253 
254   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
255     return HazardType;
256 
257   if ((SIInstrInfo::isVMEM(*MI) ||
258        SIInstrInfo::isFLAT(*MI) ||
259        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
260     return HazardType;
261 
262   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
263     return HazardType;
264 
265   return NoHazard;
266 }
267 
268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
269                                 unsigned Quantity) {
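  // s_nop's immediate encodes (count - 1), i.e. s_nop N inserts N+1 wait
  // states; this helper conservatively emits at most 8 wait states per s_nop.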
270   while (Quantity > 0) {
271     unsigned Arg = std::min(Quantity, 8u);
272     Quantity -= Arg;
273     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
274         .addImm(Arg - 1);
275   }
276 }
277 
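// Resolve the scheduling class of \p MI and return the ReleaseAtCycle of its
// first WriteProcRes entry, used here as the MFMA pipeline occupancy.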
278 unsigned
279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
280   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
281   assert(TSchedModel.getWriteProcResBegin(SC) !=
282          TSchedModel.getWriteProcResEnd(SC));
283   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
284 }
285 
286 void GCNHazardRecognizer::processBundle() {
287   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
288   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
289   // Check bundled MachineInstr's for hazards.
290   for (; MI != E && MI->isInsideBundle(); ++MI) {
291     CurrCycleInstr = &*MI;
292     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
293 
294     if (IsHazardRecognizerMode) {
295       fixHazards(CurrCycleInstr);
296 
297       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
298     }
299 
300     // It's unnecessary to track more than MaxLookAhead instructions. Since we
301     // include the bundled MI directly after, only add a maximum of
302     // (MaxLookAhead - 1) noops to EmittedInstrs.
303     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
304       EmittedInstrs.push_front(nullptr);
305 
306     EmittedInstrs.push_front(CurrCycleInstr);
307     EmittedInstrs.resize(MaxLookAhead);
308   }
309   CurrCycleInstr = nullptr;
310 }
311 
312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
313   assert(IsHazardRecognizerMode);
314 
315   unsigned NumPreNoops = PreEmitNoops(MI);
316   EmitNoops(NumPreNoops);
317   if (MI->isInsideBundle())
318     insertNoopsInBundle(MI, TII, NumPreNoops);
319   else
320     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
321                     NumPreNoops);
322   EmitInstruction(MI);
323   AdvanceCycle();
324 }
325 
326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
327   IsHazardRecognizerMode = true;
328   CurrCycleInstr = MI;
329   unsigned W = PreEmitNoopsCommon(MI);
330   fixHazards(MI);
331   CurrCycleInstr = nullptr;
332   return W;
333 }
334 
335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
336   if (MI->isBundle())
337     return 0;
338 
339   int WaitStates = 0;
340 
341   if (SIInstrInfo::isSMRD(*MI))
342     return std::max(WaitStates, checkSMRDHazards(MI));
343 
344   if (ST.hasNSAtoVMEMBug())
345     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
346 
347   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
348 
349   if (ST.hasNoDataDepHazard())
350     return WaitStates;
351 
352   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
353     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
354 
355   if (SIInstrInfo::isVALU(*MI))
356     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
357 
358   if (SIInstrInfo::isDPP(*MI))
359     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
360 
361   if (isDivFMas(MI->getOpcode()))
362     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
363 
364   if (isRWLane(MI->getOpcode()))
365     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
366 
367   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
368        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
369        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
370     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
371 
372   if (MI->isInlineAsm())
373     return std::max(WaitStates, checkInlineAsmHazards(MI));
374 
375   if (isSGetReg(MI->getOpcode()))
376     return std::max(WaitStates, checkGetRegHazards(MI));
377 
378   if (isSSetReg(MI->getOpcode()))
379     return std::max(WaitStates, checkSetRegHazards(MI));
380 
381   if (isRFE(MI->getOpcode()))
382     return std::max(WaitStates, checkRFEHazards(MI));
383 
384   if ((ST.hasReadM0MovRelInterpHazard() &&
385        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
386         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
387         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
388       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
389       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
390       (ST.hasReadM0LdsDirectHazard() &&
391        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
392     return std::max(WaitStates, checkReadM0Hazards(MI));
393 
394   if (SIInstrInfo::isMAI(*MI))
395     return std::max(WaitStates, checkMAIHazards(MI));
396 
397   if (SIInstrInfo::isVMEM(*MI) ||
398       SIInstrInfo::isFLAT(*MI) ||
399       SIInstrInfo::isDS(*MI))
400     return std::max(WaitStates, checkMAILdStHazards(MI));
401 
402   if (ST.hasGFX950Insts() && isPermlane(*MI))
403     return std::max(WaitStates, checkPermlaneHazards(MI));
404 
405   return WaitStates;
406 }
407 
408 void GCNHazardRecognizer::EmitNoop() {
409   EmittedInstrs.push_front(nullptr);
410 }
411 
412 void GCNHazardRecognizer::AdvanceCycle() {
413   // When the scheduler detects a stall, it will call AdvanceCycle() without
414   // emitting any instructions.
415   if (!CurrCycleInstr) {
416     EmittedInstrs.push_front(nullptr);
417     return;
418   }
419 
420   if (CurrCycleInstr->isBundle()) {
421     processBundle();
422     return;
423   }
424 
425   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426   if (!NumWaitStates) {
427     CurrCycleInstr = nullptr;
428     return;
429   }
430 
431   // Keep track of emitted instructions
432   EmittedInstrs.push_front(CurrCycleInstr);
433 
434   // Add a nullptr for each additional wait state after the first.  Make sure
435   // not to add more than getMaxLookAhead() items to the list, since we
436   // truncate the list to that size right after this loop.
437   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
438        i < e; ++i) {
439     EmittedInstrs.push_front(nullptr);
440   }
441 
442   // getMaxLookAhead() is the largest number of wait states we will ever need
443   // to insert, so there is no point in keeping track of more than that many
444   // wait states.
445   EmittedInstrs.resize(getMaxLookAhead());
446 
447   CurrCycleInstr = nullptr;
448 }
449 
450 void GCNHazardRecognizer::RecedeCycle() {
451   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
452 }
453 
454 //===----------------------------------------------------------------------===//
455 // Helper Functions
456 //===----------------------------------------------------------------------===//
457 
458 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
459 
460 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
461 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
462 
463 // Search for a hazard in a block and its predecessors.
464 template <typename StateT>
465 static bool
466 hasHazard(StateT State,
467           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
468           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
469           const MachineBasicBlock *MBB,
470           MachineBasicBlock::const_reverse_instr_iterator I,
471           DenseSet<const MachineBasicBlock *> &Visited) {
472   for (auto E = MBB->instr_rend(); I != E; ++I) {
473     // No need to look at parent BUNDLE instructions.
474     if (I->isBundle())
475       continue;
476 
477     switch (IsHazard(State, *I)) {
478     case HazardFound:
479       return true;
480     case HazardExpired:
481       return false;
482     default:
483       // Continue search
484       break;
485     }
486 
487     if (I->isInlineAsm() || I->isMetaInstruction())
488       continue;
489 
490     UpdateState(State, *I);
491   }
492 
493   for (MachineBasicBlock *Pred : MBB->predecessors()) {
494     if (!Visited.insert(Pred).second)
495       continue;
496 
497     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
498                   Visited))
499       return true;
500   }
501 
502   return false;
503 }
504 
505 // Returns the minimum number of wait states since \p I, walking all
506 // predecessors. Only scans until \p IsExpired returns true.
507 // Can only be run in hazard recognizer mode.
508 static int getWaitStatesSince(
509     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
510     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
511     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
512     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
513   for (auto E = MBB->instr_rend(); I != E; ++I) {
514     // Don't add WaitStates for parent BUNDLE instructions.
515     if (I->isBundle())
516       continue;
517 
518     if (IsHazard(*I))
519       return WaitStates;
520 
521     if (I->isInlineAsm())
522       continue;
523 
524     WaitStates += GetNumWaitStates(*I);
525 
526     if (IsExpired(*I, WaitStates))
527       return std::numeric_limits<int>::max();
528   }
529 
530   int MinWaitStates = std::numeric_limits<int>::max();
531   for (MachineBasicBlock *Pred : MBB->predecessors()) {
532     if (!Visited.insert(Pred).second)
533       continue;
534 
535     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
536                                IsExpired, Visited, GetNumWaitStates);
537 
538     MinWaitStates = std::min(MinWaitStates, W);
539   }
540 
541   return MinWaitStates;
542 }
543 
544 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
545                               const MachineInstr *MI, IsExpiredFn IsExpired) {
546   DenseSet<const MachineBasicBlock *> Visited;
547   return getWaitStatesSince(IsHazard, MI->getParent(),
548                             std::next(MI->getReverseIterator()),
549                             0, IsExpired, Visited);
550 }
551 
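// In hazard recognizer mode, walk the CFG backwards from the current
// instruction; otherwise scan the EmittedInstrs history that AdvanceCycle()
// maintains for the scheduler.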
552 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
553   if (IsHazardRecognizerMode) {
554     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
555       return WaitStates >= Limit;
556     };
557     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
558   }
559 
560   int WaitStates = 0;
561   for (MachineInstr *MI : EmittedInstrs) {
562     if (MI) {
563       if (IsHazard(*MI))
564         return WaitStates;
565 
566       if (MI->isInlineAsm())
567         continue;
568     }
569     ++WaitStates;
570 
571     if (WaitStates >= Limit)
572       break;
573   }
574   return std::numeric_limits<int>::max();
575 }
576 
577 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
578                                                IsHazardFn IsHazardDef,
579                                                int Limit) {
580   const SIRegisterInfo *TRI = ST.getRegisterInfo();
581 
582   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
583     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
584   };
585 
586   return getWaitStatesSince(IsHazardFn, Limit);
587 }
588 
589 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
590                                                   int Limit) {
591   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
592     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
593   };
594 
595   return getWaitStatesSince(IsHazardFn, Limit);
596 }
597 
598 //===----------------------------------------------------------------------===//
599 // No-op Hazard Detection
600 //===----------------------------------------------------------------------===//
601 
602 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
603                         MCRegister Reg) {
604   for (MCRegUnit Unit : TRI.regunits(Reg))
605     BV.set(Unit);
606 }
607 
608 static void addRegsToSet(const SIRegisterInfo &TRI,
609                          iterator_range<MachineInstr::const_mop_iterator> Ops,
610                          BitVector &DefSet, BitVector &UseSet) {
611   for (const MachineOperand &Op : Ops) {
612     if (Op.isReg())
613       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
614   }
615 }
616 
617 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
618   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
619 }
620 
621 static bool breaksSMEMSoftClause(MachineInstr *MI) {
622   return !SIInstrInfo::isSMRD(*MI);
623 }
624 
625 static bool breaksVMEMSoftClause(MachineInstr *MI) {
626   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
627 }
628 
629 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
630   // SMEM soft clauses are only present on VI+, and only matter if xnack is
631   // enabled.
632   if (!ST.isXNACKEnabled())
633     return 0;
634 
635   bool IsSMRD = TII.isSMRD(*MEM);
636 
637   resetClause();
638 
639   // A soft-clause is any group of consecutive SMEM instructions.  The
640   // instructions in this group may return out of order and/or may be
641   // replayed (i.e. the same instruction issued more than once).
642   //
643   // In order to handle these situations correctly we need to make sure that
644   // when a clause has more than one instruction, no instruction in the clause
645   // writes to a register that is read by another instruction in the clause
646   // (including itself). If we encounter this situation, we need to break the
647   // clause by inserting a non-SMEM instruction.
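  // For example (illustrative), with xnack the pair
  //   s_load_dword s4, s[0:1], 0x0
  //   s_load_dword s5, s[4:5], 0x8
  // must not form a clause: a replay of the second load could observe s4 while
  // the first load is still updating it.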
648 
649   for (MachineInstr *MI : EmittedInstrs) {
650     // When we hit a non-SMEM instruction then we have passed the start of the
651     // clause and we can stop.
652     if (!MI)
653       break;
654 
655     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
656       break;
657 
658     addClauseInst(*MI);
659   }
660 
661   if (ClauseDefs.none())
662     return 0;
663 
664   // We need to make sure not to put loads and stores in the same clause if they
665   // use the same address. For now, just start a new clause whenever we see a
666   // store.
667   if (MEM->mayStore())
668     return 1;
669 
670   addClauseInst(*MEM);
671 
672   // If the set of defs and uses intersect then we cannot add this instruction
673   // to the clause, so we have a hazard.
674   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
675 }
676 
677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
678   int WaitStatesNeeded = 0;
679 
680   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
681 
682   // This SMRD hazard only affects SI.
683   if (!ST.hasSMRDReadVALUDefHazard())
684     return WaitStatesNeeded;
685 
686   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
687   // SGPR was written by a VALU instruction.
688   int SmrdSgprWaitStates = 4;
689   auto IsHazardDefFn = [this](const MachineInstr &MI) {
690     return TII.isVALU(MI);
691   };
692   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
693     return TII.isSALU(MI);
694   };
695 
696   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
697 
698   for (const MachineOperand &Use : SMRD->uses()) {
699     if (!Use.isReg())
700       continue;
701     int WaitStatesNeededForUse =
702         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
703                                                    SmrdSgprWaitStates);
704     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
705 
706     // This fixes what appears to be undocumented hardware behavior in SI where
707     // an s_mov writing a descriptor followed by an s_buffer_load_dword reading
708     // that descriptor needs some number of nops in between. We don't know how
709     // many we need, but let's use 4. This wasn't discovered before, probably
710     // because the only case where this happens is when we expand a 64-bit
711     // pointer into a full descriptor and use s_buffer_load_dword instead of
712     // s_load_dword, which was probably never encountered in closed-source land.
713     if (IsBufferSMRD) {
714       int WaitStatesNeededForUse =
715         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
716                                                    IsBufferHazardDefFn,
717                                                    SmrdSgprWaitStates);
718       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
719     }
720   }
721 
722   return WaitStatesNeeded;
723 }
724 
725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
726   if (!ST.hasVMEMReadSGPRVALUDefHazard())
727     return 0;
728 
729   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
730 
731   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
732   // SGPR was written by a VALU Instruction.
733   const int VmemSgprWaitStates = 5;
734   auto IsHazardDefFn = [this](const MachineInstr &MI) {
735     return TII.isVALU(MI);
736   };
737   for (const MachineOperand &Use : VMEM->uses()) {
738     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
739       continue;
740 
741     int WaitStatesNeededForUse =
742         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
743                                                    VmemSgprWaitStates);
744     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
745   }
746   return WaitStatesNeeded;
747 }
748 
749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
750   const SIRegisterInfo *TRI = ST.getRegisterInfo();
751   const SIInstrInfo *TII = ST.getInstrInfo();
752 
753   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
754   int DppVgprWaitStates = 2;
755   int DppExecWaitStates = 5;
756   int WaitStatesNeeded = 0;
757   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
758     return TII->isVALU(MI);
759   };
760 
761   for (const MachineOperand &Use : DPP->uses()) {
762     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
763       continue;
764     int WaitStatesNeededForUse =
765         DppVgprWaitStates - getWaitStatesSinceDef(
766                                 Use.getReg(),
767                                 [](const MachineInstr &) { return true; },
768                                 DppVgprWaitStates);
769     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
770   }
771 
772   WaitStatesNeeded = std::max(
773       WaitStatesNeeded,
774       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
775                                                 DppExecWaitStates));
776 
777   return WaitStatesNeeded;
778 }
779 
780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
781   const SIInstrInfo *TII = ST.getInstrInfo();
782 
783   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
784   // instruction.
785   const int DivFMasWaitStates = 4;
786   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
787     return TII->isVALU(MI);
788   };
789   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
790                                                DivFMasWaitStates);
791 
792   return DivFMasWaitStates - WaitStatesNeeded;
793 }
794 
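// An s_getreg of a hardware register needs wait states after an s_setreg that
// wrote the same register.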
795 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
796   const SIInstrInfo *TII = ST.getInstrInfo();
797   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
798 
799   const int GetRegWaitStates = 2;
800   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
801     return GetRegHWReg == getHWReg(TII, MI);
802   };
803   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
804 
805   return GetRegWaitStates - WaitStatesNeeded;
806 }
807 
808 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
809   const SIInstrInfo *TII = ST.getInstrInfo();
810   unsigned HWReg = getHWReg(TII, *SetRegInstr);
811 
812   const int SetRegWaitStates = ST.getSetRegWaitStates();
813   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
814     return HWReg == getHWReg(TII, MI);
815   };
816   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
817   return SetRegWaitStates - WaitStatesNeeded;
818 }
819 
820 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
821   if (!MI.mayStore())
822     return -1;
823 
824   const SIInstrInfo *TII = ST.getInstrInfo();
825   unsigned Opcode = MI.getOpcode();
826   const MCInstrDesc &Desc = MI.getDesc();
827 
828   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
829   int VDataRCID = -1;
830   if (VDataIdx != -1)
831     VDataRCID = Desc.operands()[VDataIdx].RegClass;
832 
833   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
834     // There is no hazard if the instruction does not use vector regs
835     // (like wbinvl1)
836     if (VDataIdx == -1)
837       return -1;
838     // For MUBUF/MTBUF instructions this hazard only exists if the
839     // instruction is not using a register in the soffset field.
840     const MachineOperand *SOffset =
841         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
842     // If we have no soffset operand, then assume this field has been
843     // hardcoded to zero.
844     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
845         (!SOffset || !SOffset->isReg()))
846       return VDataIdx;
847   }
848 
849   // MIMG instructions create a hazard if they don't use a 256-bit T# and
850   // the store size is greater than 8 bytes and they have more than two bits
851   // of their dmask set.
852   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
853   if (TII->isMIMG(MI)) {
854     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
855     assert(SRsrcIdx != -1 &&
856            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
857     (void)SRsrcIdx;
858   }
859 
860   if (TII->isFLAT(MI)) {
861     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
862     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
863       return DataIdx;
864   }
865 
866   return -1;
867 }
868 
869 int
870 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
871                                             const MachineRegisterInfo &MRI) {
872   // Helper to check for the hazard where VMEM instructions that store more
873   // than 8 bytes can have their store data overwritten by the next instruction.
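  // For example (illustrative), a buffer_store_dwordx4 followed too closely by
  // a VALU that rewrites one of the four data VGPRs can end up storing the new
  // value instead of the intended one.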
874   const SIRegisterInfo *TRI = ST.getRegisterInfo();
875 
876   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
877   int WaitStatesNeeded = 0;
878 
879   if (!TRI->isVectorRegister(MRI, Def.getReg()))
880     return WaitStatesNeeded;
881   Register Reg = Def.getReg();
882   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
883     int DataIdx = createsVALUHazard(MI);
884     return DataIdx >= 0 &&
885            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
886   };
887 
888   int WaitStatesNeededForDef =
889     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
890   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
891 
892   return WaitStatesNeeded;
893 }
894 
895 /// A dest sel forwarding issue occurs if additional logic is needed to swizzle
896 /// or pack the computed value into the correct bit position of the dest
897 /// register. This occurs if we have SDWA with dst_sel != DWORD, or op_sel with
898 /// a dst_sel that is not aligned to the register. This function analyzes \p MI
899 /// and \returns the operand with a dst forwarding issue, or nullptr if none
900 /// exists.
901 static const MachineOperand *
902 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
903   if (!SIInstrInfo::isVALU(MI))
904     return nullptr;
905 
906   const SIInstrInfo *TII = ST.getInstrInfo();
907 
908   unsigned Opcode = MI.getOpcode();
909 
910   // There are three different types of instructions which produce a
911   // forwarded dest:
912   //   1. SDWA with dst_sel != DWORD,
913   //   2. VOP3 instructions which write the hi bits (e.g. op_sel[3] == 1), and
914   //   3. CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2] != 0.
915   if (SIInstrInfo::isSDWA(MI)) {
916     // Type 1: SDWA with dst_sel != DWORD
917     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
918       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
919         return nullptr;
920   } else {
921     // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
922     // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
923     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
924         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
925               SISrcMods::DST_OP_SEL ||
926           (AMDGPU::isFP8DstSelInst(Opcode) &&
927            (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
928             SISrcMods::OP_SEL_0))))
929       return nullptr;
930   }
931 
932   return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
933 }
934 
935 /// Checks whether the provided \p MI "consumes" the operand with a Dest sel
936 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
937 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
938 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
939                                             const MachineOperand *Dst,
940                                             const SIRegisterInfo *TRI) {
941   // We must consider implicit reads of the VALU. SDWA with dst_sel and
942   // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
943   // and we must account for that hazard.
944   // We also must account for WAW hazards. In particular, WAW with dest
945   // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
946   // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
947   // check for ECC. Without accounting for this hazard, the ECC will be
948   // wrong.
949   // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
950   // complete zeroesHigh16BitsOfDest)
951   for (auto &Operand : VALU->operands()) {
952     if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
953       return true;
954     }
955   }
956   return false;
957 }
958 
959 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
960   int WaitStatesNeeded = 0;
961 
962   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
963     const int TransDefWaitstates = 1;
964 
965     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
966       if (!SIInstrInfo::isTRANS(MI))
967         return false;
968       const SIRegisterInfo *TRI = ST.getRegisterInfo();
969       const SIInstrInfo *TII = ST.getInstrInfo();
970       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
971 
972       for (const MachineOperand &Use : VALU->explicit_uses()) {
973         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
974           return true;
975       }
976 
977       return false;
978     };
979 
980     int WaitStatesNeededForDef =
981         TransDefWaitstates -
982         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
983     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
984   }
985 
986   if (ST.hasDstSelForwardingHazard()) {
987     const int Shift16DefWaitstates = 1;
988 
989     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
990       const SIRegisterInfo *TRI = ST.getRegisterInfo();
991       const MachineOperand *ForwardedDst =
992           getDstSelForwardingOperand(ProducerMI, ST);
993       if (ForwardedDst) {
994         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
995       }
996 
997       if (ProducerMI.isInlineAsm()) {
998         // Assume inline asm has dst forwarding hazard
999         for (auto &Def : ProducerMI.all_defs()) {
1000           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1001             return true;
1002         }
1003       }
1004 
1005       return false;
1006     };
1007 
1008     int WaitStatesNeededForDef =
1009         Shift16DefWaitstates -
1010         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1011     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1012   }
1013 
1014   if (ST.hasVDecCoExecHazard()) {
1015     const int VALUWriteSGPRVALUReadWaitstates = 2;
1016     const int VALUWriteEXECRWLane = 4;
1017     const int VALUWriteVGPRReadlaneRead = 1;
1018 
1019     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1020     const MachineRegisterInfo &MRI = MF.getRegInfo();
1021     Register UseReg;
1022     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1023       if (!SIInstrInfo::isVALU(MI))
1024         return false;
1025       return MI.modifiesRegister(UseReg, TRI);
1026     };
1027 
1028     for (const MachineOperand &Use : VALU->explicit_uses()) {
1029       if (!Use.isReg())
1030         continue;
1031 
1032       UseReg = Use.getReg();
1033       if (TRI->isSGPRReg(MRI, UseReg)) {
1034         int WaitStatesNeededForDef =
1035             VALUWriteSGPRVALUReadWaitstates -
1036             getWaitStatesSince(IsVALUDefSGPRFn,
1037                                VALUWriteSGPRVALUReadWaitstates);
1038         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1039       }
1040     }
1041 
1042     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1043       UseReg = AMDGPU::VCC;
1044       int WaitStatesNeededForDef =
1045           VALUWriteSGPRVALUReadWaitstates -
1046           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1047       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1048     }
1049 
1050     switch (VALU->getOpcode()) {
1051     case AMDGPU::V_READLANE_B32:
1052     case AMDGPU::V_READFIRSTLANE_B32: {
1053       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1054       UseReg = Src->getReg();
1055       int WaitStatesNeededForDef =
1056           VALUWriteVGPRReadlaneRead -
1057           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1058       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1059     }
1060       [[fallthrough]];
1061     case AMDGPU::V_WRITELANE_B32: {
1062       UseReg = AMDGPU::EXEC;
1063       int WaitStatesNeededForDef =
1064           VALUWriteEXECRWLane -
1065           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1066       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1067       break;
1068     }
1069     default:
1070       break;
1071     }
1072   }
1073 
1074   // This checks for the hazard where VMEM instructions that store more than
1075   // 8 bytes can have their store data overwritten by the next instruction.
1076   if (!ST.has12DWordStoreHazard())
1077     return WaitStatesNeeded;
1078 
1079   const MachineRegisterInfo &MRI = MF.getRegInfo();
1080 
1081   for (const MachineOperand &Def : VALU->defs()) {
1082     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1083   }
1084 
1085   return WaitStatesNeeded;
1086 }
1087 
1088 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1089   // This checks for hazards associated with inline asm statements.
1090   // Since inline asms can contain just about anything, we use this
1091   // to call/leverage other check*Hazard routines. Note that
1092   // this function doesn't attempt to address all possible inline asm
1093   // hazards (good luck), but is a collection of what has been
1094   // problematic thus far.
1095 
1096   // see checkVALUHazards()
1097   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
1098     return 0;
1099 
1100   const MachineRegisterInfo &MRI = MF.getRegInfo();
1101   int WaitStatesNeeded = 0;
1102 
1103   for (const MachineOperand &Op :
1104        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1105     if (Op.isReg() && Op.isDef()) {
1106       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1107         continue;
1108 
1109       if (ST.has12DWordStoreHazard()) {
1110         WaitStatesNeeded =
1111             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1112       }
1113     }
1114   }
1115 
1116   if (ST.hasDstSelForwardingHazard()) {
1117     const int Shift16DefWaitstates = 1;
1118 
1119     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1120       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1121       // Assume inline asm reads the dst
1122       if (Dst)
1123         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1124                IA->readsRegister(Dst->getReg(), &TRI);
1125 
1126       if (ProducerMI.isInlineAsm()) {
1127         // If MI is inline asm, assume it has dst forwarding hazard
1128         for (auto &Def : ProducerMI.all_defs()) {
1129           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1130               IA->readsRegister(Def.getReg(), &TRI)) {
1131             return true;
1132           }
1133         }
1134       }
1135 
1136       return false;
1137     };
1138 
1139     int WaitStatesNeededForDef =
1140         Shift16DefWaitstates -
1141         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1142     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1143   }
1144 
1145   return WaitStatesNeeded;
1146 }
1147 
1148 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1149   const SIInstrInfo *TII = ST.getInstrInfo();
1150   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1151   const MachineRegisterInfo &MRI = MF.getRegInfo();
1152 
1153   const MachineOperand *LaneSelectOp =
1154       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1155 
1156   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1157     return 0;
1158 
1159   Register LaneSelectReg = LaneSelectOp->getReg();
1160   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1161 
1162   const int RWLaneWaitStates = 4;
1163   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1164                                               RWLaneWaitStates);
1165   return RWLaneWaitStates - WaitStatesSince;
1166 }
1167 
1168 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1169   if (!ST.hasRFEHazards())
1170     return 0;
1171 
1172   const SIInstrInfo *TII = ST.getInstrInfo();
1173 
1174   const int RFEWaitStates = 1;
1175 
1176   auto IsHazardFn = [TII](const MachineInstr &MI) {
1177     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1178   };
1179   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1180   return RFEWaitStates - WaitStatesNeeded;
1181 }
1182 
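// Certain M0 consumers (VINTRP, s_movrel*, sendmsg/ttrace, GDS, LDS DMA and
// LDS-direct accesses) need one wait state after an SALU write to M0.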
1183 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1184   const SIInstrInfo *TII = ST.getInstrInfo();
1185   const int ReadM0WaitStates = 1;
1186   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1187   return ReadM0WaitStates -
1188          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1189 }
1190 
1191 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1192   fixVMEMtoScalarWriteHazards(MI);
1193   fixVcmpxPermlaneHazards(MI);
1194   fixSMEMtoVectorWriteHazards(MI);
1195   fixVcmpxExecWARHazard(MI);
1196   fixLdsBranchVmemWARHazard(MI);
1197   if (ST.hasLdsDirect()) {
1198     fixLdsDirectVALUHazard(MI);
1199     fixLdsDirectVMEMHazard(MI);
1200   }
1201   fixVALUPartialForwardingHazard(MI);
1202   fixVALUTransUseHazard(MI);
1203   fixWMMAHazards(MI);
1204   fixShift64HighRegBug(MI);
1205   fixVALUMaskWriteHazard(MI);
1206   fixVALUReadSGPRHazard(MI);
1207   fixRequiredExportPriority(MI);
1208 }
1209 
1210 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1211                               const MachineInstr &MI) {
1212   return (TII.isVOPC(MI) ||
1213           (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1214          MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1215 }
1216 
1217 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1218   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1219     return false;
1220 
1221   const SIInstrInfo *TII = ST.getInstrInfo();
1222   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1223   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1224     return isVCmpXWritesExec(*TII, *TRI, MI);
1225   };
1226 
1227   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1228     unsigned Opc = MI.getOpcode();
1229     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1230            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1231   };
1232 
1233   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1234       std::numeric_limits<int>::max())
1235     return false;
1236 
1237   // V_NOP will be discarded by SQ.
1238   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1239   // which is always a VGPR and available.
1240   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1241   Register Reg = Src0->getReg();
1242   bool IsUndef = Src0->isUndef();
1243   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1244           TII->get(AMDGPU::V_MOV_B32_e32))
1245     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1246     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1247 
1248   return true;
1249 }
1250 
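// A VMEM/DS/FLAT instruction may still be reading an SGPR when a following
// scalar instruction overwrites it (a WAR hazard); mitigate by waiting for the
// VMEM source reads to complete via s_waitcnt_depctr vm_vsrc(0).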
1251 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1252   if (!ST.hasVMEMtoScalarWriteHazard())
1253     return false;
1254   assert(!ST.hasExtendedWaitCounts());
1255 
1256   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1257     return false;
1258 
1259   if (MI->getNumDefs() == 0)
1260     return false;
1261 
1262   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1263 
1264   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1265     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1266         !SIInstrInfo::isFLAT(I))
1267       return false;
1268 
1269     for (const MachineOperand &Def : MI->defs()) {
1270       const MachineOperand *Op =
1271           I.findRegisterUseOperand(Def.getReg(), TRI, false);
1272       if (!Op)
1273         continue;
1274       return true;
1275     }
1276     return false;
1277   };
1278 
1279   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1280     return SIInstrInfo::isVALU(MI) ||
1281            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1282             !MI.getOperand(0).getImm()) ||
1283            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1284             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1285   };
1286 
1287   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1288       std::numeric_limits<int>::max())
1289     return false;
1290 
1291   const SIInstrInfo *TII = ST.getInstrInfo();
1292   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1293           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1294       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1295   return true;
1296 }
1297 
1298 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1299   if (!ST.hasSMEMtoVectorWriteHazard())
1300     return false;
1301   assert(!ST.hasExtendedWaitCounts());
1302 
1303   if (!SIInstrInfo::isVALU(*MI))
1304     return false;
1305 
1306   unsigned SDSTName;
1307   switch (MI->getOpcode()) {
1308   case AMDGPU::V_READLANE_B32:
1309   case AMDGPU::V_READFIRSTLANE_B32:
1310     SDSTName = AMDGPU::OpName::vdst;
1311     break;
1312   default:
1313     SDSTName = AMDGPU::OpName::sdst;
1314     break;
1315   }
1316 
1317   const SIInstrInfo *TII = ST.getInstrInfo();
1318   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1319   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1320   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1321   if (!SDST) {
1322     for (const auto &MO : MI->implicit_operands()) {
1323       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1324         SDST = &MO;
1325         break;
1326       }
1327     }
1328   }
1329 
1330   if (!SDST)
1331     return false;
1332 
1333   const Register SDSTReg = SDST->getReg();
1334   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1335     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1336   };
1337 
1338   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1339     if (TII->isSALU(MI)) {
1340       switch (MI.getOpcode()) {
1341       case AMDGPU::S_SETVSKIP:
1342       case AMDGPU::S_VERSION:
1343       case AMDGPU::S_WAITCNT_VSCNT:
1344       case AMDGPU::S_WAITCNT_VMCNT:
1345       case AMDGPU::S_WAITCNT_EXPCNT:
1346         // These instructions cannot mitigate the hazard.
1347         return false;
1348       case AMDGPU::S_WAITCNT_LGKMCNT:
1349         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1350         return (MI.getOperand(1).getImm() == 0) &&
1351                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1352       case AMDGPU::S_WAITCNT: {
1353         const int64_t Imm = MI.getOperand(0).getImm();
1354         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1355         // DsCnt corresponds to LGKMCnt here.
1356         return (Decoded.DsCnt == 0);
1357       }
1358       default:
1359         // SOPP instructions cannot mitigate the hazard.
1360         if (TII->isSOPP(MI))
1361           return false;
1362         // At this point the SALU can be assumed to mitigate the hazard
1363         // because either:
1364         // (a) it is independent of the at risk SMEM (breaking chain),
1365         // or
1366         // (b) it is dependent on the SMEM, in which case an appropriate
1367         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1368         //     SMEM instruction.
1369         return true;
1370       }
1371     }
1372     return false;
1373   };
1374 
1375   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1376       std::numeric_limits<int>::max())
1377     return false;
1378 
1379   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1380           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1381       .addImm(0);
1382   return true;
1383 }
1384 
1385 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1386   if (!ST.hasVcmpxExecWARHazard())
1387     return false;
1388   assert(!ST.hasExtendedWaitCounts());
1389 
1390   if (!SIInstrInfo::isVALU(*MI))
1391     return false;
1392 
1393   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1394   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1395     return false;
1396 
1397   auto IsHazardFn = [TRI](const MachineInstr &I) {
1398     if (SIInstrInfo::isVALU(I))
1399       return false;
1400     return I.readsRegister(AMDGPU::EXEC, TRI);
1401   };
1402 
1403   const SIInstrInfo *TII = ST.getInstrInfo();
1404   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1405     if (SIInstrInfo::isVALU(MI)) {
1406       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1407         return true;
1408       for (auto MO : MI.implicit_operands())
1409         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1410           return true;
1411     }
1412     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1413         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1414       return true;
1415     return false;
1416   };
1417 
1418   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1419       std::numeric_limits<int>::max())
1420     return false;
1421 
1422   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1423           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1424       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1425   return true;
1426 }
1427 
1428 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1429                                                  const GCNSubtarget &ST) {
1430   if (!ST.hasLdsBranchVmemWARHazard())
1431     return false;
1432 
1433   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1434   // instructions need to appear in the same function.
1435   bool HasLds = false;
1436   bool HasVmem = false;
1437   for (auto &MBB : MF) {
1438     for (auto &MI : MBB) {
1439       HasLds |= SIInstrInfo::isDS(MI);
1440       HasVmem |=
1441           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1442       if (HasLds && HasVmem)
1443         return true;
1444     }
1445   }
1446   return false;
1447 }
1448 
1449 static bool isStoreCountWaitZero(const MachineInstr &I) {
1450   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1451          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1452          !I.getOperand(1).getImm();
1453 }
1454 
1455 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1456   if (!RunLdsBranchVmemWARHazardFixup)
1457     return false;
1458 
1459   assert(ST.hasLdsBranchVmemWARHazard());
1460   assert(!ST.hasExtendedWaitCounts());
1461 
1462   auto IsHazardInst = [](const MachineInstr &MI) {
1463     if (SIInstrInfo::isDS(MI))
1464       return 1;
1465     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1466       return 2;
1467     return 0;
1468   };
1469 
1470   auto InstType = IsHazardInst(*MI);
1471   if (!InstType)
1472     return false;
1473 
1474   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1475     return IsHazardInst(I) || isStoreCountWaitZero(I);
1476   };
1477 
1478   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1479     if (!I.isBranch())
1480       return false;
1481 
1482     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1483       auto InstType2 = IsHazardInst(I);
1484       return InstType2 && InstType != InstType2;
1485     };
1486 
1487     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1488       auto InstType2 = IsHazardInst(I);
1489       if (InstType == InstType2)
1490         return true;
1491 
1492       return isStoreCountWaitZero(I);
1493     };
1494 
1495     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1496            std::numeric_limits<int>::max();
1497   };
1498 
1499   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1500       std::numeric_limits<int>::max())
1501     return false;
1502 
1503   const SIInstrInfo *TII = ST.getInstrInfo();
1504   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1505           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1506     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1507     .addImm(0);
1508 
1509   return true;
1510 }
1511 
1512 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1513   if (!SIInstrInfo::isLDSDIR(*MI))
1514     return false;
1515 
1516   const int NoHazardWaitStates = 15;
1517   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1518   const Register VDSTReg = VDST->getReg();
1519 
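  // For illustration only (hypothetical registers), the code below counts the
  // VALU instructions between the last VALU touching the LDSDIR vdst and MI,
  // e.g. one here:
  //   v_mov_b32 v1, v0                 ; VALU writes the LDSDIR vdst
  //   v_add_f32 v2, v3, v4             ; one intervening VALU
  //   lds_direct_load v1               ; MI
  // and encodes that count (clamped to 15, the no-hazard distance) in MI's
  // waitvdst operand; if any TRANS op was seen the count is forced to 0.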
1520   bool VisitedTrans = false;
1521   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1522     if (!SIInstrInfo::isVALU(I))
1523       return false;
1524     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1525     // Cover both WAR and WAW
1526     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1527   };
1528   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1529     if (WaitStates >= NoHazardWaitStates)
1530       return true;
1531     // Instructions which cause va_vdst==0 expire the hazard
1532     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1533            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1534   };
1535   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1536     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1537   };
1538 
1539   DenseSet<const MachineBasicBlock *> Visited;
1540   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1541                                     std::next(MI->getReverseIterator()), 0,
1542                                     IsExpiredFn, Visited, GetWaitStatesFn);
1543 
1544   // Transcendentals can execute in parallel to other VALUs.
1545   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1546   if (VisitedTrans)
1547     Count = 0;
1548 
1549   MachineOperand *WaitVdstOp =
1550       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1551   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1552 
1553   return true;
1554 }
1555 
1556 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1557   if (!SIInstrInfo::isLDSDIR(*MI))
1558     return false;
1559 
1560   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1561   const Register VDSTReg = VDST->getReg();
1562 
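  // For illustration only (hypothetical registers): a preceding VMEM/FLAT/DS
  // access touching the LDSDIR vdst, e.g.
  //   buffer_load_dword v1, ...        ; VMEM writes v1
  //   lds_direct_load v1               ; MI
  // is mitigated either by setting MI's waitvsrc operand to 0 (targets where
  // LDSDIR can wait on vmsrc) or by inserting "s_waitcnt_depctr vm_vsrc(0)".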
1563   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1564     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1565         !SIInstrInfo::isDS(I))
1566       return false;
1567     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1568   };
1569   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1570   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1571   // according to the type of VMEM instruction.
1572   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1573     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1574            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1575            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1576             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1577            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1578             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1579   };
1580 
1581   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1582       std::numeric_limits<int>::max())
1583     return false;
1584 
1585   if (LdsdirCanWait) {
1586     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1587   } else {
1588     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1589             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1590         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1591   }
1592 
1593   return true;
1594 }
1595 
1596 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1597   if (!ST.hasVALUPartialForwardingHazard())
1598     return false;
1599   assert(!ST.hasExtendedWaitCounts());
1600 
1601   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1602     return false;
1603 
1604   SmallSetVector<Register, 4> SrcVGPRs;
1605 
1606   for (const MachineOperand &Use : MI->explicit_uses()) {
1607     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1608       SrcVGPRs.insert(Use.getReg());
1609   }
1610 
1611   // Only applies with >= 2 unique VGPR sources
1612   if (SrcVGPRs.size() <= 1)
1613     return false;
1614 
1615   // Look for the following pattern:
1616   //   Va <- VALU [PreExecPos]
1617   //   intv1
1618   //   Exec <- SALU [ExecPos]
1619   //   intv2
1620   //   Vb <- VALU [PostExecPos]
1621   //   intv3
1622   //   MI Va, Vb (WaitState = 0)
1623   //
1624   // Where:
1625   // intv1 + intv2 <= 2 VALUs
1626   // intv3 <= 4 VALUs
1627   //
1628   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
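  // A hypothetical instance of the pattern above (registers are illustrative):
  //   v_mov_b32   v0, 0               ; Va <- VALU
  //   s_mov_b64   exec, s[0:1]        ; Exec <- SALU
  //   v_mov_b32   v1, 1               ; Vb <- VALU
  //   v_add_f32   v2, v0, v1          ; MI reads both Va and Vb
  // which is mitigated by the "s_waitcnt_depctr 0x0fff" inserted below.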
1629 
1630   const int Intv1plus2MaxVALUs = 2;
1631   const int Intv3MaxVALUs = 4;
1632   const int IntvMaxVALUs = 6;
1633   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1634 
1635   struct StateType {
1636     SmallDenseMap<Register, int, 4> DefPos;
1637     int ExecPos = std::numeric_limits<int>::max();
1638     int VALUs = 0;
1639   };
1640 
1641   StateType State;
1642 
1643   // This lambda combines the expiry test with all of the hazard detection.
1644   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1645     // Too many VALU states have passed
1646     if (State.VALUs > NoHazardVALUWaitStates)
1647       return HazardExpired;
1648 
1649     // Instructions which cause va_vdst==0 expire the hazard
1650     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1651         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1652         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1653          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1654       return HazardExpired;
1655 
1656     // Track register writes
1657     bool Changed = false;
1658     if (SIInstrInfo::isVALU(I)) {
1659       for (Register Src : SrcVGPRs) {
1660         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1661           State.DefPos[Src] = State.VALUs;
1662           Changed = true;
1663         }
1664       }
1665     } else if (SIInstrInfo::isSALU(I)) {
1666       if (State.ExecPos == std::numeric_limits<int>::max()) {
1667         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1668           State.ExecPos = State.VALUs;
1669           Changed = true;
1670         }
1671       }
1672     }
1673 
1674     // Early expiration: too many VALUs in intv3
1675     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1676       return HazardExpired;
1677 
1678     // Only evaluate state if something changed
1679     if (!Changed)
1680       return NoHazardFound;
1681 
1682     // Determine positions of VALUs pre/post exec change
1683     if (State.ExecPos == std::numeric_limits<int>::max())
1684       return NoHazardFound;
1685 
1686     int PreExecPos = std::numeric_limits<int>::max();
1687     int PostExecPos = std::numeric_limits<int>::max();
1688 
1689     for (auto Entry : State.DefPos) {
1690       int DefVALUs = Entry.second;
1691       if (DefVALUs != std::numeric_limits<int>::max()) {
1692         if (DefVALUs >= State.ExecPos)
1693           PreExecPos = std::min(PreExecPos, DefVALUs);
1694         else
1695           PostExecPos = std::min(PostExecPos, DefVALUs);
1696       }
1697     }
1698 
1699     // Need a VALU post exec change
1700     if (PostExecPos == std::numeric_limits<int>::max())
1701       return NoHazardFound;
1702 
1703     // Too many VALUs in intv3?
1704     int Intv3VALUs = PostExecPos;
1705     if (Intv3VALUs > Intv3MaxVALUs)
1706       return HazardExpired;
1707 
1708     // Too many VALUs in intv2?
1709     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1710     if (Intv2VALUs > Intv1plus2MaxVALUs)
1711       return HazardExpired;
1712 
1713     // Need a VALU pre exec change
1714     if (PreExecPos == std::numeric_limits<int>::max())
1715       return NoHazardFound;
1716 
1717     // Too many VALUs in intv1?
1718     int Intv1VALUs = PreExecPos - State.ExecPos;
1719     if (Intv1VALUs > Intv1plus2MaxVALUs)
1720       return HazardExpired;
1721 
1722     // Too many VALUs in intv1 + intv2
1723     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1724       return HazardExpired;
1725 
1726     return HazardFound;
1727   };
1728   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1729     if (SIInstrInfo::isVALU(MI))
1730       State.VALUs += 1;
1731   };
1732 
1733   DenseSet<const MachineBasicBlock *> Visited;
1734   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1735                             std::next(MI->getReverseIterator()), Visited))
1736     return false;
1737 
1738   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1739           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1740       .addImm(0x0fff);
1741 
1742   return true;
1743 }
1744 
1745 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1746   if (!ST.hasVALUTransUseHazard())
1747     return false;
1748   assert(!ST.hasExtendedWaitCounts());
1749 
1750   if (!SIInstrInfo::isVALU(*MI))
1751     return false;
1752 
1753   SmallSet<Register, 4> SrcVGPRs;
1754 
1755   for (const MachineOperand &Use : MI->explicit_uses()) {
1756     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1757       SrcVGPRs.insert(Use.getReg());
1758   }
1759 
1760   // Look for the following pattern:
1761   //   Va <- TRANS VALU
1762   //   intv
1763   //   MI Va (WaitState = 0)
1764   //
1765   // Where:
1766   // intv <= 5 VALUs / 1 TRANS
1767   //
1768   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
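  // A hypothetical instance of the pattern above (any TRANS op qualifies):
  //   v_exp_f32   v0, v1              ; Va <- TRANS VALU
  //   v_add_f32   v2, v3, v4          ; < 5 VALUs / 1 TRANS in between
  //   v_mul_f32   v5, v0, v2          ; MI reads Va
  // which is mitigated by the "s_waitcnt_depctr va_vdst(0)" inserted below.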
1769 
1770   const int IntvMaxVALUs = 5;
1771   const int IntvMaxTRANS = 1;
1772 
1773   struct StateType {
1774     int VALUs = 0;
1775     int TRANS = 0;
1776   };
1777 
1778   StateType State;
1779 
1780   // This lambda combines the expiry test with all of the hazard detection.
1781   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1782     // Too many VALU states have passed
1783     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1784       return HazardExpired;
1785 
1786     // Instructions which cause va_vdst==0 expire the hazard
1787     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1788         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1789         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1790          I.getOperand(0).getImm() == 0x0fff))
1791       return HazardExpired;
1792 
1793     // Track register writes
1794     if (SIInstrInfo::isTRANS(I)) {
1795       for (Register Src : SrcVGPRs) {
1796         if (I.modifiesRegister(Src, &TRI)) {
1797           return HazardFound;
1798         }
1799       }
1800     }
1801 
1802     return NoHazardFound;
1803   };
1804   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1805     if (SIInstrInfo::isVALU(MI))
1806       State.VALUs += 1;
1807     if (SIInstrInfo::isTRANS(MI))
1808       State.TRANS += 1;
1809   };
1810 
1811   DenseSet<const MachineBasicBlock *> Visited;
1812   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1813                             std::next(MI->getReverseIterator()), Visited))
1814     return false;
1815 
1816   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1817   // hazard is avoided.
1818   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1819           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1820       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1821 
1822   return true;
1823 }
1824 
1825 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1826   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1827     return false;
1828 
1829   const SIInstrInfo *TII = ST.getInstrInfo();
1830   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1831 
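  // For illustration only (hypothetical WMMA shapes/registers): back-to-back
  // WMMAs where the second reads the first's result as matrix A or B, e.g.
  //   v_wmma_f32_16x16x16_f16 v[0:7], v[8:15], v[16:23], v[0:7]
  //   v_wmma_f32_16x16x16_f16 v[24:31], v[0:7], v[16:23], v[24:31]
  // require a V_NOP between the two, inserted below.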
1832   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1833     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1834       return false;
1835 
1836     // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1837     // overlaps with the dest (matrix D) of the previous WMMA.
1838     const Register CurSrc0Reg =
1839         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1840     const Register CurSrc1Reg =
1841         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1842 
1843     const Register PrevDstReg =
1844         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1845 
1846     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1847         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1848       return true;
1849     }
1850 
1851     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1852     // but Index can't overlap with PrevDstReg.
1853     if (AMDGPU::isGFX12Plus(ST)) {
1854       if (SIInstrInfo::isSWMMAC(*MI)) {
1855         const Register CurIndex =
1856             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1857         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1858           return true;
1859       }
1860       return false;
1861     }
1862 
1863     return false;
1864   };
1865 
1866   auto IsExpiredFn = [](const MachineInstr &I, int) {
1867     return SIInstrInfo::isVALU(I);
1868   };
1869 
1870   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1871       std::numeric_limits<int>::max())
1872     return false;
1873 
1874   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1875 
1876   return true;
1877 }
1878 
1879 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1880   if (!ST.hasShift64HighRegBug())
1881     return false;
1882   assert(!ST.hasExtendedWaitCounts());
1883 
1884   switch (MI->getOpcode()) {
1885   default:
1886     return false;
1887   case AMDGPU::V_LSHLREV_B64_e64:
1888   case AMDGPU::V_LSHRREV_B64_e64:
1889   case AMDGPU::V_ASHRREV_I64_e64:
1890     break;
1891   }
1892 
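  // For illustration only (hypothetical registers): when the shift amount
  // lives in the last VGPR of an allocation block, e.g.
  //   v_lshlrev_b64 v[0:1], v7, v[2:3]
  // the code below rewrites it, roughly, to
  //   s_waitcnt 0
  //   v_swap_b32 v8, v7
  //   v_lshlrev_b64 v[0:1], v8, v[2:3]
  //   v_swap_b32 v7, v8
  // using a free VGPR (v8 here) that the instruction does not touch.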
1893   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1894   if (!Amt->isReg())
1895     return false;
1896 
1897   Register AmtReg = Amt->getReg();
1898   const MachineRegisterInfo &MRI = MF.getRegInfo();
1899   // Check if this is the last VGPR in the allocation block.
1900   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1901     return false;
1902 
1903   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1904     return false;
1905 
1906   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1907   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1908   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1909   bool Overlapped = OverlappedSrc || OverlappedDst;
1910 
1911   assert(!OverlappedDst || !OverlappedSrc ||
1912          Src1->getReg() == MI->getOperand(0).getReg());
1913   assert(ST.needsAlignedVGPRs());
1914   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1915 
1916   Register NewReg;
1917   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1918                                    : AMDGPU::VGPR_32RegClass) {
1919     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1920       NewReg = Reg;
1921       break;
1922     }
1923   }
1924 
1925   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1926                                : NewReg;
1927   Register NewAmtLo;
1928 
1929   if (Overlapped)
1930     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1931 
1932   DebugLoc DL = MI->getDebugLoc();
1933   MachineBasicBlock *MBB = MI->getParent();
1934   // Insert a full wait count as the found register might have a pending wait.
1935   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1936       .addImm(0);
1937 
1938   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1939   if (Overlapped)
1940     runOnInstruction(
1941         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1942             .addDef(AmtReg - 1)
1943             .addReg(AmtReg - 1, RegState::Undef)
1944             .addReg(NewAmtLo, RegState::Undef));
1945   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1946                        .addDef(AmtReg)
1947                        .addReg(AmtReg, RegState::Undef)
1948                        .addReg(NewAmt, RegState::Undef));
1949 
1950   // Instructions emitted after the current instruction will be processed by the
1951   // parent loop of the hazard recognizer in a natural way.
1952   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1953           AmtReg)
1954       .addDef(NewAmt)
1955       .addReg(NewAmt)
1956       .addReg(AmtReg);
1957   if (Overlapped)
1958     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1959             AmtReg - 1)
1960         .addDef(NewAmtLo)
1961         .addReg(NewAmtLo)
1962         .addReg(AmtReg - 1);
1963 
1964   // Re-running the hazard recognizer on the modified instruction is not needed:
1965   // the inserted V_SWAP_B32s have already both read and written the new
1966   // registers, so hazards related to these registers have already been handled.
1967   Amt->setReg(NewAmt);
1968   Amt->setIsKill(false);
1969   // We do not update liveness, so verifier may see it as undef.
1970   Amt->setIsUndef();
1971   if (OverlappedDst)
1972     MI->getOperand(0).setReg(NewReg);
1973   if (OverlappedSrc) {
1974     Src1->setReg(NewReg);
1975     Src1->setIsKill(false);
1976     Src1->setIsUndef();
1977   }
1978 
1979   return true;
1980 }
1981 
1982 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1983   int NSAtoVMEMWaitStates = 1;
1984 
1985   if (!ST.hasNSAtoVMEMBug())
1986     return 0;
1987 
1988   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1989     return 0;
1990 
1991   const SIInstrInfo *TII = ST.getInstrInfo();
1992   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1993   if (!Offset || (Offset->getImm() & 6) == 0)
1994     return 0;
1995 
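  // For illustration only (hypothetical operands): a preceding NSA-encoded
  // MIMG instruction whose encoding is at least 16 bytes, e.g.
  //   image_sample v[0:3], [v4, v6, v8], s[0:7], s[8:11] ...
  // followed by a MUBUF/MTBUF whose immediate offset has either of bits 1-2
  // set (offset & 6) requires one wait state.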
1996   auto IsHazardFn = [TII](const MachineInstr &I) {
1997     if (!SIInstrInfo::isMIMG(I))
1998       return false;
1999     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2000     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2001            TII->getInstSizeInBytes(I) >= 16;
2002   };
2003 
2004   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2005 }
2006 
2007 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2008   int FPAtomicToDenormModeWaitStates = 3;
2009 
2010   if (!ST.hasFPAtomicToDenormModeHazard())
2011     return 0;
2012   assert(!ST.hasExtendedWaitCounts());
2013 
2014   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2015     return 0;
2016 
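  // For illustration only (hypothetical sequence): an FP atomic VMEM/FLAT op
  // closely followed by
  //   s_denorm_mode 0xf
  // requires up to 3 wait states unless a VALU or one of the s_waitcnt-style
  // instructions listed below appears in between.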
2017   auto IsHazardFn = [](const MachineInstr &I) {
2018     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2019       return false;
2020     return SIInstrInfo::isFPAtomic(I);
2021   };
2022 
2023   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2024     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2025       return true;
2026 
2027     switch (MI.getOpcode()) {
2028     case AMDGPU::S_WAITCNT:
2029     case AMDGPU::S_WAITCNT_VSCNT:
2030     case AMDGPU::S_WAITCNT_VMCNT:
2031     case AMDGPU::S_WAITCNT_EXPCNT:
2032     case AMDGPU::S_WAITCNT_LGKMCNT:
2033     case AMDGPU::S_WAIT_IDLE:
2034       return true;
2035     default:
2036       break;
2037     }
2038 
2039     return false;
2040   };
2041 
2042   return FPAtomicToDenormModeWaitStates -
2043          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2044 }
2045 
2046 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2047   assert(SIInstrInfo::isMAI(*MI));
2048 
2049   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2050 }
2051 
2052 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2053   // Early exit if no padding is requested.
2054   if (MFMAPaddingRatio == 0)
2055     return 0;
2056 
2057   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2058   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2059     return 0;
2060 
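  // Worked example (hypothetical numbers): with MFMAPaddingRatio = 50 and a
  // neighboring MFMA whose pipeline takes 16 wait states, 8 wait states of
  // padding are targeted; if 3 have already elapsed since that MFMA, 5 more
  // wait states are still requested below.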
2061   int NeighborMFMALatency = 0;
2062   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2063                             this](const MachineInstr &MI) {
2064     if (!SIInstrInfo::isMFMA(MI))
2065       return false;
2066 
2067     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2068     return true;
2069   };
2070 
2071   const int MaxMFMAPipelineWaitStates = 16;
2072   int WaitStatesSinceNeighborMFMA =
2073       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2074 
2075   int NeighborMFMAPaddingNeeded =
2076       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2077       WaitStatesSinceNeighborMFMA;
2078 
2079   return std::max(0, NeighborMFMAPaddingNeeded);
2080 }
2081 
2082 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2083   int WaitStatesNeeded = 0;
2084   unsigned Opc = MI->getOpcode();
2085 
2086   auto IsVALUFn = [](const MachineInstr &MI) {
2087     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2088   };
2089 
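  // For illustration only (hypothetical shapes): on gfx908 an MFMA writing an
  // AGPR that a following v_accvgpr_read_b32 reads may require up to 18 wait
  // states between the two, depending on the MFMA's pass count; the constants
  // below encode those distances.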
2090   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2091     const int LegacyVALUWritesVGPRWaitStates = 2;
2092     const int VALUWritesExecWaitStates = 4;
2093     const int MaxWaitStates = 4;
2094 
2095     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2096       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2097     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2098 
2099     if (WaitStatesNeeded < MaxWaitStates) {
2100       for (const MachineOperand &Use : MI->explicit_uses()) {
2101         const int MaxWaitStates = 2;
2102 
2103         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2104           continue;
2105 
2106         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2107           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2108         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2109 
2110         if (WaitStatesNeeded == MaxWaitStates)
2111           break;
2112       }
2113     }
2114   }
2115 
2116   for (const MachineOperand &Op : MI->explicit_operands()) {
2117     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2118       continue;
2119 
2120     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2121       continue;
2122 
2123     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2124     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2125     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2126     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2127     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2128     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2129     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2130     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2131     const int MaxWaitStates = 18;
2132     Register Reg = Op.getReg();
2133     unsigned HazardDefLatency = 0;
2134 
2135     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2136                                this](const MachineInstr &MI) {
2137       if (!SIInstrInfo::isMFMA(MI))
2138         return false;
2139       Register DstReg = MI.getOperand(0).getReg();
2140       if (DstReg == Reg)
2141         return false;
2142       HazardDefLatency =
2143           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2144       return TRI.regsOverlap(DstReg, Reg);
2145     };
2146 
2147     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2148                                                    MaxWaitStates);
2149     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2150     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2151     int OpNo = Op.getOperandNo();
2152     if (OpNo == SrcCIdx) {
2153       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2154     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2155       switch (HazardDefLatency) {
2156       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2157                break;
2158       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2159                break;
2160       case 16: [[fallthrough]];
2161       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2162                break;
2163       }
2164     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2165       switch (HazardDefLatency) {
2166       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2167                break;
2168       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2169                break;
2170       case 16: [[fallthrough]];
2171       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2172                break;
2173       }
2174     }
2175 
2176     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2177     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2178 
2179     if (WaitStatesNeeded == MaxWaitStates)
2180       return WaitStatesNeeded; // Early exit.
2181 
2182     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2183       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2184         return false;
2185       Register DstReg = MI.getOperand(0).getReg();
2186       return TRI.regsOverlap(Reg, DstReg);
2187     };
2188 
2189     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2190     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2191     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2192     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2193     if (OpNo == SrcCIdx)
2194       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2195     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2196       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2197 
2198     WaitStatesNeededForUse = NeedWaitStates -
2199       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2200     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2201 
2202     if (WaitStatesNeeded == MaxWaitStates)
2203       return WaitStatesNeeded; // Early exit.
2204   }
2205 
2206   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2207     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2208     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2209     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2210     const int MaxWaitStates = 13;
2211     Register DstReg = MI->getOperand(0).getReg();
2212     unsigned HazardDefLatency = 0;
2213 
2214     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2215                          this](const MachineInstr &MI) {
2216       if (!SIInstrInfo::isMFMA(MI))
2217         return false;
2218       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2219       HazardDefLatency =
2220           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2221       return TRI.regsOverlap(Reg, DstReg);
2222     };
2223 
2224     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2225     int NeedWaitStates;
2226     switch (HazardDefLatency) {
2227     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2228              break;
2229     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2230              break;
2231     case 16: [[fallthrough]];
2232     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2233              break;
2234     }
2235 
2236     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2237     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2238   }
2239 
2240   // Pad neighboring MFMA with noops for better inter-wave performance.
2241   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2242 
2243   return WaitStatesNeeded;
2244 }
2245 
2246 static int
2247 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2248                                                               bool IsGFX950) {
2249   // xdl def cycles | gfx940 | gfx950
2250   // 2 pass         |  3     |  4
2251   // 4 pass         |  5     |  6
2252   // 8 pass         |  9     |  10
2253   // 16 pass        |  17    |  18
2254   return NumPasses + 1 + IsGFX950;
2255 }
2256 
2257 static int
2258 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2259                                                               bool IsGFX950) {
2260   // xdl def cycles | gfx940 | gfx950
2261   // 2 pass         |  3     |  3
2262   // 4 pass         |  5     |  6
2263   // 8 pass         |  9     |  10
2264   // 16 pass        |  17    |  18
2265   return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2266 }
2267 
2268 static int
2269 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2270   // 2 pass -> 2
2271   // 4 pass -> 4
2272   // 8 pass -> 8
2273   // 16 pass -> 16
2274   return NumPasses;
2275 }
2276 
2277 static int
2278 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2279   // 2 pass -> 4
2280   // 4 pass -> 6
2281   // 8 pass -> 10
2282   // 16 pass -> 18
2283   return NumPasses + 2;
2284 }
2285 
2286 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2287   // 2 pass -> 5
2288   // 4 pass -> 7
2289   // 8 pass -> 11
2290   // 16 pass -> 19
2291   return NumPasses + 3;
2292 }
2293 
2294 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2295   int WaitStatesNeeded = 0;
2296   unsigned Opc = MI->getOpcode();
2297 
2298   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2299     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2300   };
2301 
2302   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2303     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2304            !SIInstrInfo::isDOT(MI);
2305   };
2306 
2307   if (!SIInstrInfo::isMFMA(*MI))
2308     return WaitStatesNeeded;
2309 
2310   const int VALUWritesExecWaitStates = 4;
2311   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2312     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2313                           VALUWritesExecWaitStates);
2314   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2315 
2316   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2317 
2318   // Loop for both DGEMM and S/HGEMM 2nd instruction.
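  // For illustration only (hypothetical shapes): a DGEMM such as
  // v_mfma_f64_16x16x4f64 writing VGPRs that feed src2 (matrix C) of the next
  // MFMA may require up to 9 wait states (17 on gfx950); the constants below
  // encode those distances for the various producer/consumer combinations.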
2319   for (const MachineOperand &Use : MI->explicit_uses()) {
2320     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2321     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2322     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2323     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2324     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2325     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2326     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2327     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2328     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2329     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2330     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2331     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2332     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2333     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2334     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2335     const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2336     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2337     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2338     const int MaxWaitStates = 19;
2339 
2340     if (!Use.isReg())
2341       continue;
2342     Register Reg = Use.getReg();
2343     bool FullReg;
2344     const MachineInstr *MI1;
2345 
2346     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2347                                this](const MachineInstr &MI) {
2348       if (!SIInstrInfo::isMFMA(MI))
2349         return false;
2350       Register DstReg = MI.getOperand(0).getReg();
2351       FullReg = (DstReg == Reg);
2352       MI1 = &MI;
2353       return TRI.regsOverlap(DstReg, Reg);
2354     };
2355 
2356     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2357       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2358     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2359 
2360     int NumWaitStates =
2361         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2362     if (NumWaitStates == std::numeric_limits<int>::max())
2363       continue;
2364 
2365     int OpNo = Use.getOperandNo();
2366     unsigned Opc1 = MI1->getOpcode();
2367     int NeedWaitStates = 0;
2368     if (OpNo == SrcCIdx) {
2369       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2370         NeedWaitStates = 0;
2371       } else if (FullReg) {
2372         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2373              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2374             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2375              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2376           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2377         else if (ST.hasGFX940Insts() &&
2378                  TSchedModel.computeInstrLatency(MI1) == 2)
2379           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2380       } else {
2381         switch (Opc1) {
2382         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2383         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2384         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2385         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2386           if (!isXDL(ST, *MI))
2387             NeedWaitStates =
2388                 ST.hasGFX950Insts()
2389                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2390                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2391           break;
2392         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2393         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2394           if (!isXDL(ST, *MI))
2395             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2396           break;
2397         default:
2398           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2399           if (ST.hasGFX940Insts()) {
2400             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2401               break;
2402 
2403             NeedWaitStates =
2404                 isXDL(ST, *MI1)
2405                     ? (isXDL(ST, *MI)
2406                            ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2407                                  NumPasses, ST.hasGFX950Insts())
2408                            : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2409                                  NumPasses, ST.hasGFX950Insts()))
2410                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2411                           NumPasses);
2412             break;
2413           }
2414 
2415           switch (NumPasses) {
2416           case 2:
2417             NeedWaitStates =
2418                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2419                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2420             break;
2421           case 8:
2422             NeedWaitStates =
2423                 isDGEMM(Opc)
2424                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2425                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2426             break;
2427           case 16:
2428             NeedWaitStates =
2429                 isDGEMM(Opc)
2430                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2431                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2432             break;
2433           default:
2434             llvm_unreachable("unexpected number of passes");
2435           }
2436         }
2437       }
2438     } else {
2439       switch (Opc1) {
2440       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2441       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2442       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2443       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2444         NeedWaitStates =
2445             ST.hasGFX950Insts()
2446                 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2447                 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2448         break;
2449       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2450       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2451         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2452         break;
2453       default:
2454         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2455 
2456         if (ST.hasGFX940Insts()) {
2457           NeedWaitStates =
2458               isXDL(ST, *MI1)
2459                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2460                         NumPasses)
2461                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2462                         NumPasses);
2463           break;
2464         }
2465 
2466         switch (NumPasses) {
2467         case 2:
2468           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2469           break;
2470         case 4:
2471           llvm_unreachable("unexpected number of passes for mfma");
2472         case 8:
2473           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2474           break;
2475         case 16:
2476         default:
2477           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2478         }
2479       }
2480     }
2481     if (WaitStatesNeeded >= NeedWaitStates)
2482       continue;
2483 
2484     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2485     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2486 
2487     if (WaitStatesNeeded == MaxWaitStates)
2488       break;
2489   }
2490 
2491   // Pad neighboring MFMA with noops for better inter-wave performance.
2492   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2493 
2494   return WaitStatesNeeded;
2495 }
2496 
2497 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2498   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2499   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2500     return 0;
2501 
2502   int WaitStatesNeeded = 0;
2503 
2504   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2505     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2506   };
2507 
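  // For illustration only (hypothetical registers): a VGPR produced by
  //   v_accvgpr_read_b32 v0, a0
  // and consumed by a following load/store, e.g.
  //   ds_write_b32 v1, v0
  // requires up to 2 wait states between the two.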
2508   for (const MachineOperand &Op : MI->explicit_uses()) {
2509     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2510       continue;
2511 
2512     Register Reg = Op.getReg();
2513 
2514     const int AccVgprReadLdStWaitStates = 2;
2515     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2516     const int MaxWaitStates = 2;
2517 
2518     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2519       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2520     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2521 
2522     if (WaitStatesNeeded == MaxWaitStates)
2523       return WaitStatesNeeded; // Early exit.
2524 
2525     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2526       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2527           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2528         return false;
2529       auto IsVALUFn = [](const MachineInstr &MI) {
2530         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2531       };
2532       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2533              std::numeric_limits<int>::max();
2534     };
2535 
2536     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2537       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2538     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2539   }
2540 
2541   return WaitStatesNeeded;
2542 }
2543 
2544 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2545   assert(!ST.hasVcmpxPermlaneHazard() &&
2546          "this is a different vcmpx+permlane hazard");
2547   const SIRegisterInfo *TRI = ST.getRegisterInfo();
2548   const SIInstrInfo *TII = ST.getInstrInfo();
2549 
2550   auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2551     return isVCmpXWritesExec(*TII, *TRI, MI);
2552   };
2553 
2554   const int NumWaitStates = 4;
2555   return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
2556 }
2557 
2558 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2559   // 2 pass -> 4
2560   // 4 pass -> 6
2561   // 8 pass -> 10
2562   // 16 pass -> 18
2563   return NumPasses + 2;
2564 }
2565 
2566 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2567   // 2 pass -> 5
2568   // 4 pass -> 7
2569   // 8 pass -> 11
2570   // 16 pass -> 19
2571   return NumPasses + 3;
2572 }
2573 
2574 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2575   // 2 pass -> 5
2576   // 4 pass -> 7
2577   // 8 pass -> 11
2578   // 16 pass -> 19
2579   return NumPasses + 3;
2580 }
2581 
2582 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2583   // 2 pass -> 4
2584   // 4 pass -> 6
2585   // 8 pass -> 10
2586   // 16 pass -> 18
2587   return NumPasses + 2;
2588 }
2589 
2590 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2591   if (!ST.hasGFX90AInsts())
2592     return 0;
2593 
2594   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2595     return isDGEMM(MI.getOpcode());
2596   };
2597 
2598   // This is checked in checkMAIHazards90A()
2599   if (SIInstrInfo::isMFMA(*MI))
2600     return 0;
2601 
2602   const MachineRegisterInfo &MRI = MF.getRegInfo();
2603 
2604   int WaitStatesNeeded = 0;
2605 
2606   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2607                SIInstrInfo::isFLAT(*MI) ||
2608                SIInstrInfo::isDS(*MI);
2609   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2610   bool IsVALU = SIInstrInfo::isVALU(*MI);
2611 
2612   const MachineInstr *MFMA = nullptr;
2613   unsigned Reg;
2614   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2615     if (!SIInstrInfo::isMFMA(MI) ||
2616         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2617       return false;
2618     MFMA = &MI;
2619     return true;
2620   };
2621 
2622   const MachineInstr *DOT = nullptr;
2623   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2624     if (!SIInstrInfo::isDOT(MI) ||
2625         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2626       return false;
2627     DOT = &MI;
2628     return true;
2629   };
2630 
2631   bool DGEMMAfterVALUWrite = false;
2632   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2633     // Found DGEMM on reverse traversal to def.
2634     if (isDGEMM(MI.getOpcode()))
2635       DGEMMAfterVALUWrite = true;
2636 
2637     // Only a hazard if the register is defined by a VALU and a DGEMM is found
2638     // after the def.
2639     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2640       return false;
2641 
2642     return true;
2643   };
2644 
2645   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2646                                            AMDGPU::OpName::src2);
2647 
2648   if (IsMemOrExport || IsVALU) {
2649     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2650     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2651     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2652     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2653     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2654     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2655     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2656     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2657     const int DotWriteSameDotReadSrcAB = 3;
2658     const int DotWriteDifferentVALURead = 3;
2659     const int DMFMABetweenVALUWriteVMEMRead = 2;
2660     const int MaxWaitStates = 19;
2661 
2662     for (const MachineOperand &Use : MI->explicit_uses()) {
2663       if (!Use.isReg())
2664         continue;
2665       Reg = Use.getReg();
2666 
2667       DOT = nullptr;
2668       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2669                                                      MaxWaitStates);
2670       if (DOT) {
2671         int NeedWaitStates = 0;
2672         if (DOT->getOpcode() == MI->getOpcode()) {
2673           if (&Use - &MI->getOperand(0) != SrcCIdx)
2674             NeedWaitStates = DotWriteSameDotReadSrcAB;
2675         } else {
2676           NeedWaitStates = DotWriteDifferentVALURead;
2677         }
2678 
2679         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2680         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2681       }
2682 
2683       // Workaround for a HW data hazard bug observed only on GFX90A. When a
2684       // DGEMM instruction appears in between a VALU and a VMEM instruction, the
2685       // SQ incorrectly fails to insert the two wait states between those two
2686       // instructions that are needed to avoid the data hazard.
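      // For illustration only (hypothetical sequence):
      //   v_mov_b32 v0, ...                  ; VALU writes v0
      //   v_mfma_f64_16x16x4f64 ...          ; DGEMM in between
      //   flat_store_dword v[2:3], v0        ; MI (VMEM) reads v0
      // requires up to 2 extra wait states here.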
2687       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2688         DGEMMAfterVALUWrite = false;
2689         if (TRI.isVectorRegister(MRI, Reg)) {
2690           int WaitStatesNeededForUse =
2691                 DMFMABetweenVALUWriteVMEMRead -
2692                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2693                                       DMFMABetweenVALUWriteVMEMRead);
2694 
2695           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2696         }
2697       }
2698 
2699       MFMA = nullptr;
2700       WaitStatesSinceDef =
2701           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2702       if (!MFMA)
2703         continue;
2704 
2705       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2706       int NumPasses = HazardDefLatency;
2707       int NeedWaitStates = MaxWaitStates;
2708 
2709       if (isDGEMM(MFMA->getOpcode())) {
2710         switch (HazardDefLatency) {
2711         case 4:
2712           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2713                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2714           break;
2715         case 8:
2716         case 16:
2717           NeedWaitStates =
2718               IsMemOrExport
2719                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2720                   : (ST.hasGFX950Insts()
2721                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2722                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2723           break;
2724         default:
2725           llvm_unreachable("unexpected dgemm");
2726         }
2727       } else if (ST.hasGFX940Insts()) {
2728         NeedWaitStates =
2729             isXDL(ST, *MFMA)
2730                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2731                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2732                       NumPasses);
2733       } else {
2734         switch (HazardDefLatency) {
2735         case 2:
2736           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2737           break;
2738         case 8:
2739           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2740           break;
2741         case 16:
2742           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2743           break;
2744         default:
2745           llvm_unreachable("unexpected number of passes for mfma");
2746         }
2747       }
2748 
2749       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2750       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2751 
2752       if (WaitStatesNeeded == MaxWaitStates)
2753         break;
2754     }
2755   }
2756 
2757   unsigned Opc = MI->getOpcode();
2758   const int DMFMAToFMA64WaitStates = 2;
2759   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2760        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2761        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2762       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2763     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2764       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2765     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2766   }
2767 
2768   if (!IsVALU && !IsMemOrExport)
2769     return WaitStatesNeeded;
2770 
2771   for (const MachineOperand &Def : MI->defs()) {
2772     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2773     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2774     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2775     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2776     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2777     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2778     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2779     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2780     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2781     const int DotWriteDifferentVALUWrite = 3;
2782     const int MaxWaitStates = 19;
2783     const int MaxWarWaitStates = 15;
2784 
2785     Reg = Def.getReg();
2786 
2787     DOT = nullptr;
2788     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2789                                                    MaxWaitStates);
2790     if (DOT && DOT->getOpcode() != MI->getOpcode())
2791       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2792                                                     WaitStatesSinceDef);
2793 
2794     MFMA = nullptr;
2795     WaitStatesSinceDef =
2796         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2797     if (MFMA) {
2798       int NeedWaitStates = MaxWaitStates;
2799       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2800 
2801       if (isDGEMM(MFMA->getOpcode())) {
2802         switch (NumPasses) {
2803         case 4:
2804           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2805           break;
2806         case 8:
2807         case 16:
2808           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2809           break;
2810         default:
2811           llvm_unreachable("unexpected number of cycles for dgemm");
2812         }
2813       } else if (ST.hasGFX940Insts()) {
2814         NeedWaitStates =
2815             isXDL(ST, *MFMA)
2816                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2817                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2818       } else {
2819         switch (NumPasses) {
2820         case 2:
2821           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2822           break;
2823         case 8:
2824           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2825           break;
2826         case 16:
2827           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2828           break;
2829         default:
2830           llvm_unreachable("Unexpected number of passes for mfma");
2831         }
2832       }
2833 
2834       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2835       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2836 
2837       if (WaitStatesNeeded == MaxWaitStates)
2838         break;
2839     }
2840 
2841     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2842       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2843           !MI.readsRegister(Reg, &TRI))
2844         return false;
2845 
2846       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2847         return false;
2848 
2849       const MachineOperand *SrcC =
2850           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2851       assert(SrcC);
2852       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2853         return false;
2854 
2855       MFMA = &MI;
2856       return true;
2857     };
2858 
2859     MFMA = nullptr;
2860     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2861                                                 MaxWarWaitStates);
2862     if (!MFMA)
2863       continue;
2864 
2865     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2866     int NeedWaitStates = MaxWaitStates;
2867     switch (HazardDefLatency) {
2868     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2869              break;
2870     case 4:  assert(ST.hasGFX940Insts());
2871              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2872              break;
2873     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2874              break;
2875     case 16: [[fallthrough]];
2876     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2877              break;
2878     }
2879 
2880     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2881     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2882   }
2883 
2884   return WaitStatesNeeded;
2885 }
2886 
2887 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2888   if (!SU->isInstr())
2889     return false;
2890 
2891   const MachineInstr *MAI = nullptr;
2892 
2893   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2894     MAI = nullptr;
2895     if (SIInstrInfo::isMFMA(MI))
2896       MAI = &MI;
2897     return MAI != nullptr;
2898   };
2899 
2900   MachineInstr *MI = SU->getInstr();
2901   if (IsMFMAFn(*MI)) {
2902     int W = getWaitStatesSince(IsMFMAFn, 16);
2903     if (MAI)
2904       return W < (int)TSchedModel.computeInstrLatency(MAI);
2905   }
2906 
2907   return false;
2908 }
2909 
2910 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2911 // insertion of a new instruction.
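// For illustration only (a hypothetical bundle): inserting a 4-byte
// s_waitcnt_depctr into
//   s_getpc_b64 s[0:1]
//   s_add_u32   s0, s0, sym@rel32@lo+4
//   s_addc_u32  s1, s1, sym@rel32@hi+12
// requires adding 4 to the offset of every global-symbol operand that follows
// the insertion point so the PC-relative addresses remain correct.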
2912 static void updateGetPCBundle(MachineInstr *NewMI) {
2913   if (!NewMI->isBundled())
2914     return;
2915 
2916   // Find start of bundle.
2917   auto I = NewMI->getIterator();
2918   while (I->isBundledWithPred())
2919     I--;
2920   if (I->isBundle())
2921     I++;
2922 
2923   // Bail if this is not an S_GETPC bundle.
2924   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2925     return;
2926 
2927   // Update offsets of any references in the bundle.
2928   const unsigned NewBytes = 4;
2929   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2930          "Unexpected instruction insertion in bundle");
2931   auto NextMI = std::next(NewMI->getIterator());
2932   auto End = NewMI->getParent()->end();
2933   while (NextMI != End && NextMI->isBundledWithPred()) {
2934     for (auto &Operand : NextMI->operands()) {
2935       if (Operand.isGlobal())
2936         Operand.setOffset(Operand.getOffset() + NewBytes);
2937     }
2938     NextMI++;
2939   }
2940 }
2941 
2942 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2943   if (!ST.hasVALUMaskWriteHazard())
2944     return false;
2945   assert(!ST.hasExtendedWaitCounts());
2946 
2947   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2948     return false;
2949 
2950   // The hazard sequence is three instructions:
2951   //   1. VALU reads SGPR as mask
2952   //   2. SALU writes SGPR
2953   //   3. SALU reads SGPR
2954   // The hazard can expire if the distance between 2 and 3 is sufficient.
2955   // In practice this happens <10% of the time, hence to avoid searching this
2956   // code always assumes the hazard exists whenever 1 and 2 are present.
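  // A hypothetical instance (wave64, illustrative registers):
  //   v_cndmask_b32_e64 v0, v1, v2, s[0:1]   ; 1. VALU reads SGPR pair as mask
  //   s_mov_b64         s[0:1], exec         ; 2. SALU writes the SGPR (MI)
  //   s_and_b64         s[2:3], s[0:1], ...  ; 3. SALU reads the SGPR
  // which is mitigated by the "s_waitcnt_depctr sa_sdst(0)" inserted after 2.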
2957 
2958   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2959   if (!SDSTOp || !SDSTOp->isReg())
2960     return false;
2961 
2962   const Register HazardReg = SDSTOp->getReg();
2963   if (HazardReg == AMDGPU::EXEC ||
2964       HazardReg == AMDGPU::EXEC_LO ||
2965       HazardReg == AMDGPU::EXEC_HI ||
2966       HazardReg == AMDGPU::M0)
2967     return false;
2968 
2969   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2970     switch (I.getOpcode()) {
2971     case AMDGPU::V_ADDC_U32_e32:
2972     case AMDGPU::V_ADDC_U32_dpp:
2973     case AMDGPU::V_CNDMASK_B16_e32:
2974     case AMDGPU::V_CNDMASK_B16_dpp:
2975     case AMDGPU::V_CNDMASK_B32_e32:
2976     case AMDGPU::V_CNDMASK_B32_dpp:
2977     case AMDGPU::V_DIV_FMAS_F32_e64:
2978     case AMDGPU::V_DIV_FMAS_F64_e64:
2979     case AMDGPU::V_SUBB_U32_e32:
2980     case AMDGPU::V_SUBB_U32_dpp:
2981     case AMDGPU::V_SUBBREV_U32_e32:
2982     case AMDGPU::V_SUBBREV_U32_dpp:
2983       // These implicitly read VCC as mask source.
2984       return HazardReg == AMDGPU::VCC ||
2985              HazardReg == AMDGPU::VCC_LO ||
2986              HazardReg == AMDGPU::VCC_HI;
2987     case AMDGPU::V_ADDC_U32_e64:
2988     case AMDGPU::V_ADDC_U32_e64_dpp:
2989     case AMDGPU::V_CNDMASK_B16_e64:
2990     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2991     case AMDGPU::V_CNDMASK_B32_e64:
2992     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2993     case AMDGPU::V_SUBB_U32_e64:
2994     case AMDGPU::V_SUBB_U32_e64_dpp:
2995     case AMDGPU::V_SUBBREV_U32_e64:
2996     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2997       // Only check mask register overlaps.
2998       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2999       assert(SSRCOp);
3000       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3001     }
3002     default:
3003       return false;
3004     }
3005   };
3006 
3007   const MachineRegisterInfo &MRI = MF.getRegInfo();
3008   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3009     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3010     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3011         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3012       return true;
3013 
3014     // VALU access to any SGPR or literal constant other than HazardReg
3015     // mitigates hazard. No need to check HazardReg here as this will
3016     // only be called when !IsHazardFn.
3017     if (!SIInstrInfo::isVALU(I))
3018       return false;
3019     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3020       const MachineOperand &Op = I.getOperand(OpNo);
3021       if (Op.isReg()) {
3022         Register OpReg = Op.getReg();
3023         // Only consider uses
3024         if (!Op.isUse())
3025           continue;
3026         // Ignore EXEC
3027         if (OpReg == AMDGPU::EXEC ||
3028             OpReg == AMDGPU::EXEC_LO ||
3029             OpReg == AMDGPU::EXEC_HI)
3030           continue;
3031         // Ignore all implicit uses except VCC
3032         if (Op.isImplicit()) {
3033           if (OpReg == AMDGPU::VCC ||
3034               OpReg == AMDGPU::VCC_LO ||
3035               OpReg == AMDGPU::VCC_HI)
3036             return true;
3037           continue;
3038         }
3039         if (TRI.isSGPRReg(MRI, OpReg))
3040           return true;
3041       } else {
3042         const MCInstrDesc &InstDesc = I.getDesc();
3043         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3044         if (!TII.isInlineConstant(Op, OpInfo))
3045           return true;
3046       }
3047     }
3048     return false;
3049   };
3050 
3051   // Check for hazard
3052   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3053       std::numeric_limits<int>::max())
3054     return false;
3055 
3056   auto NextMI = std::next(MI->getIterator());
3057 
3058   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3059   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3060                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3061                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3062 
3063   // SALU write may be s_getpc in a bundle.
3064   updateGetPCBundle(NewMI);
3065 
3066   return true;
3067 }
3068 
3069 // Return the numeric ID 0-63 of the 64b SGPR pair containing a given SGPR.
3070 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
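// For example, assuming the usual SGPR encoding where SGPR<n> encodes to n:
// SGPR4 and SGPR5 both map to pair ID 2 (s[4:5]), while M0, EXEC and the null
// register have no pair and yield std::nullopt.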
3071 static std::optional<unsigned> sgprPairNumber(Register Reg,
3072                                               const SIRegisterInfo &TRI) {
3073   switch (Reg) {
3074   case AMDGPU::M0:
3075   case AMDGPU::EXEC:
3076   case AMDGPU::EXEC_LO:
3077   case AMDGPU::EXEC_HI:
3078   case AMDGPU::SGPR_NULL:
3079   case AMDGPU::SGPR_NULL64:
3080     return {};
3081   default:
3082     break;
3083   }
3084   unsigned RegN = TRI.getEncodingValue(Reg);
3085   if (RegN > 127)
3086     return {};
3087   return (RegN >> 1) & 0x3f;
3088 }
3089 
3090 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3091 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3092   assert(MMF == &MF);
3093 
3094   // Assume a non-empty vector means it has already been computed.
3095   if (!VALUReadHazardSGPRs.empty())
3096     return;
3097 
3098   auto CallingConv = MF.getFunction().getCallingConv();
3099   bool IsCallFree =
3100       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3101 
3102   // Exhaustive search is only viable in non-caller/callee functions where
3103   // VALUs will be exposed to the hazard recognizer.
3104   UseVALUReadHazardExhaustiveSearch =
3105       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3106       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3107 
3108   // Consider all SGPRs hazards if the shader uses function calls or is a callee.
3109   bool UseVALUUseCache =
3110       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3111   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3112   if (!UseVALUUseCache)
3113     return;
3114 
3115   // Perform a post-order reverse scan to find VALUs which read an SGPR
3116   // before a SALU write to the same SGPR.  This provides a reduction in
3117   // hazard insertion when all VALU access to an SGPR occurs after its last
3118   // SALU write, when compared to a linear scan.
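  // In the reverse scan, a VALU read of pair N is only flagged as hazardous if
  // a later read of N and a later SALU write of N have already been seen (or
  // the block lies in a cycle), since only then can the VALU-read, SALU-write,
  // read hazard sequence actually occur for that pair.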
3119   const MachineRegisterInfo &MRI = MF.getRegInfo();
3120   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3121   MachineCycleInfo CI;
3122   CI.compute(*MMF);
3123 
3124   for (auto *MBB : post_order(&MF)) {
3125     bool InCycle = CI.getCycle(MBB) != nullptr;
3126     for (auto &MI : reverse(MBB->instrs())) {
3127       bool IsVALU = SIInstrInfo::isVALU(MI);
3128       bool IsSALU = SIInstrInfo::isSALU(MI);
3129       if (!IsVALU && !IsSALU)
3130         continue;
3131 
3132       for (const MachineOperand &Op : MI.operands()) {
3133         if (!Op.isReg())
3134           continue;
3135         Register Reg = Op.getReg();
3136         assert(!Op.getSubReg());
3137         // Only consider implicit operands of VCC.
3138         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3139                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3140           continue;
3141         if (!TRI.isSGPRReg(MRI, Reg))
3142           continue;
3143         auto RegN = sgprPairNumber(Reg, TRI);
3144         if (!RegN)
3145           continue;
3146         if (IsVALU && Op.isUse()) {
3147           // Note: any access within a cycle must be considered a hazard.
3148           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3149             VALUReadHazardSGPRs.set(*RegN);
3150           ReadSGPRs.set(*RegN);
3151         } else if (IsSALU) {
3152           if (Op.isDef())
3153             SALUWriteSGPRs.set(*RegN);
3154           else
3155             ReadSGPRs.set(*RegN);
3156         }
3157       }
3158     }
3159   }
3160 }
3161 
3162 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3163   if (!ST.hasVALUReadSGPRHazard())
3164     return false;
3165 
3166   // The hazard sequence is fundamentally three instructions:
3167   //   1. VALU reads SGPR
3168   //   2. SALU writes SGPR
3169   //   3. VALU/SALU reads SGPR
3170   // Try to avoid searching for (1) because the expiry point of the hazard is
3171   // indeterminate; however, the hazard between (2) and (3) can expire if the
3172   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3173   // Note: SGPRs must be considered as 64-bit pairs as the hazard exists
3174   // even if only individual SGPRs are accessed.
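  // Illustrative sketch (not from this file) of the sequence being mitigated:
  //   v_add_f32_e32 v0, s4, v1   ; (1) VALU reads s4, i.e. pair s[4:5]
  //   s_mov_b32     s5, 0        ; (2) SALU writes into the same pair
  //   s_add_u32     s6, s5, s7   ; (3) SALU reads s5
  // The fix inserts "s_wait_alu sa_sdst(0)" immediately before (3).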
3175 
3176   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3177   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3178   if (!(MIIsSALU || MIIsVALU))
3179     return false;
3180 
3181   // Avoid the expensive search when compile time is the priority by
3182   // mitigating every SALU which writes an SGPR.
3183   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3184     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3185       return false;
3186 
3187     const MachineOperand *SDSTOp =
3188         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3189     if (!SDSTOp || !SDSTOp->isReg())
3190       return false;
3191 
3192     const Register HazardReg = SDSTOp->getReg();
3193     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3194         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3195       return false;
3196 
3197     // Add s_wait_alu sa_sdst(0) after SALU write.
3198     auto NextMI = std::next(MI->getIterator());
3199     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3200                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3201                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3202 
3203     // SALU write may be s_getpc in a bundle.
3204     updateGetPCBundle(NewMI);
3205 
3206     return true;
3207   }
3208 
3209   // Pre-compute set of SGPR pairs read by VALUs.
3210   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3211   computeVALUHazardSGPRs(MI->getMF());
3212 
3213   // If no VALU-hazard SGPRs exist then there is nothing to do.
3214   if (VALUReadHazardSGPRs.none())
3215     return false;
3216 
3217   // All SGPR writes before a call/return must be flushed as the callee/caller
3218   // will not see the hazard chain, i.e. (2) to (3) described above.
3219   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3220                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3221                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3222 
3223   // Collect all SGPR sources for MI which are read by a VALU.
3224   const MachineRegisterInfo &MRI = MF.getRegInfo();
3225   SmallSet<Register, 4> SGPRsUsed;
3226 
3227   if (!IsSetPC) {
3228     for (const MachineOperand &Op : MI->all_uses()) {
3229       Register OpReg = Op.getReg();
3230 
3231       // Only consider VCC implicit uses on VALUs.
3232       // The only expected SALU implicit access is SCC, which is not a hazard.
3233       if (MIIsSALU && Op.isImplicit())
3234         continue;
3235 
3236       if (!TRI.isSGPRReg(MRI, OpReg))
3237         continue;
3238 
3239       auto RegN = sgprPairNumber(OpReg, TRI);
3240       if (!RegN)
3241         continue;
3242 
3243       if (!VALUReadHazardSGPRs[*RegN])
3244         continue;
3245 
3246       SGPRsUsed.insert(OpReg);
3247     }
3248 
3249     // No SGPRs -> nothing to do.
3250     if (SGPRsUsed.empty())
3251       return false;
3252   }
3253 
3254   // A hazard is any SALU which writes one of the SGPRs read by MI.
3255   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3256     if (!SIInstrInfo::isSALU(I))
3257       return false;
3258     // Ensure SGPR flush before call/return by conservatively assuming every
3259     // SALU writes an SGPR.
3260     if (IsSetPC && I.getNumDefs() > 0)
3261       return true;
3262     // Check for any register writes.
3263     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3264       return I.modifiesRegister(Reg, &TRI);
3265     });
3266   };
3267 
3268   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3269   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3270     if (Count >= SALUExpiryCount)
3271       return true;
3272     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3273     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3274         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3275       return true;
3276     return false;
3277   };
3278 
3279   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3280     // Only count true SALUs as wait states.
3281     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3282       return 0;
3283     // SALU must be unrelated to any hazard registers.
3284     if (any_of(SGPRsUsed,
3285                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3286       return 0;
3287     return 1;
3288   };
3289 
3290   // Check for the hazard.
3291   DenseSet<const MachineBasicBlock *> Visited;
3292   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3293                                         std::next(MI->getReverseIterator()), 0,
3294                                         IsExpiredFn, Visited, WaitStatesFn);
3295 
3296   if (WaitStates >= SALUExpiryCount)
3297     return false;
3298 
3299   // Validate hazard through an exhaustive search.
3300   if (UseVALUReadHazardExhaustiveSearch) {
3301     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3302     // This is searching for (1) in the hazard description.
3303     auto hazardPair = [this](Register Reg) {
3304       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3305         return Register(AMDGPU::VCC);
3306       auto RegN = sgprPairNumber(Reg, TRI);
3307       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3308     };
3309     auto SearchHazardFn = [this, hazardPair,
3310                            &SGPRsUsed](const MachineInstr &I) {
3311       if (!SIInstrInfo::isVALU(I))
3312         return false;
3313       // Check for any register reads.
3314       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3315         return I.readsRegister(hazardPair(Reg), &TRI);
3316       });
3317     };
3318     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3319       return false;
3320     };
3321     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3322         std::numeric_limits<int>::max())
3323       return false;
3324   }
3325 
3326   // Add s_wait_alu sa_sdst(0) before SALU read.
3327   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3328                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3329                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3330 
3331   // SALU read may be after s_getpc in a bundle.
3332   updateGetPCBundle(NewMI);
3333 
3334   return true;
3335 }
3336 
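// Ensure the entry block starts with an S_SETPRIO of at least Priority.
// Returns true if an S_SETPRIO was inserted, false if an adequate one was
// already present.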
3337 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3338                                const SIInstrInfo &TII) {
3339   MachineBasicBlock &EntryMBB = MF->front();
3340   if (EntryMBB.begin() != EntryMBB.end()) {
3341     auto &EntryMI = *EntryMBB.begin();
3342     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3343         EntryMI.getOperand(0).getImm() >= Priority)
3344       return false;
3345   }
3346 
3347   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3348       .addImm(Priority);
3349   return true;
3350 }
3351 
3352 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3353   if (!ST.hasRequiredExportPriority())
3354     return false;
3355 
3356   // Assume the following shader types will never have exports,
3357   // and avoid adding or adjusting S_SETPRIO.
3358   MachineBasicBlock *MBB = MI->getParent();
3359   MachineFunction *MF = MBB->getParent();
3360   auto CC = MF->getFunction().getCallingConv();
3361   switch (CC) {
3362   case CallingConv::AMDGPU_CS:
3363   case CallingConv::AMDGPU_CS_Chain:
3364   case CallingConv::AMDGPU_CS_ChainPreserve:
3365   case CallingConv::AMDGPU_KERNEL:
3366     return false;
3367   default:
3368     break;
3369   }
3370 
3371   const int MaxPriority = 3;
3372   const int NormalPriority = 2;
3373   const int PostExportPriority = 0;
3374 
3375   auto It = MI->getIterator();
3376   switch (MI->getOpcode()) {
3377   case AMDGPU::S_ENDPGM:
3378   case AMDGPU::S_ENDPGM_SAVED:
3379   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3380   case AMDGPU::SI_RETURN_TO_EPILOG:
3381     // Ensure shader with calls raises priority at entry.
3382     // This ensures correct priority if exports exist in callee.
3383     if (MF->getFrameInfo().hasCalls())
3384       return ensureEntrySetPrio(MF, NormalPriority, TII);
3385     return false;
3386   case AMDGPU::S_SETPRIO: {
3387     // Raise minimum priority unless in workaround.
3388     auto &PrioOp = MI->getOperand(0);
3389     int Prio = PrioOp.getImm();
3390     bool InWA = (Prio == PostExportPriority) &&
3391                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3392     if (InWA || Prio >= NormalPriority)
3393       return false;
3394     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3395     return true;
3396   }
3397   default:
3398     if (!TII.isEXP(*MI))
3399       return false;
3400     break;
3401   }
3402 
3403   // Check entry priority at each export (as there will only be a few).
3404   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3405   bool Changed = false;
3406   if (CC != CallingConv::AMDGPU_Gfx)
3407     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3408 
3409   auto NextMI = std::next(It);
3410   bool EndOfShader = false;
3411   if (NextMI != MBB->end()) {
3412     // Only need WA at end of sequence of exports.
3413     if (TII.isEXP(*NextMI))
3414       return Changed;
3415     // Assume appropriate S_SETPRIO after export means WA already applied.
3416     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3417         NextMI->getOperand(0).getImm() == PostExportPriority)
3418       return Changed;
3419     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3420   }
3421 
3422   const DebugLoc &DL = MI->getDebugLoc();
3423 
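  // Emit the workaround sequence after the final export: drop to priority 0,
  // wait for outstanding exports (skipped at the end of the shader), pad with
  // two s_nop 0, then restore priority 2 (also skipped at the end of the
  // shader).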
3424   // Lower priority.
3425   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3426       .addImm(PostExportPriority);
3427 
3428   if (!EndOfShader) {
3429     // Wait for exports to complete.
3430     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3431         .addReg(AMDGPU::SGPR_NULL)
3432         .addImm(0);
3433   }
3434 
3435   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3436   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3437 
3438   if (!EndOfShader) {
3439     // Return to normal (higher) priority.
3440     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3441         .addImm(NormalPriority);
3442   }
3443 
3444   return true;
3445 }
3446