xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision db08d78c3e368ffbc8bef1b806d2c7179a5ccbf9)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/ScheduleDAG.h"
21 #include "llvm/TargetParser/TargetParser.h"
22 
23 using namespace llvm;
24 
25 namespace {
26 
27 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29 
30   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31     if (Arg.getAsInteger(0, Value))
32       return O.error("'" + Arg + "' value invalid for uint argument!");
33 
34     if (Value > 100)
35       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36 
37     return false;
38   }
39 };
40 
41 } // end anonymous namespace
42 
43 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45                      cl::desc("Fill a percentage of the latency between "
46                               "neighboring MFMA with s_nops."));
47 
48 static cl::opt<unsigned> MaxExhaustiveHazardSearch(
49     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50     cl::desc("Maximum function size for exhaustive hazard search"));
51 
52 //===----------------------------------------------------------------------===//
53 // Hazard Recognizer Implementation
54 //===----------------------------------------------------------------------===//
55 
56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57                                                  const GCNSubtarget &ST);
58 
59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60     : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61       ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62       TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63       UseVALUReadHazardExhaustiveSearch(false),
64       ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
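  // AGPR0 being allocated is used here as a cheap indicator that the function
  // may contain MFMA/AGPR instructions, whose hazards need the deepest
  // look-ahead window (19 wait states); otherwise a window of 5 suffices.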
65   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67 }
68 
69 void GCNHazardRecognizer::Reset() {
70   EmittedInstrs.clear();
71 }
72 
73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74   EmitInstruction(SU->getInstr());
75 }
76 
77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78   CurrCycleInstr = MI;
79 }
80 
81 static bool isDivFMas(unsigned Opcode) {
82   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83 }
84 
85 static bool isSGetReg(unsigned Opcode) {
86   return Opcode == AMDGPU::S_GETREG_B32;
87 }
88 
89 static bool isSSetReg(unsigned Opcode) {
90   switch (Opcode) {
91   case AMDGPU::S_SETREG_B32:
92   case AMDGPU::S_SETREG_B32_mode:
93   case AMDGPU::S_SETREG_IMM32_B32:
94   case AMDGPU::S_SETREG_IMM32_B32_mode:
95     return true;
96   }
97   return false;
98 }
99 
100 static bool isRWLane(unsigned Opcode) {
101   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102 }
103 
104 static bool isRFE(unsigned Opcode) {
105   return Opcode == AMDGPU::S_RFE_B64;
106 }
107 
108 static bool isSMovRel(unsigned Opcode) {
109   switch (Opcode) {
110   case AMDGPU::S_MOVRELS_B32:
111   case AMDGPU::S_MOVRELS_B64:
112   case AMDGPU::S_MOVRELD_B32:
113   case AMDGPU::S_MOVRELD_B64:
114     return true;
115   default:
116     return false;
117   }
118 }
119 
120 static bool isDGEMM(unsigned Opcode) {
121   return AMDGPU::getMAIIsDGEMM(Opcode);
122 }
123 
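// Returns true if \p MI is an MFMA that issues to the XDL pipeline: any MAI
// instruction other than a DGEMM or an accvgpr read/write. On gfx940+ the
// per-opcode table decides; on older targets every remaining MAI counts.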
124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125   unsigned Opcode = MI.getOpcode();
126 
127   if (!SIInstrInfo::isMAI(MI) ||
128       isDGEMM(Opcode) ||
129       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131     return false;
132 
133   if (!ST.hasGFX940Insts())
134     return true;
135 
136   return AMDGPU::getMAIIsGFX940XDL(Opcode);
137 }
138 
139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140                                     const MachineInstr &MI) {
141   if (TII.isAlwaysGDS(MI.getOpcode()))
142     return true;
143 
144   switch (MI.getOpcode()) {
145   case AMDGPU::S_SENDMSG:
146   case AMDGPU::S_SENDMSGHALT:
147   case AMDGPU::S_TTRACEDATA:
148     return true;
149   // These DS opcodes don't support GDS.
150   case AMDGPU::DS_NOP:
151   case AMDGPU::DS_PERMUTE_B32:
152   case AMDGPU::DS_BPERMUTE_B32:
153     return false;
154   default:
155     if (TII.isDS(MI.getOpcode())) {
156       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157                                            AMDGPU::OpName::gds);
158       if (MI.getOperand(GDS).getImm())
159         return true;
160     }
161     return false;
162   }
163 }
164 
165 static bool isPermlane(const MachineInstr &MI) {
166   unsigned Opcode = MI.getOpcode();
167   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168          Opcode == AMDGPU::V_PERMLANE64_B32 ||
169          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
172 }
173 
174 static bool isLdsDma(const MachineInstr &MI) {
175   return SIInstrInfo::isVALU(MI) &&
176          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
177 }
178 
179 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
180   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
181                                                      AMDGPU::OpName::simm16);
182   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
183 }
184 
185 ScheduleHazardRecognizer::HazardType
186 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
187   MachineInstr *MI = SU->getInstr();
188   // If we are not in "HazardRecognizerMode" we are being run from the
189   // scheduler; track possible stalls from hazards but don't insert noops.
190   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
191 
192   if (MI->isBundle())
193     return NoHazard;
194 
195   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
196     return HazardType;
197 
198   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
199     return HazardType;
200 
201   if (checkFPAtomicToDenormModeHazard(MI) > 0)
202     return HazardType;
203 
204   if (ST.hasNoDataDepHazard())
205     return NoHazard;
206 
207   // FIXME: Should flat be considered vmem?
208   if ((SIInstrInfo::isVMEM(*MI) ||
209        SIInstrInfo::isFLAT(*MI))
210       && checkVMEMHazards(MI) > 0)
211     return HazardType;
212 
213   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
214     return HazardType;
215 
216   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
217     return HazardType;
218 
219   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
220     return HazardType;
221 
222   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
223     return HazardType;
224 
225   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
226        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
227        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
228     return HazardType;
229 
230   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
231     return HazardType;
232 
233   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
234     return HazardType;
235 
236   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
237     return HazardType;
238 
239   if (((ST.hasReadM0MovRelInterpHazard() &&
240         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
241          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
242          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
243        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
244        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
245        (ST.hasReadM0LdsDirectHazard() &&
246         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
247       checkReadM0Hazards(MI) > 0)
248     return HazardType;
249 
250   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
251     return HazardType;
252 
253   if ((SIInstrInfo::isVMEM(*MI) ||
254        SIInstrInfo::isFLAT(*MI) ||
255        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
256     return HazardType;
257 
258   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
259     return HazardType;
260 
261   return NoHazard;
262 }
263 
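// Emit s_nop instructions covering \p Quantity wait states immediately before
// \p MI. A single s_nop covers at most 8 wait states (its immediate encodes
// the wait count minus one).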
264 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
265                                 unsigned Quantity) {
266   while (Quantity > 0) {
267     unsigned Arg = std::min(Quantity, 8u);
268     Quantity -= Arg;
269     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
270         .addImm(Arg - 1);
271   }
272 }
273 
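// Returns the number of cycles \p MI occupies its MFMA pipeline, taken from
// the ReleaseAtCycle of the first write resource in the scheduling model.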
274 unsigned
275 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
276   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
277   assert(TSchedModel.getWriteProcResBegin(SC) !=
278          TSchedModel.getWriteProcResEnd(SC));
279   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
280 }
281 
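// Run hazard detection on every instruction inside the current BUNDLE. In
// hazard recognizer mode this also applies fixups and inserts s_nops; either
// way, each bundled instruction and its wait states are recorded in
// EmittedInstrs for subsequent queries.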
282 void GCNHazardRecognizer::processBundle() {
283   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
284   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
285   // Check bundled MachineInstr's for hazards.
286   for (; MI != E && MI->isInsideBundle(); ++MI) {
287     CurrCycleInstr = &*MI;
288     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
289 
290     if (IsHazardRecognizerMode) {
291       fixHazards(CurrCycleInstr);
292 
293       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
294     }
295 
296     // It's unnecessary to track more than MaxLookAhead instructions. Since we
297     // include the bundled MI directly after, only add a maximum of
298     // (MaxLookAhead - 1) noops to EmittedInstrs.
299     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
300       EmittedInstrs.push_front(nullptr);
301 
302     EmittedInstrs.push_front(CurrCycleInstr);
303     EmittedInstrs.resize(MaxLookAhead);
304   }
305   CurrCycleInstr = nullptr;
306 }
307 
308 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
309   assert(IsHazardRecognizerMode);
310 
311   unsigned NumPreNoops = PreEmitNoops(MI);
312   EmitNoops(NumPreNoops);
313   if (MI->isInsideBundle())
314     insertNoopsInBundle(MI, TII, NumPreNoops);
315   else
316     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
317                     NumPreNoops);
318   EmitInstruction(MI);
319   AdvanceCycle();
320 }
321 
322 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
323   IsHazardRecognizerMode = true;
324   CurrCycleInstr = MI;
325   unsigned W = PreEmitNoopsCommon(MI);
326   fixHazards(MI);
327   CurrCycleInstr = nullptr;
328   return W;
329 }
330 
331 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
332   if (MI->isBundle())
333     return 0;
334 
335   int WaitStates = 0;
336 
337   if (SIInstrInfo::isSMRD(*MI))
338     return std::max(WaitStates, checkSMRDHazards(MI));
339 
340   if (ST.hasNSAtoVMEMBug())
341     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
342 
343   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
344 
345   if (ST.hasNoDataDepHazard())
346     return WaitStates;
347 
348   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
349     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
350 
351   if (SIInstrInfo::isVALU(*MI))
352     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
353 
354   if (SIInstrInfo::isDPP(*MI))
355     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
356 
357   if (isDivFMas(MI->getOpcode()))
358     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
359 
360   if (isRWLane(MI->getOpcode()))
361     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
362 
363   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
364        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
365        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
366     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
367 
368   if (MI->isInlineAsm())
369     return std::max(WaitStates, checkInlineAsmHazards(MI));
370 
371   if (isSGetReg(MI->getOpcode()))
372     return std::max(WaitStates, checkGetRegHazards(MI));
373 
374   if (isSSetReg(MI->getOpcode()))
375     return std::max(WaitStates, checkSetRegHazards(MI));
376 
377   if (isRFE(MI->getOpcode()))
378     return std::max(WaitStates, checkRFEHazards(MI));
379 
380   if ((ST.hasReadM0MovRelInterpHazard() &&
381        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
382         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
383         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
384       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
385       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
386       (ST.hasReadM0LdsDirectHazard() &&
387        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
388     return std::max(WaitStates, checkReadM0Hazards(MI));
389 
390   if (SIInstrInfo::isMAI(*MI))
391     return std::max(WaitStates, checkMAIHazards(MI));
392 
393   if (SIInstrInfo::isVMEM(*MI) ||
394       SIInstrInfo::isFLAT(*MI) ||
395       SIInstrInfo::isDS(*MI))
396     return std::max(WaitStates, checkMAILdStHazards(MI));
397 
398   return WaitStates;
399 }
400 
401 void GCNHazardRecognizer::EmitNoop() {
402   EmittedInstrs.push_front(nullptr);
403 }
404 
405 void GCNHazardRecognizer::AdvanceCycle() {
406   // When the scheduler detects a stall, it will call AdvanceCycle() without
407   // emitting any instructions.
408   if (!CurrCycleInstr) {
409     EmittedInstrs.push_front(nullptr);
410     return;
411   }
412 
413   if (CurrCycleInstr->isBundle()) {
414     processBundle();
415     return;
416   }
417 
418   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
419   if (!NumWaitStates) {
420     CurrCycleInstr = nullptr;
421     return;
422   }
423 
424   // Keep track of emitted instructions
425   EmittedInstrs.push_front(CurrCycleInstr);
426 
427   // Add a nullptr for each additional wait state after the first.  Make sure
428   // not to add more than getMaxLookAhead() items to the list, since we
429   // truncate the list to that size right after this loop.
430   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
431        i < e; ++i) {
432     EmittedInstrs.push_front(nullptr);
433   }
434 
435   // getMaxLookAhead() is the largest number of wait states we will ever need
436   // to insert, so there is no point in keeping track of more than that many
437   // wait states.
438   EmittedInstrs.resize(getMaxLookAhead());
439 
440   CurrCycleInstr = nullptr;
441 }
442 
443 void GCNHazardRecognizer::RecedeCycle() {
444   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
445 }
446 
447 //===----------------------------------------------------------------------===//
448 // Helper Functions
449 //===----------------------------------------------------------------------===//
450 
451 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
452 
453 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
454 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
455 
456 // Search for a hazard in a block and its predecessors.
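// \p IsHazard classifies each instruction as HazardFound, HazardExpired, or
// neither; \p UpdateState advances the caller-provided state across the
// instructions in between. Predecessor blocks are searched recursively, each
// with its own copy of the state, and each block is visited at most once.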
457 template <typename StateT>
458 static bool
459 hasHazard(StateT State,
460           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
461           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
462           const MachineBasicBlock *MBB,
463           MachineBasicBlock::const_reverse_instr_iterator I,
464           DenseSet<const MachineBasicBlock *> &Visited) {
465   for (auto E = MBB->instr_rend(); I != E; ++I) {
466     // No need to look at parent BUNDLE instructions.
467     if (I->isBundle())
468       continue;
469 
470     switch (IsHazard(State, *I)) {
471     case HazardFound:
472       return true;
473     case HazardExpired:
474       return false;
475     default:
476       // Continue search
477       break;
478     }
479 
480     if (I->isInlineAsm() || I->isMetaInstruction())
481       continue;
482 
483     UpdateState(State, *I);
484   }
485 
486   for (MachineBasicBlock *Pred : MBB->predecessors()) {
487     if (!Visited.insert(Pred).second)
488       continue;
489 
490     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
491                   Visited))
492       return true;
493   }
494 
495   return false;
496 }
497 
498 // Returns the minimum number of wait states since \p I, walking all
499 // predecessors. Only scans until \p IsExpired returns true.
500 // Can only be run in hazard recognizer mode.
501 static int getWaitStatesSince(
502     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
503     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
504     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
505     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
506   for (auto E = MBB->instr_rend(); I != E; ++I) {
507     // Don't add WaitStates for parent BUNDLE instructions.
508     if (I->isBundle())
509       continue;
510 
511     if (IsHazard(*I))
512       return WaitStates;
513 
514     if (I->isInlineAsm())
515       continue;
516 
517     WaitStates += GetNumWaitStates(*I);
518 
519     if (IsExpired(*I, WaitStates))
520       return std::numeric_limits<int>::max();
521   }
522 
523   int MinWaitStates = std::numeric_limits<int>::max();
524   for (MachineBasicBlock *Pred : MBB->predecessors()) {
525     if (!Visited.insert(Pred).second)
526       continue;
527 
528     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
529                                IsExpired, Visited, GetNumWaitStates);
530 
531     MinWaitStates = std::min(MinWaitStates, W);
532   }
533 
534   return MinWaitStates;
535 }
536 
537 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
538                               const MachineInstr *MI, IsExpiredFn IsExpired) {
539   DenseSet<const MachineBasicBlock *> Visited;
540   return getWaitStatesSince(IsHazard, MI->getParent(),
541                             std::next(MI->getReverseIterator()),
542                             0, IsExpired, Visited);
543 }
544 
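// Returns the number of wait states since the most recent instruction
// matching \p IsHazard, or INT_MAX if none is found within \p Limit. In
// hazard recognizer mode this walks the MIR backwards across block
// boundaries; otherwise it consults the EmittedInstrs window maintained by
// AdvanceCycle().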
545 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
546   if (IsHazardRecognizerMode) {
547     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
548       return WaitStates >= Limit;
549     };
550     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
551   }
552 
553   int WaitStates = 0;
554   for (MachineInstr *MI : EmittedInstrs) {
555     if (MI) {
556       if (IsHazard(*MI))
557         return WaitStates;
558 
559       if (MI->isInlineAsm())
560         continue;
561     }
562     ++WaitStates;
563 
564     if (WaitStates >= Limit)
565       break;
566   }
567   return std::numeric_limits<int>::max();
568 }
569 
570 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
571                                                IsHazardFn IsHazardDef,
572                                                int Limit) {
573   const SIRegisterInfo *TRI = ST.getRegisterInfo();
574 
575   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
576     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
577   };
578 
579   return getWaitStatesSince(IsHazardFn, Limit);
580 }
581 
582 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
583                                                   int Limit) {
584   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
585     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
586   };
587 
588   return getWaitStatesSince(IsHazardFn, Limit);
589 }
590 
591 //===----------------------------------------------------------------------===//
592 // No-op Hazard Detection
593 //===----------------------------------------------------------------------===//
594 
595 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
596                         MCRegister Reg) {
597   for (MCRegUnit Unit : TRI.regunits(Reg))
598     BV.set(Unit);
599 }
600 
601 static void addRegsToSet(const SIRegisterInfo &TRI,
602                          iterator_range<MachineInstr::const_mop_iterator> Ops,
603                          BitVector &DefSet, BitVector &UseSet) {
604   for (const MachineOperand &Op : Ops) {
605     if (Op.isReg())
606       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
607   }
608 }
609 
610 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
611   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
612 }
613 
614 static bool breaksSMEMSoftClause(MachineInstr *MI) {
615   return !SIInstrInfo::isSMRD(*MI);
616 }
617 
618 static bool breaksVMEMSoftClause(MachineInstr *MI) {
619   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
620 }
621 
622 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
623   // SMEM soft clause are only present on VI+, and only matter if xnack is
624   // enabled.
625   if (!ST.isXNACKEnabled())
626     return 0;
627 
628   bool IsSMRD = TII.isSMRD(*MEM);
629 
630   resetClause();
631 
632   // A soft-clause is any group of consecutive SMEM instructions.  The
633   // instructions in this group may return out of order and/or may be
634   // replayed (i.e. the same instruction issued more than once).
635   //
636   // In order to handle these situations correctly we need to make sure that
637   // when a clause has more than one instruction, no instruction in the clause
638   // writes to a register that is read by another instruction in the clause
639   // (including itself). If we encounter this situation, we need to break the
640   // clause by inserting a non-SMEM instruction.
641 
642   for (MachineInstr *MI : EmittedInstrs) {
643     // When we hit a non-SMEM instruction then we have passed the start of the
644     // clause and we can stop.
645     if (!MI)
646       break;
647 
648     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
649       break;
650 
651     addClauseInst(*MI);
652   }
653 
654   if (ClauseDefs.none())
655     return 0;
656 
657   // We need to make sure not to put loads and stores in the same clause if they
658   // use the same address. For now, just start a new clause whenever we see a
659   // store.
660   if (MEM->mayStore())
661     return 1;
662 
663   addClauseInst(*MEM);
664 
665   // If the set of defs and uses intersect then we cannot add this instruction
666   // to the clause, so we have a hazard.
667   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
668 }
669 
670 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
671   int WaitStatesNeeded = 0;
672 
673   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
674 
675   // This SMRD hazard only affects SI.
676   if (!ST.hasSMRDReadVALUDefHazard())
677     return WaitStatesNeeded;
678 
679   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
680   // SGPR was written by a VALU instruction.
681   int SmrdSgprWaitStates = 4;
682   auto IsHazardDefFn = [this](const MachineInstr &MI) {
683     return TII.isVALU(MI);
684   };
685   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
686     return TII.isSALU(MI);
687   };
688 
689   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
690 
691   for (const MachineOperand &Use : SMRD->uses()) {
692     if (!Use.isReg())
693       continue;
694     int WaitStatesNeededForUse =
695         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
696                                                    SmrdSgprWaitStates);
697     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
698 
699     // This fixes what appears to be undocumented hardware behavior in SI where
700     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
701     // need some number of nops in between. We don't know how many we need, but
702     // let's use 4. This wasn't discovered before probably because the only
703     // case when this happens is when we expand a 64-bit pointer into a full
704     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
705     // probably never encountered in the closed-source land.
706     if (IsBufferSMRD) {
707       int WaitStatesNeededForUse =
708         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
709                                                    IsBufferHazardDefFn,
710                                                    SmrdSgprWaitStates);
711       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
712     }
713   }
714 
715   return WaitStatesNeeded;
716 }
717 
718 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
719   if (!ST.hasVMEMReadSGPRVALUDefHazard())
720     return 0;
721 
722   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
723 
724   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
725   // SGPR was written by a VALU instruction.
726   const int VmemSgprWaitStates = 5;
727   auto IsHazardDefFn = [this](const MachineInstr &MI) {
728     return TII.isVALU(MI);
729   };
730   for (const MachineOperand &Use : VMEM->uses()) {
731     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
732       continue;
733 
734     int WaitStatesNeededForUse =
735         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
736                                                    VmemSgprWaitStates);
737     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
738   }
739   return WaitStatesNeeded;
740 }
741 
742 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
743   const SIRegisterInfo *TRI = ST.getRegisterInfo();
744   const SIInstrInfo *TII = ST.getInstrInfo();
745 
746   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
747   int DppVgprWaitStates = 2;
748   int DppExecWaitStates = 5;
749   int WaitStatesNeeded = 0;
750   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
751     return TII->isVALU(MI);
752   };
753 
754   for (const MachineOperand &Use : DPP->uses()) {
755     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
756       continue;
757     int WaitStatesNeededForUse =
758         DppVgprWaitStates - getWaitStatesSinceDef(
759                                 Use.getReg(),
760                                 [](const MachineInstr &) { return true; },
761                                 DppVgprWaitStates);
762     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
763   }
764 
765   WaitStatesNeeded = std::max(
766       WaitStatesNeeded,
767       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
768                                                 DppExecWaitStates));
769 
770   return WaitStatesNeeded;
771 }
772 
773 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
774   const SIInstrInfo *TII = ST.getInstrInfo();
775 
776   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
777   // instruction.
778   const int DivFMasWaitStates = 4;
779   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
780     return TII->isVALU(MI);
781   };
782   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
783                                                DivFMasWaitStates);
784 
785   return DivFMasWaitStates - WaitStatesNeeded;
786 }
787 
788 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
789   const SIInstrInfo *TII = ST.getInstrInfo();
790   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
791 
792   const int GetRegWaitStates = 2;
793   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
794     return GetRegHWReg == getHWReg(TII, MI);
795   };
796   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
797 
798   return GetRegWaitStates - WaitStatesNeeded;
799 }
800 
801 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
802   const SIInstrInfo *TII = ST.getInstrInfo();
803   unsigned HWReg = getHWReg(TII, *SetRegInstr);
804 
805   const int SetRegWaitStates = ST.getSetRegWaitStates();
806   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
807     return HWReg == getHWReg(TII, MI);
808   };
809   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
810   return SetRegWaitStates - WaitStatesNeeded;
811 }
812 
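// If \p MI is a VMEM/FLAT store whose store-data operand is wide enough
// (more than 64 bits) to be exposed to the VALU write hazard, returns the
// operand index of that data operand; otherwise returns -1. For MUBUF/MTBUF
// this only applies when no soffset register is used.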
813 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
814   if (!MI.mayStore())
815     return -1;
816 
817   const SIInstrInfo *TII = ST.getInstrInfo();
818   unsigned Opcode = MI.getOpcode();
819   const MCInstrDesc &Desc = MI.getDesc();
820 
821   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
822   int VDataRCID = -1;
823   if (VDataIdx != -1)
824     VDataRCID = Desc.operands()[VDataIdx].RegClass;
825 
826   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
827     // There is no hazard if the instruction does not use vector regs
828     // (like wbinvl1)
829     if (VDataIdx == -1)
830       return -1;
831     // For MUBUF/MTBUF instructions this hazard only exists if the
832     // instruction is not using a register in the soffset field.
833     const MachineOperand *SOffset =
834         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
835     // If we have no soffset operand, then assume this field has been
836     // hardcoded to zero.
837     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
838         (!SOffset || !SOffset->isReg()))
839       return VDataIdx;
840   }
841 
842   // MIMG instructions create a hazard if they don't use a 256-bit T# and
843   // the store size is greater than 8 bytes and they have more than two bits
844   // of their dmask set.
845   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
846   if (TII->isMIMG(MI)) {
847     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
848     assert(SRsrcIdx != -1 &&
849            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
850     (void)SRsrcIdx;
851   }
852 
853   if (TII->isFLAT(MI)) {
854     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
855     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
856       return DataIdx;
857   }
858 
859   return -1;
860 }
861 
862 int
863 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
864                                             const MachineRegisterInfo &MRI) {
865   // Helper to check for the hazard where VMEM instructions that store more than
866   // 8 bytes can have their store data overwritten by the next instruction.
867   const SIRegisterInfo *TRI = ST.getRegisterInfo();
868 
869   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
870   int WaitStatesNeeded = 0;
871 
872   if (!TRI->isVectorRegister(MRI, Def.getReg()))
873     return WaitStatesNeeded;
874   Register Reg = Def.getReg();
875   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
876     int DataIdx = createsVALUHazard(MI);
877     return DataIdx >= 0 &&
878            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
879   };
880 
881   int WaitStatesNeededForDef =
882     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
883   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
884 
885   return WaitStatesNeeded;
886 }
887 
888 /// A dest-sel forwarding issue occurs if additional logic is needed to
889 /// swizzle / pack the computed value into the correct bit position of the
890 /// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
891 /// have op_sel with dst_sel that is not aligned to the register. This
892 /// function analyzes the \p MI and \returns an operand with a dst forwarding
893 /// issue, or nullptr if none exists.
894 static const MachineOperand *
895 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
896   if (!SIInstrInfo::isVALU(MI))
897     return nullptr;
898 
899   const SIInstrInfo *TII = ST.getInstrInfo();
900 
901   unsigned Opcode = MI.getOpcode();
902 
903   // There are three different types of instructions
904   // which produce a forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
905   // which write the hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
906   // CVT_SR_BF8_F32 with op_sel[3:2]
907   // != 0
908   if (SIInstrInfo::isSDWA(MI)) {
909     // Type 1: SDWA with dst_sel != DWORD
910     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
911       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
912         return nullptr;
913   } else {
914     // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
915     // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
916     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
917         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
918               SISrcMods::DST_OP_SEL ||
919           (AMDGPU::isFP8DstSelInst(Opcode) &&
920            (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
921             SISrcMods::OP_SEL_0))))
922       return nullptr;
923   }
924 
925   return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
926 }
927 
928 /// Checks whether the provided \p VALU "consumes" the operand with a dest-sel
929 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
930 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
931 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
932                                             const MachineOperand *Dst,
933                                             const SIRegisterInfo *TRI) {
934   // We must consider implicit reads of the VALU. SDWA with dst_sel and
935   // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
936   // and we must account for that hazard.
937   // We also must account for WAW hazards. In particular, WAW with dest
938   // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
939   // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
940   // check for ECC. Without accounting for this hazard, the ECC will be
941   // wrong.
942   // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
943   // complete zeroesHigh16BitsOfDest)
944   for (auto &Operand : VALU->operands()) {
945     if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
946       return true;
947     }
948   }
949   return false;
950 }
951 
952 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
953   int WaitStatesNeeded = 0;
954 
955   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
956     const int TransDefWaitstates = 1;
957 
958     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
959       if (!SIInstrInfo::isTRANS(MI))
960         return false;
961       const SIRegisterInfo *TRI = ST.getRegisterInfo();
962       const SIInstrInfo *TII = ST.getInstrInfo();
963       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
964 
965       for (const MachineOperand &Use : VALU->explicit_uses()) {
966         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
967           return true;
968       }
969 
970       return false;
971     };
972 
973     int WaitStatesNeededForDef =
974         TransDefWaitstates -
975         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
976     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
977   }
978 
979   if (ST.hasDstSelForwardingHazard()) {
980     const int Shift16DefWaitstates = 1;
981 
982     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
983       const SIRegisterInfo *TRI = ST.getRegisterInfo();
984       const MachineOperand *ForwardedDst =
985           getDstSelForwardingOperand(ProducerMI, ST);
986       if (ForwardedDst) {
987         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
988       }
989 
990       if (ProducerMI.isInlineAsm()) {
991         // Assume inline asm has dst forwarding hazard
992         for (auto &Def : ProducerMI.all_defs()) {
993           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
994             return true;
995         }
996       }
997 
998       return false;
999     };
1000 
1001     int WaitStatesNeededForDef =
1002         Shift16DefWaitstates -
1003         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1004     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1005   }
1006 
1007   if (ST.hasVDecCoExecHazard()) {
1008     const int VALUWriteSGPRVALUReadWaitstates = 2;
1009     const int VALUWriteEXECRWLane = 4;
1010     const int VALUWriteVGPRReadlaneRead = 1;
1011 
1012     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1013     const MachineRegisterInfo &MRI = MF.getRegInfo();
1014     Register UseReg;
1015     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1016       if (!SIInstrInfo::isVALU(MI))
1017         return false;
1018       return MI.modifiesRegister(UseReg, TRI);
1019     };
1020 
1021     for (const MachineOperand &Use : VALU->explicit_uses()) {
1022       if (!Use.isReg())
1023         continue;
1024 
1025       UseReg = Use.getReg();
1026       if (TRI->isSGPRReg(MRI, UseReg)) {
1027         int WaitStatesNeededForDef =
1028             VALUWriteSGPRVALUReadWaitstates -
1029             getWaitStatesSince(IsVALUDefSGPRFn,
1030                                VALUWriteSGPRVALUReadWaitstates);
1031         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1032       }
1033     }
1034 
1035     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1036       UseReg = AMDGPU::VCC;
1037       int WaitStatesNeededForDef =
1038           VALUWriteSGPRVALUReadWaitstates -
1039           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1040       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1041     }
1042 
1043     switch (VALU->getOpcode()) {
1044     case AMDGPU::V_READLANE_B32:
1045     case AMDGPU::V_READFIRSTLANE_B32: {
1046       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1047       UseReg = Src->getReg();
1048       int WaitStatesNeededForDef =
1049           VALUWriteVGPRReadlaneRead -
1050           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1051       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1052     }
1053       [[fallthrough]];
1054     case AMDGPU::V_WRITELANE_B32: {
1055       UseReg = AMDGPU::EXEC;
1056       int WaitStatesNeededForDef =
1057           VALUWriteEXECRWLane -
1058           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1059       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1060       break;
1061     }
1062     default:
1063       break;
1064     }
1065   }
1066 
1067   // This checks for the hazard where VMEM instructions that store more than
1068   // 8 bytes can have their store data overwritten by the next instruction.
1069   if (!ST.has12DWordStoreHazard())
1070     return WaitStatesNeeded;
1071 
1072   const MachineRegisterInfo &MRI = MF.getRegInfo();
1073 
1074   for (const MachineOperand &Def : VALU->defs()) {
1075     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1076   }
1077 
1078   return WaitStatesNeeded;
1079 }
1080 
1081 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1082   // This checks for hazards associated with inline asm statements.
1083   // Since inline asms can contain just about anything, we use this
1084   // to call/leverage other check*Hazard routines. Note that
1085   // this function doesn't attempt to address all possible inline asm
1086   // hazards (good luck), but is a collection of what has been
1087   // problematic thus far.
1088 
1089   // see checkVALUHazards()
1090   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
1091     return 0;
1092 
1093   const MachineRegisterInfo &MRI = MF.getRegInfo();
1094   int WaitStatesNeeded = 0;
1095 
1096   for (const MachineOperand &Op :
1097        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1098     if (Op.isReg() && Op.isDef()) {
1099       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1100         continue;
1101 
1102       if (ST.has12DWordStoreHazard()) {
1103         WaitStatesNeeded =
1104             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1105       }
1106     }
1107   }
1108 
1109   if (ST.hasDstSelForwardingHazard()) {
1110     const int Shift16DefWaitstates = 1;
1111 
1112     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1113       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1114       // Assume inline asm reads the dst
1115       if (Dst)
1116         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1117                IA->readsRegister(Dst->getReg(), &TRI);
1118 
1119       if (ProducerMI.isInlineAsm()) {
1120         // If MI is inline asm, assume it has dst forwarding hazard
1121         for (auto &Def : ProducerMI.all_defs()) {
1122           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1123               IA->readsRegister(Def.getReg(), &TRI)) {
1124             return true;
1125           }
1126         }
1127       }
1128 
1129       return false;
1130     };
1131 
1132     int WaitStatesNeededForDef =
1133         Shift16DefWaitstates -
1134         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1135     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1136   }
1137 
1138   return WaitStatesNeeded;
1139 }
1140 
1141 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1142   const SIInstrInfo *TII = ST.getInstrInfo();
1143   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1144   const MachineRegisterInfo &MRI = MF.getRegInfo();
1145 
1146   const MachineOperand *LaneSelectOp =
1147       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1148 
1149   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1150     return 0;
1151 
1152   Register LaneSelectReg = LaneSelectOp->getReg();
1153   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1154 
1155   const int RWLaneWaitStates = 4;
1156   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1157                                               RWLaneWaitStates);
1158   return RWLaneWaitStates - WaitStatesSince;
1159 }
1160 
1161 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1162   if (!ST.hasRFEHazards())
1163     return 0;
1164 
1165   const SIInstrInfo *TII = ST.getInstrInfo();
1166 
1167   const int RFEWaitStates = 1;
1168 
1169   auto IsHazardFn = [TII](const MachineInstr &MI) {
1170     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1171   };
1172   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1173   return RFEWaitStates - WaitStatesNeeded;
1174 }
1175 
1176 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1177   const SIInstrInfo *TII = ST.getInstrInfo();
1178   const int ReadM0WaitStates = 1;
1179   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1180   return ReadM0WaitStates -
1181          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1182 }
1183 
1184 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1185   fixVMEMtoScalarWriteHazards(MI);
1186   fixVcmpxPermlaneHazards(MI);
1187   fixSMEMtoVectorWriteHazards(MI);
1188   fixVcmpxExecWARHazard(MI);
1189   fixLdsBranchVmemWARHazard(MI);
1190   if (ST.hasLdsDirect()) {
1191     fixLdsDirectVALUHazard(MI);
1192     fixLdsDirectVMEMHazard(MI);
1193   }
1194   fixVALUPartialForwardingHazard(MI);
1195   fixVALUTransUseHazard(MI);
1196   fixWMMAHazards(MI);
1197   fixShift64HighRegBug(MI);
1198   fixVALUMaskWriteHazard(MI);
1199   fixVALUReadSGPRHazard(MI);
1200   fixRequiredExportPriority(MI);
1201 }
1202 
1203 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1204   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1205     return false;
1206 
1207   const SIInstrInfo *TII = ST.getInstrInfo();
1208   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1209   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1210     return (TII->isVOPC(MI) ||
1211             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1212            MI.modifiesRegister(AMDGPU::EXEC, TRI);
1213   };
1214 
1215   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1216     unsigned Opc = MI.getOpcode();
1217     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1218            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1219   };
1220 
1221   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1222       std::numeric_limits<int>::max())
1223     return false;
1224 
1225   // V_NOP will be discarded by SQ.
1226   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1227   // which is always a VGPR and available.
1228   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1229   Register Reg = Src0->getReg();
1230   bool IsUndef = Src0->isUndef();
1231   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1232           TII->get(AMDGPU::V_MOV_B32_e32))
1233     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1234     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1235 
1236   return true;
1237 }
1238 
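// If \p MI is an SALU/SMEM instruction that overwrites an SGPR still read by
// an earlier, possibly in-flight VMEM/DS/FLAT instruction, insert
// "s_waitcnt_depctr vm_vsrc(0)" before it, unless an intervening VALU or a
// zeroing s_waitcnt already resolves the hazard.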
1239 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1240   if (!ST.hasVMEMtoScalarWriteHazard())
1241     return false;
1242   assert(!ST.hasExtendedWaitCounts());
1243 
1244   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1245     return false;
1246 
1247   if (MI->getNumDefs() == 0)
1248     return false;
1249 
1250   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1251 
1252   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1253     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1254         !SIInstrInfo::isFLAT(I))
1255       return false;
1256 
1257     for (const MachineOperand &Def : MI->defs()) {
1258       const MachineOperand *Op =
1259           I.findRegisterUseOperand(Def.getReg(), TRI, false);
1260       if (!Op)
1261         continue;
1262       return true;
1263     }
1264     return false;
1265   };
1266 
1267   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1268     return SIInstrInfo::isVALU(MI) ||
1269            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1270             !MI.getOperand(0).getImm()) ||
1271            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1272             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1273   };
1274 
1275   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1276       std::numeric_limits<int>::max())
1277     return false;
1278 
1279   const SIInstrInfo *TII = ST.getInstrInfo();
1280   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1281           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1282       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1283   return true;
1284 }
1285 
1286 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1287   if (!ST.hasSMEMtoVectorWriteHazard())
1288     return false;
1289   assert(!ST.hasExtendedWaitCounts());
1290 
1291   if (!SIInstrInfo::isVALU(*MI))
1292     return false;
1293 
1294   unsigned SDSTName;
1295   switch (MI->getOpcode()) {
1296   case AMDGPU::V_READLANE_B32:
1297   case AMDGPU::V_READFIRSTLANE_B32:
1298     SDSTName = AMDGPU::OpName::vdst;
1299     break;
1300   default:
1301     SDSTName = AMDGPU::OpName::sdst;
1302     break;
1303   }
1304 
1305   const SIInstrInfo *TII = ST.getInstrInfo();
1306   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1307   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1308   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1309   if (!SDST) {
1310     for (const auto &MO : MI->implicit_operands()) {
1311       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1312         SDST = &MO;
1313         break;
1314       }
1315     }
1316   }
1317 
1318   if (!SDST)
1319     return false;
1320 
1321   const Register SDSTReg = SDST->getReg();
1322   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1323     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1324   };
1325 
1326   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1327     if (TII->isSALU(MI)) {
1328       switch (MI.getOpcode()) {
1329       case AMDGPU::S_SETVSKIP:
1330       case AMDGPU::S_VERSION:
1331       case AMDGPU::S_WAITCNT_VSCNT:
1332       case AMDGPU::S_WAITCNT_VMCNT:
1333       case AMDGPU::S_WAITCNT_EXPCNT:
1334         // These instructions cannot mitigate the hazard.
1335         return false;
1336       case AMDGPU::S_WAITCNT_LGKMCNT:
1337         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1338         return (MI.getOperand(1).getImm() == 0) &&
1339                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1340       case AMDGPU::S_WAITCNT: {
1341         const int64_t Imm = MI.getOperand(0).getImm();
1342         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1343         // DsCnt corresponds to LGKMCnt here.
1344         return (Decoded.DsCnt == 0);
1345       }
1346       default:
1347         // SOPP instructions cannot mitigate the hazard.
1348         if (TII->isSOPP(MI))
1349           return false;
1350         // At this point the SALU can be assumed to mitigate the hazard
1351         // because either:
1352         // (a) it is independent of the at risk SMEM (breaking chain),
1353         // or
1354         // (b) it is dependent on the SMEM, in which case an appropriate
1355         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1356         //     SMEM instruction.
1357         return true;
1358       }
1359     }
1360     return false;
1361   };
1362 
1363   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1364       std::numeric_limits<int>::max())
1365     return false;
1366 
1367   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1368           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1369       .addImm(0);
1370   return true;
1371 }
1372 
1373 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1374   if (!ST.hasVcmpxExecWARHazard())
1375     return false;
1376   assert(!ST.hasExtendedWaitCounts());
1377 
1378   if (!SIInstrInfo::isVALU(*MI))
1379     return false;
1380 
1381   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1382   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1383     return false;
1384 
1385   auto IsHazardFn = [TRI](const MachineInstr &I) {
1386     if (SIInstrInfo::isVALU(I))
1387       return false;
1388     return I.readsRegister(AMDGPU::EXEC, TRI);
1389   };
1390 
1391   const SIInstrInfo *TII = ST.getInstrInfo();
1392   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1393     if (SIInstrInfo::isVALU(MI)) {
1394       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1395         return true;
1396       for (const MachineOperand &MO : MI.implicit_operands())
1397         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1398           return true;
1399     }
1400     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1401         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1402       return true;
1403     return false;
1404   };
1405 
1406   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1407       std::numeric_limits<int>::max())
1408     return false;
1409 
1410   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1411           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1412       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1413   return true;
1414 }
1415 
1416 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1417                                                  const GCNSubtarget &ST) {
1418   if (!ST.hasLdsBranchVmemWARHazard())
1419     return false;
1420 
1421   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1422   // instructions need to appear in the same function.
1423   bool HasLds = false;
1424   bool HasVmem = false;
1425   for (auto &MBB : MF) {
1426     for (auto &MI : MBB) {
1427       HasLds |= SIInstrInfo::isDS(MI);
1428       HasVmem |=
1429           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1430       if (HasLds && HasVmem)
1431         return true;
1432     }
1433   }
1434   return false;
1435 }
1436 
1437 static bool isStoreCountWaitZero(const MachineInstr &I) {
1438   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1439          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1440          !I.getOperand(1).getImm();
1441 }
1442 
1443 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1444   if (!RunLdsBranchVmemWARHazardFixup)
1445     return false;
1446 
1447   assert(ST.hasLdsBranchVmemWARHazard());
1448   assert(!ST.hasExtendedWaitCounts());
1449 
1450   auto IsHazardInst = [](const MachineInstr &MI) {
1451     if (SIInstrInfo::isDS(MI))
1452       return 1;
1453     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1454       return 2;
1455     return 0;
1456   };
1457 
1458   auto InstType = IsHazardInst(*MI);
1459   if (!InstType)
1460     return false;
1461 
1462   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1463     return IsHazardInst(I) || isStoreCountWaitZero(I);
1464   };
1465 
1466   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1467     if (!I.isBranch())
1468       return false;
1469 
1470     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1471       auto InstType2 = IsHazardInst(I);
1472       return InstType2 && InstType != InstType2;
1473     };
1474 
1475     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1476       auto InstType2 = IsHazardInst(I);
1477       if (InstType == InstType2)
1478         return true;
1479 
1480       return isStoreCountWaitZero(I);
1481     };
1482 
1483     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1484            std::numeric_limits<int>::max();
1485   };
1486 
1487   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1488       std::numeric_limits<int>::max())
1489     return false;
1490 
1491   const SIInstrInfo *TII = ST.getInstrInfo();
1492   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1493           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1494     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1495     .addImm(0);
1496 
1497   return true;
1498 }
1499 
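// Populate the waitvdst operand of an LDSDIR instruction: count the VALU
// instructions back to the most recent VALU that reads or writes the LDSDIR
// destination, clamping to 15 (the no-hazard value). If any transcendental
// VALU was seen the count is unreliable, so 0 is used instead.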
1500 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1501   if (!SIInstrInfo::isLDSDIR(*MI))
1502     return false;
1503 
1504   const int NoHazardWaitStates = 15;
1505   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1506   const Register VDSTReg = VDST->getReg();
1507 
1508   bool VisitedTrans = false;
1509   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1510     if (!SIInstrInfo::isVALU(I))
1511       return false;
1512     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1513     // Cover both WAR and WAW
1514     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1515   };
1516   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1517     if (WaitStates >= NoHazardWaitStates)
1518       return true;
1519     // Instructions which cause va_vdst==0 expire hazard
1520     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1521            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1522   };
1523   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1524     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1525   };
1526 
1527   DenseSet<const MachineBasicBlock *> Visited;
1528   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1529                                     std::next(MI->getReverseIterator()), 0,
1530                                     IsExpiredFn, Visited, GetWaitStatesFn);
1531 
1532   // Transcendentals can execute in parallel with other VALUs.
1533   // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1534   if (VisitedTrans)
1535     Count = 0;
1536 
1537   MachineOperand *WaitVdstOp =
1538       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1539   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1540 
1541   return true;
1542 }
1543 
1544 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1545   if (!SIInstrInfo::isLDSDIR(*MI))
1546     return false;
1547 
1548   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1549   const Register VDSTReg = VDST->getReg();
1550 
1551   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1552     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1553         !SIInstrInfo::isDS(I))
1554       return false;
1555     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1556   };
1557   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1558   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1559   // according to the type of VMEM instruction.
1560   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1561     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1562            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1563            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1564             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1565            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1566             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1567   };
1568 
1569   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1570       std::numeric_limits<int>::max())
1571     return false;
1572 
1573   if (LdsdirCanWait) {
1574     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1575   } else {
1576     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1577             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1578         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1579   }
1580 
1581   return true;
1582 }
1583 
1584 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1585   if (!ST.hasVALUPartialForwardingHazard())
1586     return false;
1587   assert(!ST.hasExtendedWaitCounts());
1588 
1589   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1590     return false;
1591 
1592   SmallSetVector<Register, 4> SrcVGPRs;
1593 
1594   for (const MachineOperand &Use : MI->explicit_uses()) {
1595     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1596       SrcVGPRs.insert(Use.getReg());
1597   }
1598 
1599   // Only applies with >= 2 unique VGPR sources
1600   if (SrcVGPRs.size() <= 1)
1601     return false;
1602 
1603   // Look for the following pattern:
1604   //   Va <- VALU [PreExecPos]
1605   //   intv1
1606   //   Exec <- SALU [ExecPos]
1607   //   intv2
1608   //   Vb <- VALU [PostExecPos]
1609   //   intv3
1610   //   MI Va, Vb (WaitState = 0)
1611   //
1612   // Where:
1613   // intv1 + intv2 <= 2 VALUs
1614   // intv3 <= 4 VALUs
1615   //
1616   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
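  //
  // For example, a hypothetical wave64 sequence matching the pattern:
  //   v_mov_b32 v0, ...          ; Va <- VALU
  //   s_mov_b64 exec, s[2:3]     ; Exec <- SALU
  //   v_mov_b32 v1, ...          ; Vb <- VALU
  //   v_add_f32 v2, v0, v1       ; MI reads both Va and Vb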
1617 
1618   const int Intv1plus2MaxVALUs = 2;
1619   const int Intv3MaxVALUs = 4;
1620   const int IntvMaxVALUs = 6;
1621   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1622 
1623   struct StateType {
1624     SmallDenseMap<Register, int, 4> DefPos;
1625     int ExecPos = std::numeric_limits<int>::max();
1626     int VALUs = 0;
1627   };
1628 
1629   StateType State;
1630 
1631   // This lambda combines expiry testing with the hazard detection itself.
1632   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1633     // Too many VALU states have passed
1634     if (State.VALUs > NoHazardVALUWaitStates)
1635       return HazardExpired;
1636 
1637     // Instructions which cause va_vdst==0 expire hazard
1638     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1639         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1640         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1641          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1642       return HazardExpired;
1643 
1644     // Track register writes
1645     bool Changed = false;
1646     if (SIInstrInfo::isVALU(I)) {
1647       for (Register Src : SrcVGPRs) {
1648         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1649           State.DefPos[Src] = State.VALUs;
1650           Changed = true;
1651         }
1652       }
1653     } else if (SIInstrInfo::isSALU(I)) {
1654       if (State.ExecPos == std::numeric_limits<int>::max()) {
1655         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1656           State.ExecPos = State.VALUs;
1657           Changed = true;
1658         }
1659       }
1660     }
1661 
1662     // Early expiration: too many VALUs in intv3
1663     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1664       return HazardExpired;
1665 
1666     // Only evaluate state if something changed
1667     if (!Changed)
1668       return NoHazardFound;
1669 
1670     // Determine positions of VALUs pre/post exec change
1671     if (State.ExecPos == std::numeric_limits<int>::max())
1672       return NoHazardFound;
1673 
1674     int PreExecPos = std::numeric_limits<int>::max();
1675     int PostExecPos = std::numeric_limits<int>::max();
1676 
1677     for (auto Entry : State.DefPos) {
1678       int DefVALUs = Entry.second;
1679       if (DefVALUs != std::numeric_limits<int>::max()) {
1680         if (DefVALUs >= State.ExecPos)
1681           PreExecPos = std::min(PreExecPos, DefVALUs);
1682         else
1683           PostExecPos = std::min(PostExecPos, DefVALUs);
1684       }
1685     }
1686 
1687     // Need a VALU def after the exec change
1688     if (PostExecPos == std::numeric_limits<int>::max())
1689       return NoHazardFound;
1690 
1691     // Too many VALUs in intv3?
1692     int Intv3VALUs = PostExecPos;
1693     if (Intv3VALUs > Intv3MaxVALUs)
1694       return HazardExpired;
1695 
1696     // Too many VALUs in intv2?
1697     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1698     if (Intv2VALUs > Intv1plus2MaxVALUs)
1699       return HazardExpired;
1700 
1701     // Need a VALU def before the exec change
1702     if (PreExecPos == std::numeric_limits<int>::max())
1703       return NoHazardFound;
1704 
1705     // Too many VALUs in intv1?
1706     int Intv1VALUs = PreExecPos - State.ExecPos;
1707     if (Intv1VALUs > Intv1plus2MaxVALUs)
1708       return HazardExpired;
1709 
1710     // Too many VALUs in intv1 + intv2
1711     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1712       return HazardExpired;
1713 
1714     return HazardFound;
1715   };
1716   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1717     if (SIInstrInfo::isVALU(MI))
1718       State.VALUs += 1;
1719   };
1720 
1721   DenseSet<const MachineBasicBlock *> Visited;
1722   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1723                             std::next(MI->getReverseIterator()), Visited))
1724     return false;
1725 
1726   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1727           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1728       .addImm(0x0fff);
1729 
1730   return true;
1731 }
1732 
1733 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1734   if (!ST.hasVALUTransUseHazard())
1735     return false;
1736   assert(!ST.hasExtendedWaitCounts());
1737 
1738   if (!SIInstrInfo::isVALU(*MI))
1739     return false;
1740 
1741   SmallSet<Register, 4> SrcVGPRs;
1742 
1743   for (const MachineOperand &Use : MI->explicit_uses()) {
1744     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1745       SrcVGPRs.insert(Use.getReg());
1746   }
1747 
1748   // Look for the following pattern:
1749   //   Va <- TRANS VALU
1750   //   intv
1751   //   MI Va (WaitState = 0)
1752   //
1753   // Where:
1754   // intv <= 5 VALUs / 1 TRANS
1755   //
1756   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
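  //
  // For example, a hypothetical sequence matching the pattern:
  //   v_rcp_f32 v0, v1           ; Va <- TRANS VALU
  //   v_add_f32 v2, v0, v3       ; MI reads Va within the interval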
1757 
1758   const int IntvMaxVALUs = 5;
1759   const int IntvMaxTRANS = 1;
1760 
1761   struct StateType {
1762     int VALUs = 0;
1763     int TRANS = 0;
1764   };
1765 
1766   StateType State;
1767 
1768   // This lambda combines expiry testing with the hazard detection itself.
1769   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1770     // Too many VALU states have passed
1771     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1772       return HazardExpired;
1773 
1774     // Instructions which cause va_vdst==0 expire hazard
1775     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1776         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1777         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1778          I.getOperand(0).getImm() == 0x0fff))
1779       return HazardExpired;
1780 
1781     // Track register writes
1782     if (SIInstrInfo::isTRANS(I)) {
1783       for (Register Src : SrcVGPRs) {
1784         if (I.modifiesRegister(Src, &TRI)) {
1785           return HazardFound;
1786         }
1787       }
1788     }
1789 
1790     return NoHazardFound;
1791   };
1792   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1793     if (SIInstrInfo::isVALU(MI))
1794       State.VALUs += 1;
1795     if (SIInstrInfo::isTRANS(MI))
1796       State.TRANS += 1;
1797   };
1798 
1799   DenseSet<const MachineBasicBlock *> Visited;
1800   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1801                             std::next(MI->getReverseIterator()), Visited))
1802     return false;
1803 
1804   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1805   // hazard is avoided.
1806   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1807           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1808       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1809 
1810   return true;
1811 }
1812 
1813 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1814   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1815     return false;
1816 
1817   const SIInstrInfo *TII = ST.getInstrInfo();
1818   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1819 
1820   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1821     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1822       return false;
1823 
1824     // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
1825     // overlaps with the dest (matrix D) of the previous wmma.
1826     const Register CurSrc0Reg =
1827         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1828     const Register CurSrc1Reg =
1829         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1830 
1831     const Register PrevDstReg =
1832         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1833 
1834     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1835         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1836       return true;
1837     }
1838 
1839     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1840     // but Index can't overlap with PrevDstReg.
1841     if (AMDGPU::isGFX12Plus(ST)) {
1842       if (SIInstrInfo::isSWMMAC(*MI)) {
1843         const Register CurIndex =
1844             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1845         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1846           return true;
1847       }
1848       return false;
1849     }
1850 
1851     return false;
1852   };
1853 
1854   auto IsExpiredFn = [](const MachineInstr &I, int) {
1855     return SIInstrInfo::isVALU(I);
1856   };
1857 
1858   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1859       std::numeric_limits<int>::max())
1860     return false;
1861 
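  // Any VALU between the two operations resolves the hazard (see IsExpiredFn
  // above), so a single V_NOP is sufficient.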
1862   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1863 
1864   return true;
1865 }
1866 
1867 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1868   if (!ST.hasShift64HighRegBug())
1869     return false;
1870   assert(!ST.hasExtendedWaitCounts());
1871 
1872   switch (MI->getOpcode()) {
1873   default:
1874     return false;
1875   case AMDGPU::V_LSHLREV_B64_e64:
1876   case AMDGPU::V_LSHRREV_B64_e64:
1877   case AMDGPU::V_ASHRREV_I64_e64:
1878     break;
1879   }
1880 
1881   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1882   if (!Amt->isReg())
1883     return false;
1884 
1885   Register AmtReg = Amt->getReg();
1886   const MachineRegisterInfo &MRI = MF.getRegInfo();
1887   // Check if this is the last VGPR in an allocation block (index 7 mod 8).
1888   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1889     return false;
1890 
1891   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1892     return false;
1893 
1894   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1895   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1896   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1897   bool Overlapped = OverlappedSrc || OverlappedDst;
1898 
1899   assert(!OverlappedDst || !OverlappedSrc ||
1900          Src1->getReg() == MI->getOperand(0).getReg());
1901   assert(ST.needsAlignedVGPRs());
1902   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1903 
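  // Strategy: pick an unused VGPR (or an aligned VGPR pair when the shift
  // amount overlaps src1/dst), V_SWAP the shift amount into it, rewrite MI to
  // use the new register, and V_SWAP the values back after MI.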
1904   Register NewReg;
1905   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1906                                    : AMDGPU::VGPR_32RegClass) {
1907     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1908       NewReg = Reg;
1909       break;
1910     }
1911   }
1912 
1913   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1914                                : NewReg;
1915   Register NewAmtLo;
1916 
1917   if (Overlapped)
1918     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1919 
1920   DebugLoc DL = MI->getDebugLoc();
1921   MachineBasicBlock *MBB = MI->getParent();
1922   // Insert a full wait count because the register found above might have a
1923   // pending wait.
1923   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1924       .addImm(0);
1925 
1926   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1927   if (Overlapped)
1928     runOnInstruction(
1929         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1930             .addDef(AmtReg - 1)
1931             .addReg(AmtReg - 1, RegState::Undef)
1932             .addReg(NewAmtLo, RegState::Undef));
1933   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1934                        .addDef(AmtReg)
1935                        .addReg(AmtReg, RegState::Undef)
1936                        .addReg(NewAmt, RegState::Undef));
1937 
1938   // Instructions emitted after the current instruction will be processed by the
1939   // parent loop of the hazard recognizer in a natural way.
1940   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1941           AmtReg)
1942       .addDef(NewAmt)
1943       .addReg(NewAmt)
1944       .addReg(AmtReg);
1945   if (Overlapped)
1946     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1947             AmtReg - 1)
1948         .addDef(NewAmtLo)
1949         .addReg(NewAmtLo)
1950         .addReg(AmtReg - 1);
1951 
1952   // Re-running the hazard recognizer on the modified instruction is not
1953   // necessary: the inserted V_SWAP_B32 has already both read and written the
1954   // new registers, so hazards related to these registers have been handled.
1955   Amt->setReg(NewAmt);
1956   Amt->setIsKill(false);
1957   // We do not update liveness, so the verifier may see it as undef.
1958   Amt->setIsUndef();
1959   if (OverlappedDst)
1960     MI->getOperand(0).setReg(NewReg);
1961   if (OverlappedSrc) {
1962     Src1->setReg(NewReg);
1963     Src1->setIsKill(false);
1964     Src1->setIsUndef();
1965   }
1966 
1967   return true;
1968 }
1969 
1970 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1971   int NSAtoVMEMWaitStates = 1;
1972 
1973   if (!ST.hasNSAtoVMEMBug())
1974     return 0;
1975 
1976   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1977     return 0;
1978 
1979   const SIInstrInfo *TII = ST.getInstrInfo();
1980   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1981   if (!Offset || (Offset->getImm() & 6) == 0)
1982     return 0;
1983 
1984   auto IsHazardFn = [TII](const MachineInstr &I) {
1985     if (!SIInstrInfo::isMIMG(I))
1986       return false;
1987     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1988     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1989            TII->getInstSizeInBytes(I) >= 16;
1990   };
1991 
1992   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1993 }
1994 
1995 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1996   int FPAtomicToDenormModeWaitStates = 3;
1997 
1998   if (!ST.hasFPAtomicToDenormModeHazard())
1999     return 0;
2000   assert(!ST.hasExtendedWaitCounts());
2001 
2002   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2003     return 0;
2004 
2005   auto IsHazardFn = [](const MachineInstr &I) {
2006     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2007       return false;
2008     return SIInstrInfo::isFPAtomic(I);
2009   };
2010 
2011   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2012     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2013       return true;
2014 
2015     switch (MI.getOpcode()) {
2016     case AMDGPU::S_WAITCNT:
2017     case AMDGPU::S_WAITCNT_VSCNT:
2018     case AMDGPU::S_WAITCNT_VMCNT:
2019     case AMDGPU::S_WAITCNT_EXPCNT:
2020     case AMDGPU::S_WAITCNT_LGKMCNT:
2021     case AMDGPU::S_WAIT_IDLE:
2022       return true;
2023     default:
2024       break;
2025     }
2026 
2027     return false;
2028   };
2029 
2030   return FPAtomicToDenormModeWaitStates -
2031          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2032 }
2033 
2034 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2035   assert(SIInstrInfo::isMAI(*MI));
2036 
2037   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2038 }
2039 
2040 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2041   // Early exit if no padding is requested.
2042   if (MFMAPaddingRatio == 0)
2043     return 0;
2044 
2045   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2046   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2047     return 0;
2048 
2049   int NeighborMFMALatency = 0;
2050   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2051                             this](const MachineInstr &MI) {
2052     if (!SIInstrInfo::isMFMA(MI))
2053       return false;
2054 
2055     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2056     return true;
2057   };
2058 
2059   const int MaxMFMAPipelineWaitStates = 16;
2060   int WaitStatesSinceNeighborMFMA =
2061       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2062 
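  // For example (hypothetical values): with a padding ratio of 50 and a
  // 16-cycle neighboring MFMA observed 3 wait states ago, the request is
  // 16 * 50 / 100 - 3 = 5 wait states of padding.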
2063   int NeighborMFMAPaddingNeeded =
2064       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2065       WaitStatesSinceNeighborMFMA;
2066 
2067   return std::max(0, NeighborMFMAPaddingNeeded);
2068 }
2069 
2070 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2071   int WaitStatesNeeded = 0;
2072   unsigned Opc = MI->getOpcode();
2073 
2074   auto IsVALUFn = [](const MachineInstr &MI) {
2075     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2076   };
2077 
2078   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2079     const int LegacyVALUWritesVGPRWaitStates = 2;
2080     const int VALUWritesExecWaitStates = 4;
2081     const int MaxWaitStates = 4;
2082 
2083     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2084       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2085     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2086 
2087     if (WaitStatesNeeded < MaxWaitStates) {
2088       for (const MachineOperand &Use : MI->explicit_uses()) {
2089         const int MaxWaitStates = 2;
2090 
2091         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2092           continue;
2093 
2094         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2095           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2096         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2097 
2098         if (WaitStatesNeeded == MaxWaitStates)
2099           break;
2100       }
2101     }
2102   }
2103 
2104   for (const MachineOperand &Op : MI->explicit_operands()) {
2105     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2106       continue;
2107 
2108     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2109       continue;
2110 
2111     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2112     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2113     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2114     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2115     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2116     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2117     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2118     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2119     const int MaxWaitStates = 18;
2120     Register Reg = Op.getReg();
2121     unsigned HazardDefLatency = 0;
2122 
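    // The producing MFMA's latency (in passes) identifies its shape in the
    // switches below: 2 passes for 4x4, 8 for 16x16, 16 for 32x32 variants.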
2123     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2124                                this](const MachineInstr &MI) {
2125       if (!SIInstrInfo::isMFMA(MI))
2126         return false;
2127       Register DstReg = MI.getOperand(0).getReg();
2128       if (DstReg == Reg)
2129         return false;
2130       HazardDefLatency =
2131           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2132       return TRI.regsOverlap(DstReg, Reg);
2133     };
2134 
2135     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2136                                                    MaxWaitStates);
2137     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2138     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2139     int OpNo = Op.getOperandNo();
2140     if (OpNo == SrcCIdx) {
2141       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2142     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2143       switch (HazardDefLatency) {
2144       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2145                break;
2146       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2147                break;
2148       case 16: [[fallthrough]];
2149       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2150                break;
2151       }
2152     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2153       switch (HazardDefLatency) {
2154       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2155                break;
2156       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2157                break;
2158       case 16: [[fallthrough]];
2159       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2160                break;
2161       }
2162     }
2163 
2164     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2165     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2166 
2167     if (WaitStatesNeeded == MaxWaitStates)
2168       return WaitStatesNeeded; // Early exit.
2169 
2170     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2171       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2172         return false;
2173       Register DstReg = MI.getOperand(0).getReg();
2174       return TRI.regsOverlap(Reg, DstReg);
2175     };
2176 
2177     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2178     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2179     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2180     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2181     if (OpNo == SrcCIdx)
2182       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2183     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2184       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2185 
2186     WaitStatesNeededForUse = NeedWaitStates -
2187       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2188     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2189 
2190     if (WaitStatesNeeded == MaxWaitStates)
2191       return WaitStatesNeeded; // Early exit.
2192   }
2193 
2194   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2195     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2196     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2197     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2198     const int MaxWaitStates = 13;
2199     Register DstReg = MI->getOperand(0).getReg();
2200     unsigned HazardDefLatency = 0;
2201 
2202     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2203                          this](const MachineInstr &MI) {
2204       if (!SIInstrInfo::isMFMA(MI))
2205         return false;
2206       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2207       HazardDefLatency =
2208           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2209       return TRI.regsOverlap(Reg, DstReg);
2210     };
2211 
2212     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2213     int NeedWaitStates;
2214     switch (HazardDefLatency) {
2215     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2216              break;
2217     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2218              break;
2219     case 16: [[fallthrough]];
2220     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2221              break;
2222     }
2223 
2224     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2225     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2226   }
2227 
2228   // Pad neighboring MFMA with noops for better inter-wave performance.
2229   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2230 
2231   return WaitStatesNeeded;
2232 }
2233 
2234 static int
2235 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses,
2236                                                          bool IsGFX950) {
2237   // xdl def cycles | gfx940 | gfx950
2238   // 2 pass         |    3   |    4
2239   // 4 pass         |    5   |    6
2240   // 8 pass         |    9   |   10
2241   // 16 pass        |   17   |   18
2242   return NumPasses + 1 + IsGFX950;
2243 }
2244 
2245 static int
2246 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2247   // 2 pass -> 2
2248   // 4 pass -> 4
2249   // 8 pass -> 8
2250   // 16 pass -> 16
2251   return NumPasses;
2252 }
2253 
2254 static int
2255 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2256   // 2 pass -> 4
2257   // 4 pass -> 6
2258   // 8 pass -> 10
2259   // 16 pass -> 18
2260   return NumPasses + 2;
2261 }
2262 
2263 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2264   // 2 pass -> 5
2265   // 4 pass -> 7
2266   // 8 pass -> 11
2267   // 16 pass -> 19
2268   return NumPasses + 3;
2269 }
2270 
2271 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2272   int WaitStatesNeeded = 0;
2273   unsigned Opc = MI->getOpcode();
2274 
2275   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2276     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2277   };
2278 
2279   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2280     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2281            !SIInstrInfo::isDOT(MI);
2282   };
2283 
2284   if (!SIInstrInfo::isMFMA(*MI))
2285     return WaitStatesNeeded;
2286 
2287   const int VALUWritesExecWaitStates = 4;
2288   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2289     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2290                           VALUWritesExecWaitStates);
2291   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2292 
2293   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2294 
2295   // Loop for both DGEMM and the second S/HGEMM instruction.
2296   for (const MachineOperand &Use : MI->explicit_uses()) {
2297     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2298     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2299     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2300     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2301     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2302     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2303     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2304     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2305     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2306     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2307     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2308     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2309     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2310     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2311     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2312     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2313     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2314     const int MaxWaitStates = 19;
2315 
2316     if (!Use.isReg())
2317       continue;
2318     Register Reg = Use.getReg();
2319     bool FullReg;
2320     const MachineInstr *MI1;
2321 
2322     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2323                                this](const MachineInstr &MI) {
2324       if (!SIInstrInfo::isMFMA(MI))
2325         return false;
2326       Register DstReg = MI.getOperand(0).getReg();
2327       FullReg = (DstReg == Reg);
2328       MI1 = &MI;
2329       return TRI.regsOverlap(DstReg, Reg);
2330     };
2331 
2332     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2333       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2334     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2335 
2336     int NumWaitStates =
2337         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2338     if (NumWaitStates == std::numeric_limits<int>::max())
2339       continue;
2340 
2341     int OpNo = Use.getOperandNo();
2342     unsigned Opc1 = MI1->getOpcode();
2343     int NeedWaitStates = 0;
2344     if (OpNo == SrcCIdx) {
2345       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2346         NeedWaitStates = 0;
2347       } else if (FullReg) {
2348         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2349              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2350             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2351              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2352           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2353         else if (ST.hasGFX940Insts() &&
2354                  TSchedModel.computeInstrLatency(MI1) == 2)
2355           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2356       } else {
2357         switch (Opc1) {
2358         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2359         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2360         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2361         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2362           if (!isXDL(ST, *MI))
2363             NeedWaitStates =
2364                 ST.hasGFX950Insts()
2365                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2366                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2367           break;
2368         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2369         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2370           if (!isXDL(ST, *MI))
2371             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2372           break;
2373         default:
2374           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2375           if (ST.hasGFX940Insts()) {
2376             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2377               break;
2378 
2379             NeedWaitStates =
2380                 isXDL(ST, *MI1)
2381                     ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2382                           NumPasses, ST.hasGFX950Insts())
2383                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2384                           NumPasses);
2385             break;
2386           }
2387 
2388           switch (NumPasses) {
2389           case 2:
2390             NeedWaitStates =
2391                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2392                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2393             break;
2394           case 8:
2395             NeedWaitStates =
2396                 isDGEMM(Opc)
2397                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2398                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2399             break;
2400           case 16:
2401             NeedWaitStates =
2402                 isDGEMM(Opc)
2403                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2404                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2405             break;
2406           default:
2407             llvm_unreachable("unexpected number of passes");
2408           }
2409         }
2410       }
2411     } else {
2412       switch (Opc1) {
2413       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2414       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2415       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2416       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2417         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2418         break;
2419       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2420       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2421         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2422         break;
2423       default:
2424         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2425 
2426         if (ST.hasGFX940Insts()) {
2427           NeedWaitStates =
2428               isXDL(ST, *MI1)
2429                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2430                         NumPasses)
2431                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2432                         NumPasses);
2433           break;
2434         }
2435 
2436         switch (NumPasses) {
2437         case 2:
2438           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2439           break;
2440         case 4:
2441           llvm_unreachable("unexpected number of passes for mfma");
2442         case 8:
2443           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2444           break;
2445         case 16:
2446         default:
2447           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2448         }
2449       }
2450     }
2451     if (WaitStatesNeeded >= NeedWaitStates)
2452       continue;
2453 
2454     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2455     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2456 
2457     if (WaitStatesNeeded == MaxWaitStates)
2458       break;
2459   }
2460 
2461   // Pad neighboring MFMA with noops for better inter-wave performance.
2462   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2463 
2464   return WaitStatesNeeded;
2465 }
2466 
2467 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2468   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2469   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2470     return 0;
2471 
2472   int WaitStatesNeeded = 0;
2473 
2474   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2475     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2476   };
2477 
2478   for (const MachineOperand &Op : MI->explicit_uses()) {
2479     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2480       continue;
2481 
2482     Register Reg = Op.getReg();
2483 
2484     const int AccVgprReadLdStWaitStates = 2;
2485     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2486     const int MaxWaitStates = 2;
2487 
2488     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2489       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2490     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2491 
2492     if (WaitStatesNeeded == MaxWaitStates)
2493       return WaitStatesNeeded; // Early exit.
2494 
2495     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2496       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2497           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2498         return false;
2499       auto IsVALUFn = [](const MachineInstr &MI) {
2500         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2501       };
2502       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2503              std::numeric_limits<int>::max();
2504     };
2505 
2506     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2507       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2508     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2509   }
2510 
2511   return WaitStatesNeeded;
2512 }
2513 
2514 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2515   // 2 pass -> 4
2516   // 4 pass -> 6
2517   // 8 pass -> 10
2518   // 16 pass -> 18
2519   return NumPasses + 2;
2520 }
2521 
2522 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2523   // 2 pass -> 5
2524   // 4 pass -> 7
2525   // 8 pass -> 11
2526   // 16 pass -> 19
2527   return NumPasses + 3;
2528 }
2529 
2530 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2531   // 2 pass -> 5
2532   // 4 pass -> 7
2533   // 8 pass -> 11
2534   // 16 pass -> 19
2535   return NumPasses + 3;
2536 }
2537 
2538 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2539   // 2 pass -> 4
2540   // 4 pass -> 6
2541   // 8 pass -> 10
2542   // 16 pass -> 18
2543   return NumPasses + 2;
2544 }
2545 
2546 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2547   if (!ST.hasGFX90AInsts())
2548     return 0;
2549 
2550   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2551     return isDGEMM(MI.getOpcode());
2552   };
2553 
2554   // This is checked in checkMAIHazards90A()
2555   if (SIInstrInfo::isMFMA(*MI))
2556     return 0;
2557 
2558   const MachineRegisterInfo &MRI = MF.getRegInfo();
2559 
2560   int WaitStatesNeeded = 0;
2561 
2562   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2563                SIInstrInfo::isFLAT(*MI) ||
2564                SIInstrInfo::isDS(*MI);
2565   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2566   bool IsVALU = SIInstrInfo::isVALU(*MI);
2567 
2568   const MachineInstr *MFMA = nullptr;
2569   unsigned Reg;
2570   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2571     if (!SIInstrInfo::isMFMA(MI) ||
2572         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2573       return false;
2574     MFMA = &MI;
2575     return true;
2576   };
2577 
2578   const MachineInstr *DOT = nullptr;
2579   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2580     if (!SIInstrInfo::isDOT(MI) ||
2581         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2582       return false;
2583     DOT = &MI;
2584     return true;
2585   };
2586 
2587   bool DGEMMAfterVALUWrite = false;
2588   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2589     // Found DGEMM on reverse traversal to def.
2590     if (isDGEMM(MI.getOpcode()))
2591       DGEMMAfterVALUWrite = true;
2592 
2593     // Only a hazard if the register is defined by a VALU and a DGEMM is
2594     // found after the def.
2595     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2596       return false;
2597 
2598     return true;
2599   };
2600 
2601   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2602                                            AMDGPU::OpName::src2);
2603 
2604   if (IsMemOrExport || IsVALU) {
2605     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2606     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2607     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2608     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2609     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2610     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2611     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2612     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2613     const int DotWriteSameDotReadSrcAB = 3;
2614     const int DotWriteDifferentVALURead = 3;
2615     const int DMFMABetweenVALUWriteVMEMRead = 2;
2616     const int MaxWaitStates = 19;
2617 
2618     for (const MachineOperand &Use : MI->explicit_uses()) {
2619       if (!Use.isReg())
2620         continue;
2621       Reg = Use.getReg();
2622 
2623       DOT = nullptr;
2624       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2625                                                      MaxWaitStates);
2626       if (DOT) {
2627         int NeedWaitStates = 0;
2628         if (DOT->getOpcode() == MI->getOpcode()) {
2629           if (&Use - &MI->getOperand(0) != SrcCIdx)
2630             NeedWaitStates = DotWriteSameDotReadSrcAB;
2631         } else {
2632           NeedWaitStates = DotWriteDifferentVALURead;
2633         }
2634 
2635         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2636         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2637       }
2638 
2639       // Workaround for a HW data hazard bug observed only on GFX90A. When a
2640       // DGEMM instruction sits in-between a VALU and a VMEM instruction, the SQ
2641       // incorrectly fails to insert the two wait states between those two
2642       // instructions that are needed to avoid the data hazard.
2643       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2644         DGEMMAfterVALUWrite = false;
2645         if (TRI.isVectorRegister(MRI, Reg)) {
2646           int WaitStatesNeededForUse =
2647                 DMFMABetweenVALUWriteVMEMRead -
2648                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2649                                       DMFMABetweenVALUWriteVMEMRead);
2650 
2651           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2652         }
2653       }
2654 
2655       MFMA = nullptr;
2656       WaitStatesSinceDef =
2657           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2658       if (!MFMA)
2659         continue;
2660 
2661       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2662       int NumPasses = HazardDefLatency;
2663       int NeedWaitStates = MaxWaitStates;
2664 
2665       if (isDGEMM(MFMA->getOpcode())) {
2666         switch (HazardDefLatency) {
2667         case 4:
2668           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2669                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2670           break;
2671         case 8:
2672         case 16:
2673           NeedWaitStates =
2674               IsMemOrExport
2675                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2676                   : (ST.hasGFX950Insts()
2677                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2678                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2679           break;
2680         default:
2681           llvm_unreachable("unexpected dgemm");
2682         }
2683       } else if (ST.hasGFX940Insts()) {
2684         NeedWaitStates =
2685             isXDL(ST, *MFMA)
2686                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2687                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2688                       NumPasses);
2689       } else {
2690         switch (HazardDefLatency) {
2691         case 2:
2692           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2693           break;
2694         case 8:
2695           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2696           break;
2697         case 16:
2698           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2699           break;
2700         default:
2701           llvm_unreachable("unexpected number of passes for mfma");
2702         }
2703       }
2704 
2705       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2706       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2707 
2708       if (WaitStatesNeeded == MaxWaitStates)
2709         break;
2710     }
2711   }
2712 
2713   unsigned Opc = MI->getOpcode();
2714   const int DMFMAToFMA64WaitStates = 2;
2715   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2716        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2717        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2718       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2719     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2720       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2721     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2722   }
2723 
2724   if (!IsVALU && !IsMemOrExport)
2725     return WaitStatesNeeded;
2726 
2727   for (const MachineOperand &Def : MI->defs()) {
2728     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2729     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2730     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2731     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2732     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2733     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2734     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2735     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2736     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2737     const int DotWriteDifferentVALUWrite = 3;
2738     const int MaxWaitStates = 19;
2739     const int MaxWarWaitStates = 15;
2740 
2741     Reg = Def.getReg();
2742 
2743     DOT = nullptr;
2744     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2745                                                    MaxWaitStates);
2746     if (DOT && DOT->getOpcode() != MI->getOpcode())
2747       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2748                                                     WaitStatesSinceDef);
2749 
2750     MFMA = nullptr;
2751     WaitStatesSinceDef =
2752         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2753     if (MFMA) {
2754       int NeedWaitStates = MaxWaitStates;
2755       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2756 
2757       if (isDGEMM(MFMA->getOpcode())) {
2758         switch (NumPasses) {
2759         case 4:
2760           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2761           break;
2762         case 8:
2763         case 16:
2764           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2765           break;
2766         default:
2767           llvm_unreachable("unexpected number of cycles for dgemm");
2768         }
2769       } else if (ST.hasGFX940Insts()) {
2770         NeedWaitStates =
2771             isXDL(ST, *MFMA)
2772                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2773                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2774       } else {
2775         switch (NumPasses) {
2776         case 2:
2777           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2778           break;
2779         case 8:
2780           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2781           break;
2782         case 16:
2783           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2784           break;
2785         default:
2786           llvm_unreachable("Unexpected number of passes for mfma");
2787         }
2788       }
2789 
2790       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2791       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2792 
2793       if (WaitStatesNeeded == MaxWaitStates)
2794         break;
2795     }
2796 
2797     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2798       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2799           !MI.readsRegister(Reg, &TRI))
2800         return false;
2801 
2802       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2803         return false;
2804 
2805       const MachineOperand *SrcC =
2806           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2807       assert(SrcC);
2808       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2809         return false;
2810 
2811       MFMA = &MI;
2812       return true;
2813     };
2814 
2815     MFMA = nullptr;
2816     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2817                                                 MaxWarWaitStates);
2818     if (!MFMA)
2819       continue;
2820 
2821     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2822     int NeedWaitStates = MaxWaitStates;
2823     switch (HazardDefLatency) {
2824     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2825              break;
2826     case 4:  assert(ST.hasGFX940Insts());
2827              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2828              break;
2829     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2830              break;
2831     case 16: [[fallthrough]];
2832     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2833              break;
2834     }
2835 
2836     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2837     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2838   }
2839 
2840   return WaitStatesNeeded;
2841 }
2842 
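// Scheduler hook: ask for another candidate when SU is an MFMA and a previous
// MFMA issued within the last 16 wait states has not yet covered its latency.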
2843 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2844   if (!SU->isInstr())
2845     return false;
2846 
2847   const MachineInstr *MAI = nullptr;
2848 
2849   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2850     MAI = nullptr;
2851     if (SIInstrInfo::isMFMA(MI))
2852       MAI = &MI;
2853     return MAI != nullptr;
2854   };
2855 
2856   MachineInstr *MI = SU->getInstr();
2857   if (IsMFMAFn(*MI)) {
2858     int W = getWaitStatesSince(IsMFMAFn, 16);
2859     if (MAI)
2860       return W < (int)TSchedModel.computeInstrLatency(MAI);
2861   }
2862 
2863   return false;
2864 }
2865 
2866 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2867 // insertion of a new instruction.
2868 static void updateGetPCBundle(MachineInstr *NewMI) {
2869   if (!NewMI->isBundled())
2870     return;
2871 
2872   // Find start of bundle.
2873   auto I = NewMI->getIterator();
2874   while (I->isBundledWithPred())
2875     I--;
2876   if (I->isBundle())
2877     I++;
2878 
2879   // Bail if this is not an S_GETPC bundle.
2880   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2881     return;
2882 
2883   // Update offsets of any references in the bundle.
2884   const unsigned NewBytes = 4;
2885   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2886          "Unexpected instruction insertion in bundle");
2887   auto NextMI = std::next(NewMI->getIterator());
2888   auto End = NewMI->getParent()->end();
2889   while (NextMI != End && NextMI->isBundledWithPred()) {
2890     for (auto &Operand : NextMI->operands()) {
2891       if (Operand.isGlobal())
2892         Operand.setOffset(Operand.getOffset() + NewBytes);
2893     }
2894     NextMI++;
2895   }
2896 }
2897 
2898 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2899   if (!ST.hasVALUMaskWriteHazard())
2900     return false;
2901   assert(!ST.hasExtendedWaitCounts());
2902 
2903   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2904     return false;
2905 
2906   // The hazard sequence is three instructions:
2907   //   1. VALU reads SGPR as mask
2908   //   2. SALU writes SGPR
2909   //   3. SALU reads SGPR
2910   // The hazard can expire if the distance between 2 and 3 is sufficient.
2911   // In practice this happens <10% of the time, hence this always assumes
2912   // the hazard exists if 1 and 2 are present to avoid searching.
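  //
  // Illustrative (hypothetical) sequence:
  //   v_cndmask_b32 v0, v1, v2, s[0:1]   ; (1) VALU reads s[0:1] as mask
  //   s_mov_b64 s[0:1], s[4:5]           ; (2) SALU writes s[0:1]  (MI)
  //   s_cmp_lg_u64 s[0:1], 0             ; (3) later SALU read
  // The fix appends an "s_waitcnt_depctr" wait with sa_sdst(0) right after (2).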
2913 
2914   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2915   if (!SDSTOp || !SDSTOp->isReg())
2916     return false;
2917 
2918   const Register HazardReg = SDSTOp->getReg();
2919   if (HazardReg == AMDGPU::EXEC ||
2920       HazardReg == AMDGPU::EXEC_LO ||
2921       HazardReg == AMDGPU::EXEC_HI ||
2922       HazardReg == AMDGPU::M0)
2923     return false;
2924 
2925   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2926     switch (I.getOpcode()) {
2927     case AMDGPU::V_ADDC_U32_e32:
2928     case AMDGPU::V_ADDC_U32_dpp:
2929     case AMDGPU::V_CNDMASK_B16_e32:
2930     case AMDGPU::V_CNDMASK_B16_dpp:
2931     case AMDGPU::V_CNDMASK_B32_e32:
2932     case AMDGPU::V_CNDMASK_B32_dpp:
2933     case AMDGPU::V_DIV_FMAS_F32_e64:
2934     case AMDGPU::V_DIV_FMAS_F64_e64:
2935     case AMDGPU::V_SUBB_U32_e32:
2936     case AMDGPU::V_SUBB_U32_dpp:
2937     case AMDGPU::V_SUBBREV_U32_e32:
2938     case AMDGPU::V_SUBBREV_U32_dpp:
2939       // These implicitly read VCC as mask source.
2940       return HazardReg == AMDGPU::VCC ||
2941              HazardReg == AMDGPU::VCC_LO ||
2942              HazardReg == AMDGPU::VCC_HI;
2943     case AMDGPU::V_ADDC_U32_e64:
2944     case AMDGPU::V_ADDC_U32_e64_dpp:
2945     case AMDGPU::V_CNDMASK_B16_e64:
2946     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2947     case AMDGPU::V_CNDMASK_B32_e64:
2948     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2949     case AMDGPU::V_SUBB_U32_e64:
2950     case AMDGPU::V_SUBB_U32_e64_dpp:
2951     case AMDGPU::V_SUBBREV_U32_e64:
2952     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2953       // Only check mask register overlaps.
2954       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2955       assert(SSRCOp);
2956       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2957     }
2958     default:
2959       return false;
2960     }
2961   };
2962 
2963   const MachineRegisterInfo &MRI = MF.getRegInfo();
2964   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2965     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2966     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2967         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2968       return true;
2969 
2970     // VALU access to any SGPR or literal constant other than HazardReg
2971     // mitigates hazard. No need to check HazardReg here as this will
2972     // only be called when !IsHazardFn.
2973     if (!SIInstrInfo::isVALU(I))
2974       return false;
2975     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2976       const MachineOperand &Op = I.getOperand(OpNo);
2977       if (Op.isReg()) {
2978         Register OpReg = Op.getReg();
2979         // Only consider uses
2980         if (!Op.isUse())
2981           continue;
2982         // Ignore EXEC
2983         if (OpReg == AMDGPU::EXEC ||
2984             OpReg == AMDGPU::EXEC_LO ||
2985             OpReg == AMDGPU::EXEC_HI)
2986           continue;
2987         // Ignore all implicit uses except VCC
2988         if (Op.isImplicit()) {
2989           if (OpReg == AMDGPU::VCC ||
2990               OpReg == AMDGPU::VCC_LO ||
2991               OpReg == AMDGPU::VCC_HI)
2992             return true;
2993           continue;
2994         }
2995         if (TRI.isSGPRReg(MRI, OpReg))
2996           return true;
2997       } else {
2998         const MCInstrDesc &InstDesc = I.getDesc();
2999         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3000         if (!TII.isInlineConstant(Op, OpInfo))
3001           return true;
3002       }
3003     }
3004     return false;
3005   };
3006 
3007   // Check for hazard
3008   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3009       std::numeric_limits<int>::max())
3010     return false;
3011 
3012   auto NextMI = std::next(MI->getIterator());
3013 
3014   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3015   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3016                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3017                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3018 
3019   // SALU write may be s_getpc in a bundle.
3020   updateGetPCBundle(NewMI);
3021 
3022   return true;
3023 }
3024 
3025 // Return the numeric ID 0-63 of a 64-bit SGPR pair for a given SGPR,
3026 // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
3027 static std::optional<unsigned> sgprPairNumber(Register Reg,
3028                                               const SIRegisterInfo &TRI) {
3029   switch (Reg) {
3030   case AMDGPU::M0:
3031   case AMDGPU::EXEC:
3032   case AMDGPU::EXEC_LO:
3033   case AMDGPU::EXEC_HI:
3034   case AMDGPU::SGPR_NULL:
3035   case AMDGPU::SGPR_NULL64:
3036     return {};
3037   default:
3038     break;
3039   }
3040   unsigned RegN = TRI.getEncodingValue(Reg);
3041   if (RegN > 127)
3042     return {};
3043   return (RegN >> 1) & 0x3f;
3044 }
3045 
3046 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3047 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3048   assert(MMF == &MF);
3049 
3050   // Assume non-empty vector means it has already been computed.
3051   if (!VALUReadHazardSGPRs.empty())
3052     return;
3053 
3054   auto CallingConv = MF.getFunction().getCallingConv();
3055   bool IsCallFree =
3056       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3057 
3058   // Exhaustive search is only viable in functions which neither make calls
3059   // nor are callees, where all VALUs are exposed to the hazard recognizer.
3060   UseVALUReadHazardExhaustiveSearch =
3061       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3062       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3063 
3064   // Consider all SGPRs as hazards if the shader makes calls or is a callee.
3065   bool UseVALUUseCache =
3066       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3067   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3068   if (!UseVALUUseCache)
3069     return;
3070 
3071   // Perform a post-order traversal, scanning each block in reverse, to find
3072   // VALUs which read an SGPR before a SALU write to the same SGPR.  Compared
3073   // to a linear scan, this reduces hazard insertion when all VALU accesses to
3074   // an SGPR occur after its last SALU write.
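  // Worked example (instructions are illustrative only): for a block containing
  //   v_add_f32 v0, s0, v1   ; VALU reads s0
  //   s_mov_b32 s0, s2       ; SALU writes s0
  //   s_add_u32 s3, s0, s1   ; reads s0
  // the reverse scan visits the bottom two instructions first, so when the VALU
  // read is reached both ReadSGPRs and SALUWriteSGPRs are set for the s0_s1
  // pair and the pair is marked hazardous.  If the VALU read instead followed
  // the last SALU write of the pair, no hazard bit would be set.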
3075   const MachineRegisterInfo &MRI = MF.getRegInfo();
3076   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3077   MachineCycleInfo CI;
3078   CI.compute(*MMF);
3079 
3080   for (auto *MBB : post_order(&MF)) {
3081     bool InCycle = CI.getCycle(MBB) != nullptr;
3082     for (auto &MI : reverse(MBB->instrs())) {
3083       bool IsVALU = SIInstrInfo::isVALU(MI);
3084       bool IsSALU = SIInstrInfo::isSALU(MI);
3085       if (!IsVALU && !IsSALU)
3086         continue;
3087 
3088       for (const MachineOperand &Op : MI.operands()) {
3089         if (!Op.isReg())
3090           continue;
3091         Register Reg = Op.getReg();
3092         assert(!Op.getSubReg());
3093         // Only consider implicit operands of VCC.
3094         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3095                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3096           continue;
3097         if (!TRI.isSGPRReg(MRI, Reg))
3098           continue;
3099         auto RegN = sgprPairNumber(Reg, TRI);
3100         if (!RegN)
3101           continue;
3102         if (IsVALU && Op.isUse()) {
3103           // Note: any access within a cycle must be considered a hazard.
3104           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3105             VALUReadHazardSGPRs.set(*RegN);
3106           ReadSGPRs.set(*RegN);
3107         } else if (IsSALU) {
3108           if (Op.isDef())
3109             SALUWriteSGPRs.set(*RegN);
3110           else
3111             ReadSGPRs.set(*RegN);
3112         }
3113       }
3114     }
3115   }
3116 }
3117 
3118 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3119   if (!ST.hasVALUReadSGPRHazard())
3120     return false;
3121 
3122   // The hazard sequence is fundamentally three instructions:
3123   //   1. VALU reads SGPR
3124   //   2. SALU writes SGPR
3125   //   3. VALU/SALU reads SGPR
3126   // Try to avoid searching for (1) because the expiry point of the hazard is
3127   // indeterminate; however, the hazard between (2) and (3) can expire if the
3128   // gap contains enough SALU instructions which do not use the SGPR from (1).
3129   // Note: SGPRs must be considered as 64-bit pairs since the hazard exists
3130   // even if only individual SGPRs are accessed.
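  // Purely illustrative sketch (registers chosen for exposition only):
  //   v_add_f32 v0, s4, v1   ; (1) VALU reads s4
  //   s_mov_b32 s4, s6       ; (2) SALU writes s4
  //   s_add_u32 s5, s4, s7   ; (3) SALU reads s4  <- MI
  // The fix inserts "s_wait_alu sa_sdst(0)" immediately before (3) unless the
  // hazard has already expired or been mitigated on every path to (3).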
3131 
3132   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3133   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3134   if (!(MIIsSALU || MIIsVALU))
3135     return false;
3136 
3137   // When compile time is the priority, avoid the expensive search by simply
3138   // mitigating every SALU which writes an SGPR.
3139   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3140     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3141       return false;
3142 
3143     const MachineOperand *SDSTOp =
3144         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3145     if (!SDSTOp || !SDSTOp->isReg())
3146       return false;
3147 
3148     const Register HazardReg = SDSTOp->getReg();
3149     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3150         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3151       return false;
3152 
3153     // Add s_wait_alu sa_sdst(0) after SALU write.
3154     auto NextMI = std::next(MI->getIterator());
3155     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3156                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3157                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3158 
3159     // SALU write may be s_getpc in a bundle.
3160     updateGetPCBundle(NewMI);
3161 
3162     return true;
3163   }
3164 
3165   // Pre-compute set of SGPR pairs read by VALUs.
3166   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3167   computeVALUHazardSGPRs(MI->getMF());
3168 
3169   // If no VALU hazard SGPRs exist then there is nothing to do.
3170   if (VALUReadHazardSGPRs.none())
3171     return false;
3172 
3173   // All SGPR writes before a call/return must be flushed as the callee/caller
3174   // will not see the hazard chain, i.e. (2) to (3) described above.
3175   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3176                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3177                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3178 
3179   // Collect all SGPR sources for MI which are read by a VALU.
3180   const MachineRegisterInfo &MRI = MF.getRegInfo();
3181   SmallSet<Register, 4> SGPRsUsed;
3182 
3183   if (!IsSetPC) {
3184     for (const MachineOperand &Op : MI->all_uses()) {
3185       Register OpReg = Op.getReg();
3186 
3187       // Only consider VCC implicit uses on VALUs.
3188       // The only expected SALU implicit access is SCC, which is not a hazard.
3189       if (MIIsSALU && Op.isImplicit())
3190         continue;
3191 
3192       if (!TRI.isSGPRReg(MRI, OpReg))
3193         continue;
3194 
3195       auto RegN = sgprPairNumber(OpReg, TRI);
3196       if (!RegN)
3197         continue;
3198 
3199       if (!VALUReadHazardSGPRs[*RegN])
3200         continue;
3201 
3202       SGPRsUsed.insert(OpReg);
3203     }
3204 
3205     // No SGPRs -> nothing to do.
3206     if (SGPRsUsed.empty())
3207       return false;
3208   }
3209 
3210   // A hazard is any SALU which writes one of the SGPRs read by MI.
3211   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3212     if (!SIInstrInfo::isSALU(I))
3213       return false;
3214     // Ensure SGPR flush before call/return by conservatively assuming every
3215     // SALU writes an SGPR.
3216     if (IsSetPC && I.getNumDefs() > 0)
3217       return true;
3218     // Check for any register writes.
3219     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3220       return I.modifiesRegister(Reg, &TRI);
3221     });
3222   };
3223 
3224   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3225   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3226     if (Count >= SALUExpiryCount)
3227       return true;
3228     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3229     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3230         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3231       return true;
3232     return false;
3233   };
3234 
3235   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3236     // Only count true SALUs as wait states.
3237     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3238       return 0;
3239     // SALU must be unrelated to any hazard registers.
3240     if (any_of(SGPRsUsed,
3241                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3242       return 0;
3243     return 1;
3244   };
3245 
3246   // Check for the hazard.
3247   DenseSet<const MachineBasicBlock *> Visited;
3248   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3249                                         std::next(MI->getReverseIterator()), 0,
3250                                         IsExpiredFn, Visited, WaitStatesFn);
3251 
3252   if (WaitStates >= SALUExpiryCount)
3253     return false;
3254 
3255   // Validate hazard through an exhaustive search.
3256   if (UseVALUReadHazardExhaustiveSearch) {
3257     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3258     // This is searching for (1) in the hazard description.
3259     auto hazardPair = [this](Register Reg) {
3260       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3261         return Register(AMDGPU::VCC);
3262       auto RegN = sgprPairNumber(Reg, TRI);
3263       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3264     };
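    // Note: hazardPair maps an SGPR onto its enclosing 64-bit pair (or VCC),
    // so readsRegister also matches a VALU which reads only one half of the
    // pair, since overlap is checked when TRI is supplied.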
3265     auto SearchHazardFn = [this, hazardPair,
3266                            &SGPRsUsed](const MachineInstr &I) {
3267       if (!SIInstrInfo::isVALU(I))
3268         return false;
3269       // Check for any register reads.
3270       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3271         return I.readsRegister(hazardPair(Reg), &TRI);
3272       });
3273     };
3274     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3275       return false;
3276     };
3277     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3278         std::numeric_limits<int>::max())
3279       return false;
3280   }
3281 
3282   // Add s_wait_alu sa_sdst(0) before SALU read.
3283   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3284                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3285                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3286 
3287   // SALU read may be after s_getpc in a bundle.
3288   updateGetPCBundle(NewMI);
3289 
3290   return true;
3291 }
3292 
3293 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3294                                const SIInstrInfo &TII) {
3295   MachineBasicBlock &EntryMBB = MF->front();
3296   if (EntryMBB.begin() != EntryMBB.end()) {
3297     auto &EntryMI = *EntryMBB.begin();
3298     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3299         EntryMI.getOperand(0).getImm() >= Priority)
3300       return false;
3301   }
3302 
3303   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3304       .addImm(Priority);
3305   return true;
3306 }
3307 
3308 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3309   if (!ST.hasRequiredExportPriority())
3310     return false;
3311 
3312   // Assume the following shader types will never have exports,
3313   // and avoid adding or adjusting S_SETPRIO.
3314   MachineBasicBlock *MBB = MI->getParent();
3315   MachineFunction *MF = MBB->getParent();
3316   auto CC = MF->getFunction().getCallingConv();
3317   switch (CC) {
3318   case CallingConv::AMDGPU_CS:
3319   case CallingConv::AMDGPU_CS_Chain:
3320   case CallingConv::AMDGPU_CS_ChainPreserve:
3321   case CallingConv::AMDGPU_KERNEL:
3322     return false;
3323   default:
3324     break;
3325   }
3326 
3327   const int MaxPriority = 3;
3328   const int NormalPriority = 2;
3329   const int PostExportPriority = 0;
3330 
3331   auto It = MI->getIterator();
3332   switch (MI->getOpcode()) {
3333   case AMDGPU::S_ENDPGM:
3334   case AMDGPU::S_ENDPGM_SAVED:
3335   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3336   case AMDGPU::SI_RETURN_TO_EPILOG:
3337     // Ensure a shader with calls raises priority at entry.
3338     // This guarantees correct priority if exports exist in a callee.
3339     if (MF->getFrameInfo().hasCalls())
3340       return ensureEntrySetPrio(MF, NormalPriority, TII);
3341     return false;
3342   case AMDGPU::S_SETPRIO: {
3343     // Raise a below-normal priority unless this setprio is part of the workaround.
3344     auto &PrioOp = MI->getOperand(0);
3345     int Prio = PrioOp.getImm();
3346     bool InWA = (Prio == PostExportPriority) &&
3347                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3348     if (InWA || Prio >= NormalPriority)
3349       return false;
3350     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3351     return true;
3352   }
3353   default:
3354     if (!TII.isEXP(*MI))
3355       return false;
3356     break;
3357   }
3358 
3359   // Check entry priority at each export (as there will only be a few).
3360   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3361   bool Changed = false;
3362   if (CC != CallingConv::AMDGPU_Gfx)
3363     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3364 
3365   auto NextMI = std::next(It);
3366   bool EndOfShader = false;
3367   if (NextMI != MBB->end()) {
3368     // Only need WA at end of sequence of exports.
3369     if (TII.isEXP(*NextMI))
3370       return Changed;
3371     // Assume appropriate S_SETPRIO after export means WA already applied.
3372     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3373         NextMI->getOperand(0).getImm() == PostExportPriority)
3374       return Changed;
3375     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3376   }
3377 
3378   const DebugLoc &DL = MI->getDebugLoc();
3379 
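  // With the priorities defined above, the sequence emitted below is roughly:
  //   s_setprio 0
  //   s_waitcnt_expcnt null, 0
  //   s_nop 0
  //   s_nop 0
  //   s_setprio 2
  // where the expcnt wait and the final s_setprio are omitted when the shader
  // ends immediately after the exports.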
3380   // Lower priority.
3381   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3382       .addImm(PostExportPriority);
3383 
3384   if (!EndOfShader) {
3385     // Wait for exports to complete.
3386     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3387         .addReg(AMDGPU::SGPR_NULL)
3388         .addImm(0);
3389   }
3390 
3391   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3392   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3393 
3394   if (!EndOfShader) {
3395     // Return to normal (higher) priority.
3396     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3397         .addImm(NormalPriority);
3398   }
3399 
3400   return true;
3401 }
3402