1 //===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/ScheduleDAG.h"
21 #include "llvm/TargetParser/TargetParser.h"
22 
23 using namespace llvm;
24 
25 namespace {
26 
27 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29 
30   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31     if (Arg.getAsInteger(0, Value))
32       return O.error("'" + Arg + "' value invalid for uint argument!");
33 
34     if (Value > 100)
35       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36 
37     return false;
38   }
39 };
40 
41 } // end anonymous namespace
42 
43 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45                      cl::desc("Fill a percentage of the latency between "
46                               "neighboring MFMA with s_nops."));
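// For example (illustrative): with -amdgpu-mfma-padding-ratio=50 and an
// MFMA-to-MFMA latency of 16 wait states, roughly 8 wait states' worth of
// s_nop padding would be inserted between the two MFMAs.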
47 
48 static cl::opt<unsigned> MaxExhaustiveHazardSearch(
49     "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50     cl::desc("Maximum function size for exhaustive hazard search"));
51 
52 //===----------------------------------------------------------------------===//
53 // Hazard Recognizer Implementation
54 //===----------------------------------------------------------------------===//
55 
56 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57                                                  const GCNSubtarget &ST);
58 
59 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60     : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61       ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62       TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63       UseVALUReadHazardExhaustiveSearch(false),
64       ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
65   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67 }
68 
69 void GCNHazardRecognizer::Reset() {
70   EmittedInstrs.clear();
71 }
72 
73 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74   EmitInstruction(SU->getInstr());
75 }
76 
77 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78   CurrCycleInstr = MI;
79 }
80 
81 static bool isDivFMas(unsigned Opcode) {
82   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83 }
84 
85 static bool isSGetReg(unsigned Opcode) {
86   return Opcode == AMDGPU::S_GETREG_B32;
87 }
88 
89 static bool isSSetReg(unsigned Opcode) {
90   switch (Opcode) {
91   case AMDGPU::S_SETREG_B32:
92   case AMDGPU::S_SETREG_B32_mode:
93   case AMDGPU::S_SETREG_IMM32_B32:
94   case AMDGPU::S_SETREG_IMM32_B32_mode:
95     return true;
96   }
97   return false;
98 }
99 
100 static bool isRWLane(unsigned Opcode) {
101   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102 }
103 
104 static bool isRFE(unsigned Opcode) {
105   return Opcode == AMDGPU::S_RFE_B64;
106 }
107 
108 static bool isSMovRel(unsigned Opcode) {
109   switch (Opcode) {
110   case AMDGPU::S_MOVRELS_B32:
111   case AMDGPU::S_MOVRELS_B64:
112   case AMDGPU::S_MOVRELD_B32:
113   case AMDGPU::S_MOVRELD_B64:
114     return true;
115   default:
116     return false;
117   }
118 }
119 
120 static bool isDGEMM(unsigned Opcode) {
121   return AMDGPU::getMAIIsDGEMM(Opcode);
122 }
123 
124 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125   unsigned Opcode = MI.getOpcode();
126 
127   if (!SIInstrInfo::isMAI(MI) ||
128       isDGEMM(Opcode) ||
129       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131     return false;
132 
133   if (!ST.hasGFX940Insts())
134     return true;
135 
136   return AMDGPU::getMAIIsGFX940XDL(Opcode);
137 }
138 
139 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140                                     const MachineInstr &MI) {
141   if (TII.isAlwaysGDS(MI.getOpcode()))
142     return true;
143 
144   switch (MI.getOpcode()) {
145   case AMDGPU::S_SENDMSG:
146   case AMDGPU::S_SENDMSGHALT:
147   case AMDGPU::S_TTRACEDATA:
148     return true;
149   // These DS opcodes don't support GDS.
150   case AMDGPU::DS_NOP:
151   case AMDGPU::DS_PERMUTE_B32:
152   case AMDGPU::DS_BPERMUTE_B32:
153     return false;
154   default:
155     if (TII.isDS(MI.getOpcode())) {
156       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157                                            AMDGPU::OpName::gds);
158       if (MI.getOperand(GDS).getImm())
159         return true;
160     }
161     return false;
162   }
163 }
164 
165 static bool isPermlane(const MachineInstr &MI) {
166   unsigned Opcode = MI.getOpcode();
167   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168          Opcode == AMDGPU::V_PERMLANE64_B32 ||
169          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
172          Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
173          Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
174          Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
175          Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
176 }
177 
178 static bool isLdsDma(const MachineInstr &MI) {
179   return SIInstrInfo::isVALU(MI) &&
180          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
181 }
182 
183 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185                                                      AMDGPU::OpName::simm16);
186   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187 }
188 
189 ScheduleHazardRecognizer::HazardType
190 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
191   MachineInstr *MI = SU->getInstr();
192   // If we are not in "HazardRecognizerMode" and therefore not being run from
193   // the scheduler, track possible stalls from hazards but don't insert noops.
194   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195 
196   if (MI->isBundle())
197    return NoHazard;
198 
199   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
200     return HazardType;
201 
202   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
203     return HazardType;
204 
205   if (checkFPAtomicToDenormModeHazard(MI) > 0)
206     return HazardType;
207 
208   if (ST.hasNoDataDepHazard())
209     return NoHazard;
210 
211   // FIXME: Should flat be considered vmem?
212   if ((SIInstrInfo::isVMEM(*MI) ||
213        SIInstrInfo::isFLAT(*MI))
214       && checkVMEMHazards(MI) > 0)
215     return HazardType;
216 
217   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
218     return HazardType;
219 
220   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
221     return HazardType;
222 
223   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
224     return HazardType;
225 
226   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
227     return HazardType;
228 
229   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
230        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
231        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
232     return HazardType;
233 
234   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
235     return HazardType;
236 
237   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
238     return HazardType;
239 
240   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
241     return HazardType;
242 
243   if (((ST.hasReadM0MovRelInterpHazard() &&
244         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
245          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
246          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
247        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
248        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
249        (ST.hasReadM0LdsDirectHazard() &&
250         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
251       checkReadM0Hazards(MI) > 0)
252     return HazardType;
253 
254   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
255     return HazardType;
256 
257   if ((SIInstrInfo::isVMEM(*MI) ||
258        SIInstrInfo::isFLAT(*MI) ||
259        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
260     return HazardType;
261 
262   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
263     return HazardType;
264 
265   return NoHazard;
266 }
267 
268 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
269                                 unsigned Quantity) {
270   while (Quantity > 0) {
271     unsigned Arg = std::min(Quantity, 8u);
272     Quantity -= Arg;
273     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
274         .addImm(Arg - 1);
275   }
276 }
277 
278 unsigned
279 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
280   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
281   assert(TSchedModel.getWriteProcResBegin(SC) !=
282          TSchedModel.getWriteProcResEnd(SC));
283   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
284 }
285 
286 void GCNHazardRecognizer::processBundle() {
287   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
288   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
289   // Check bundled MachineInstrs for hazards.
290   for (; MI != E && MI->isInsideBundle(); ++MI) {
291     CurrCycleInstr = &*MI;
292     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
293 
294     if (IsHazardRecognizerMode) {
295       fixHazards(CurrCycleInstr);
296 
297       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
298     }
299 
300     // It's unnecessary to track more than MaxLookAhead instructions. Since we
301     // include the bundled MI directly after, only add a maximum of
302     // (MaxLookAhead - 1) noops to EmittedInstrs.
303     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
304       EmittedInstrs.push_front(nullptr);
305 
306     EmittedInstrs.push_front(CurrCycleInstr);
307     EmittedInstrs.resize(MaxLookAhead);
308   }
309   CurrCycleInstr = nullptr;
310 }
311 
312 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
313   assert(IsHazardRecognizerMode);
314 
315   unsigned NumPreNoops = PreEmitNoops(MI);
316   EmitNoops(NumPreNoops);
317   if (MI->isInsideBundle())
318     insertNoopsInBundle(MI, TII, NumPreNoops);
319   else
320     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
321                     NumPreNoops);
322   EmitInstruction(MI);
323   AdvanceCycle();
324 }
325 
326 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
327   IsHazardRecognizerMode = true;
328   CurrCycleInstr = MI;
329   unsigned W = PreEmitNoopsCommon(MI);
330   fixHazards(MI);
331   CurrCycleInstr = nullptr;
332   return W;
333 }
334 
335 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
336   if (MI->isBundle())
337     return 0;
338 
339   int WaitStates = 0;
340 
341   if (SIInstrInfo::isSMRD(*MI))
342     return std::max(WaitStates, checkSMRDHazards(MI));
343 
344   if (ST.hasNSAtoVMEMBug())
345     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
346 
347   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
348 
349   if (ST.hasNoDataDepHazard())
350     return WaitStates;
351 
352   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
353     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
354 
355   if (SIInstrInfo::isVALU(*MI))
356     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
357 
358   if (SIInstrInfo::isDPP(*MI))
359     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
360 
361   if (isDivFMas(MI->getOpcode()))
362     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
363 
364   if (isRWLane(MI->getOpcode()))
365     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
366 
367   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
368        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
369        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
370     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
371 
372   if (MI->isInlineAsm())
373     return std::max(WaitStates, checkInlineAsmHazards(MI));
374 
375   if (isSGetReg(MI->getOpcode()))
376     return std::max(WaitStates, checkGetRegHazards(MI));
377 
378   if (isSSetReg(MI->getOpcode()))
379     return std::max(WaitStates, checkSetRegHazards(MI));
380 
381   if (isRFE(MI->getOpcode()))
382     return std::max(WaitStates, checkRFEHazards(MI));
383 
384   if ((ST.hasReadM0MovRelInterpHazard() &&
385        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
386         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
387         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
388       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
389       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
390       (ST.hasReadM0LdsDirectHazard() &&
391        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
392     return std::max(WaitStates, checkReadM0Hazards(MI));
393 
394   if (SIInstrInfo::isMAI(*MI))
395     return std::max(WaitStates, checkMAIHazards(MI));
396 
397   if (SIInstrInfo::isVMEM(*MI) ||
398       SIInstrInfo::isFLAT(*MI) ||
399       SIInstrInfo::isDS(*MI))
400     return std::max(WaitStates, checkMAILdStHazards(MI));
401 
402   if (ST.hasGFX950Insts() && isPermlane(*MI))
403     return std::max(WaitStates, checkPermlaneHazards(MI));
404 
405   return WaitStates;
406 }
407 
408 void GCNHazardRecognizer::EmitNoop() {
409   EmittedInstrs.push_front(nullptr);
410 }
411 
412 void GCNHazardRecognizer::AdvanceCycle() {
413   // When the scheduler detects a stall, it will call AdvanceCycle() without
414   // emitting any instructions.
415   if (!CurrCycleInstr) {
416     EmittedInstrs.push_front(nullptr);
417     return;
418   }
419 
420   if (CurrCycleInstr->isBundle()) {
421     processBundle();
422     return;
423   }
424 
425   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426   if (!NumWaitStates) {
427     CurrCycleInstr = nullptr;
428     return;
429   }
430 
431   // Keep track of emitted instructions
432   EmittedInstrs.push_front(CurrCycleInstr);
433 
434   // Add a nullptr for each additional wait state after the first.  Make sure
435   // not to add more than getMaxLookAhead() items to the list, since we
436   // truncate the list to that size right after this loop.
437   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
438        i < e; ++i) {
439     EmittedInstrs.push_front(nullptr);
440   }
441 
442   // getMaxLookAhead() is the largest number of wait states we will ever need
443   // to insert, so there is no point in keeping track of more than that many
444   // wait states.
445   EmittedInstrs.resize(getMaxLookAhead());
446 
447   CurrCycleInstr = nullptr;
448 }
449 
450 void GCNHazardRecognizer::RecedeCycle() {
451   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
452 }
453 
454 //===----------------------------------------------------------------------===//
455 // Helper Functions
456 //===----------------------------------------------------------------------===//
457 
458 using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
459 
460 using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
461 using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
462 
463 // Search for a hazard in a block and its predecessors.
464 template <typename StateT>
465 static bool
466 hasHazard(StateT State,
467           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
468           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
469           const MachineBasicBlock *MBB,
470           MachineBasicBlock::const_reverse_instr_iterator I,
471           DenseSet<const MachineBasicBlock *> &Visited) {
472   for (auto E = MBB->instr_rend(); I != E; ++I) {
473     // No need to look at parent BUNDLE instructions.
474     if (I->isBundle())
475       continue;
476 
477     switch (IsHazard(State, *I)) {
478     case HazardFound:
479       return true;
480     case HazardExpired:
481       return false;
482     default:
483       // Continue search
484       break;
485     }
486 
487     if (I->isInlineAsm() || I->isMetaInstruction())
488       continue;
489 
490     UpdateState(State, *I);
491   }
492 
493   for (MachineBasicBlock *Pred : MBB->predecessors()) {
494     if (!Visited.insert(Pred).second)
495       continue;
496 
497     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
498                   Visited))
499       return true;
500   }
501 
502   return false;
503 }
504 
505 // Returns the minimum number of wait states since \p I, walking all
506 // predecessors. Only scans until \p IsExpired returns true.
507 // Can only be run in hazard recognizer mode.
508 static int getWaitStatesSince(
509     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
510     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
511     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
512     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
513   for (auto E = MBB->instr_rend(); I != E; ++I) {
514     // Don't add WaitStates for parent BUNDLE instructions.
515     if (I->isBundle())
516       continue;
517 
518     if (IsHazard(*I))
519       return WaitStates;
520 
521     if (I->isInlineAsm())
522       continue;
523 
524     WaitStates += GetNumWaitStates(*I);
525 
526     if (IsExpired(*I, WaitStates))
527       return std::numeric_limits<int>::max();
528   }
529 
530   int MinWaitStates = std::numeric_limits<int>::max();
531   for (MachineBasicBlock *Pred : MBB->predecessors()) {
532     if (!Visited.insert(Pred).second)
533       continue;
534 
535     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
536                                IsExpired, Visited, GetNumWaitStates);
537 
538     MinWaitStates = std::min(MinWaitStates, W);
539   }
540 
541   return MinWaitStates;
542 }
543 
544 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
545                               const MachineInstr *MI, IsExpiredFn IsExpired) {
546   DenseSet<const MachineBasicBlock *> Visited;
547   return getWaitStatesSince(IsHazard, MI->getParent(),
548                             std::next(MI->getReverseIterator()),
549                             0, IsExpired, Visited);
550 }
551 
552 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
553   if (IsHazardRecognizerMode) {
554     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
555       return WaitStates >= Limit;
556     };
557     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
558   }
559 
560   int WaitStates = 0;
561   for (MachineInstr *MI : EmittedInstrs) {
562     if (MI) {
563       if (IsHazard(*MI))
564         return WaitStates;
565 
566       if (MI->isInlineAsm())
567         continue;
568     }
569     ++WaitStates;
570 
571     if (WaitStates >= Limit)
572       break;
573   }
574   return std::numeric_limits<int>::max();
575 }
576 
577 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
578                                                IsHazardFn IsHazardDef,
579                                                int Limit) {
580   const SIRegisterInfo *TRI = ST.getRegisterInfo();
581 
582   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
583     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
584   };
585 
586   return getWaitStatesSince(IsHazardFn, Limit);
587 }
588 
589 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
590                                                   int Limit) {
591   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
592     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
593   };
594 
595   return getWaitStatesSince(IsHazardFn, Limit);
596 }
597 
598 //===----------------------------------------------------------------------===//
599 // No-op Hazard Detection
600 //===----------------------------------------------------------------------===//
601 
602 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
603                         MCRegister Reg) {
604   for (MCRegUnit Unit : TRI.regunits(Reg))
605     BV.set(Unit);
606 }
607 
608 static void addRegsToSet(const SIRegisterInfo &TRI,
609                          iterator_range<MachineInstr::const_mop_iterator> Ops,
610                          BitVector &DefSet, BitVector &UseSet) {
611   for (const MachineOperand &Op : Ops) {
612     if (Op.isReg())
613       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
614   }
615 }
616 
617 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
618   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
619 }
620 
621 static bool breaksSMEMSoftClause(MachineInstr *MI) {
622   return !SIInstrInfo::isSMRD(*MI);
623 }
624 
625 static bool breaksVMEMSoftClause(MachineInstr *MI) {
626   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
627 }
628 
629 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
630   // SMEM soft clauses are only present on VI+, and only matter if XNACK is
631   // enabled.
632   if (!ST.isXNACKEnabled())
633     return 0;
634 
635   bool IsSMRD = TII.isSMRD(*MEM);
636 
637   resetClause();
638 
639   // A soft-clause is any group of consecutive SMEM instructions.  The
640   // instructions in this group may return out of order and/or may be
641   // replayed (i.e. the same instruction issued more than once).
642   //
643   // In order to handle these situations correctly we need to make sure that
644   // when a clause has more than one instruction, no instruction in the clause
645   // writes to a register that is read by another instruction in the clause
646   // (including itself). If we encounter this situation, we need to break the
647   // clause by inserting a non-SMEM instruction.
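  //
  // For example (illustrative):
  //   s_load_dwordx2 s[0:1], s[4:5], 0x0
  //   s_load_dword   s2, s[0:1], 0x0   ; reads s[0:1] defined earlier in the
  //                                    ; clause -> the clause must be broken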
648 
649   for (MachineInstr *MI : EmittedInstrs) {
650     // When we hit a non-SMEM instruction then we have passed the start of the
651     // clause and we can stop.
652     if (!MI)
653       break;
654 
655     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
656       break;
657 
658     addClauseInst(*MI);
659   }
660 
661   if (ClauseDefs.none())
662     return 0;
663 
664   // We need to make sure not to put loads and stores in the same clause if they
665   // use the same address. For now, just start a new clause whenever we see a
666   // store.
667   if (MEM->mayStore())
668     return 1;
669 
670   addClauseInst(*MEM);
671 
672   // If the set of defs and uses intersect then we cannot add this instruction
673   // to the clause, so we have a hazard.
674   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
675 }
676 
677 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
678   int WaitStatesNeeded = 0;
679 
680   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
681 
682   // This SMRD hazard only affects SI.
683   if (!ST.hasSMRDReadVALUDefHazard())
684     return WaitStatesNeeded;
685 
686   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
687   // SGPR was written by a VALU instruction.
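  // For example (illustrative):
  //   v_readfirstlane_b32 s4, v0           ; VALU writes an SGPR
  //   s_buffer_load_dword s5, s[0:3], s4   ; SMRD reads s4 -> 4 wait states needed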
688   int SmrdSgprWaitStates = 4;
689   auto IsHazardDefFn = [this](const MachineInstr &MI) {
690     return TII.isVALU(MI);
691   };
692   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
693     return TII.isSALU(MI);
694   };
695 
696   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
697 
698   for (const MachineOperand &Use : SMRD->uses()) {
699     if (!Use.isReg())
700       continue;
701     int WaitStatesNeededForUse =
702         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
703                                                    SmrdSgprWaitStates);
704     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
705 
706     // This fixes what appears to be undocumented hardware behavior in SI where
707     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
708     // need some number of nops in between. We don't know how many we need, but
709     // let's use 4. This wasn't discovered before probably because the only
710     // case when this happens is when we expand a 64-bit pointer into a full
711     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
712     // probably never encountered in the closed-source land.
713     if (IsBufferSMRD) {
714       int WaitStatesNeededForUse =
715         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
716                                                    IsBufferHazardDefFn,
717                                                    SmrdSgprWaitStates);
718       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
719     }
720   }
721 
722   return WaitStatesNeeded;
723 }
724 
725 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
726   if (!ST.hasVMEMReadSGPRVALUDefHazard())
727     return 0;
728 
729   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
730 
731   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
732   // SGPR was written by a VALU instruction.
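  // For example (illustrative):
  //   v_readfirstlane_b32 s4, v0               ; VALU writes an SGPR
  //   buffer_load_dword   v1, off, s[0:3], s4  ; VMEM reads s4 -> 5 wait states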
733   const int VmemSgprWaitStates = 5;
734   auto IsHazardDefFn = [this](const MachineInstr &MI) {
735     return TII.isVALU(MI);
736   };
737   for (const MachineOperand &Use : VMEM->uses()) {
738     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
739       continue;
740 
741     int WaitStatesNeededForUse =
742         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
743                                                    VmemSgprWaitStates);
744     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
745   }
746   return WaitStatesNeeded;
747 }
748 
749 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
750   const SIRegisterInfo *TRI = ST.getRegisterInfo();
751   const SIInstrInfo *TII = ST.getInstrInfo();
752 
753   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
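  // For example (illustrative):
  //   v_add_f32     v1, v2, v3                     ; VALU writes v1
  //   v_mov_b32_dpp v4, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
  //                                                ; DPP reads v1 -> 2 wait states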
754   int DppVgprWaitStates = 2;
755   int DppExecWaitStates = 5;
756   int WaitStatesNeeded = 0;
757   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
758     return TII->isVALU(MI);
759   };
760 
761   for (const MachineOperand &Use : DPP->uses()) {
762     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
763       continue;
764     int WaitStatesNeededForUse =
765         DppVgprWaitStates - getWaitStatesSinceDef(
766                                 Use.getReg(),
767                                 [](const MachineInstr &) { return true; },
768                                 DppVgprWaitStates);
769     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
770   }
771 
772   WaitStatesNeeded = std::max(
773       WaitStatesNeeded,
774       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
775                                                 DppExecWaitStates));
776 
777   return WaitStatesNeeded;
778 }
779 
780 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
781   const SIInstrInfo *TII = ST.getInstrInfo();
782 
783   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
784   // instruction.
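  // For example (illustrative):
  //   v_cmp_gt_f32   vcc, v0, v1      ; VALU writes VCC
  //   v_div_fmas_f32 v2, v3, v4, v5   ; implicitly reads VCC -> 4 wait states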
785   const int DivFMasWaitStates = 4;
786   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
787     return TII->isVALU(MI);
788   };
789   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
790                                                DivFMasWaitStates);
791 
792   return DivFMasWaitStates - WaitStatesNeeded;
793 }
794 
795 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
796   const SIInstrInfo *TII = ST.getInstrInfo();
797   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
798 
799   const int GetRegWaitStates = 2;
800   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
801     return GetRegHWReg == getHWReg(TII, MI);
802   };
803   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
804 
805   return GetRegWaitStates - WaitStatesNeeded;
806 }
807 
808 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
809   const SIInstrInfo *TII = ST.getInstrInfo();
810   unsigned HWReg = getHWReg(TII, *SetRegInstr);
811 
812   const int SetRegWaitStates = ST.getSetRegWaitStates();
813   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
814     return HWReg == getHWReg(TII, MI);
815   };
816   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
817   return SetRegWaitStates - WaitStatesNeeded;
818 }
819 
820 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
821   if (!MI.mayStore())
822     return -1;
823 
824   const SIInstrInfo *TII = ST.getInstrInfo();
825   unsigned Opcode = MI.getOpcode();
826   const MCInstrDesc &Desc = MI.getDesc();
827 
828   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
829   int VDataRCID = -1;
830   if (VDataIdx != -1)
831     VDataRCID = Desc.operands()[VDataIdx].RegClass;
832 
833   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
834     // There is no hazard if the instruction does not use vector regs
835     // (like wbinvl1)
836     if (VDataIdx == -1)
837       return -1;
838     // For MUBUF/MTBUF instructions this hazard only exists if the
839     // instruction is not using a register in the soffset field.
840     const MachineOperand *SOffset =
841         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
842     // If we have no soffset operand, then assume this field has been
843     // hardcoded to zero.
844     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
845         (!SOffset || !SOffset->isReg()))
846       return VDataIdx;
847   }
848 
849   // MIMG instructions create a hazard if they don't use a 256-bit T# and
850   // the store size is greater than 8 bytes and they have more than two bits
851   // of their dmask set.
852   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
853   if (TII->isMIMG(MI)) {
854     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
855     assert(SRsrcIdx != -1 &&
856            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
857     (void)SRsrcIdx;
858   }
859 
860   if (TII->isFLAT(MI)) {
861     // There is no hazard if the instruction does not use vector regs
862     if (VDataIdx == -1)
863       return -1;
864 
865     if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
866       return VDataIdx;
867   }
868 
869   return -1;
870 }
871 
872 int
873 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
874                                             const MachineRegisterInfo &MRI) {
875   // Helper to check for the hazard where VMEM instructions that store more than
876   // 8 bytes can have their store data overwritten by the next instruction.
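  // For example (illustrative), on an affected target:
  //   buffer_store_dwordx4 v[0:3], off, s[0:3], 0
  //   v_mov_b32            v1, 0     ; VALU def overlaps the store data
  // needs a wait state between the two instructions.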
877   const SIRegisterInfo *TRI = ST.getRegisterInfo();
878 
879   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
880   int WaitStatesNeeded = 0;
881 
882   if (!TRI->isVectorRegister(MRI, Def.getReg()))
883     return WaitStatesNeeded;
884   Register Reg = Def.getReg();
885   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
886     int DataIdx = createsVALUHazard(MI);
887     return DataIdx >= 0 &&
888            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
889   };
890 
891   int WaitStatesNeededForDef =
892     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
893   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
894 
895   return WaitStatesNeeded;
896 }
897 
898 /// A dest sel forwarding issue occurs if additional logic is needed to swizzle
899 /// or pack the computed value into the correct bit position of the dest
900 /// register. This occurs if we have SDWA with dst_sel != DWORD, or op_sel with
901 /// a dst_sel that is not aligned to the register. This function analyzes \p
902 /// MI and \returns an operand with a dst forwarding issue, or nullptr if
903 /// none exists.
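/// For example (illustrative): a 16-bit VOP3 such as V_ADD_F16_e64 with
/// op_sel[3] set writes only the high half of its 32-bit vdst and therefore
/// has a dst forwarding issue.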
904 static const MachineOperand *
905 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
906   if (!SIInstrInfo::isVALU(MI))
907     return nullptr;
908 
909   const SIInstrInfo *TII = ST.getInstrInfo();
910 
911   unsigned Opcode = MI.getOpcode();
912 
913   // There are three different types of instructions which produce a
914   // forwarded dest:
915   //   1. SDWA with dst_sel != DWORD,
916   //   2. VOP3 which writes the hi bits (e.g. op_sel[3] == 1), and
917   //   3. FP8DstSelInst (instructions with a dest byte sel,
918   //      e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
919   if (SIInstrInfo::isSDWA(MI)) {
920     // Type 1: SDWA with dst_sel != DWORD
921     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
922       if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
923         return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
924   }
925 
926   AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
927   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
928     // Type 2: VOP3 which write the hi bits
929     if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
930         SISrcMods::DST_OP_SEL)
931       return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
932 
933     // Type 3: FP8DstSelInst with op_sel[3:2] != 0
934     if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
935         (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
936          SISrcMods::OP_SEL_0))
937       return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
938   }
939 
940   // Special case: nop is required for all the opsel values for fp4 sr variant
941   // cvt scale instructions
942   if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
943     return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
944 
945   return nullptr;
946 }
947 
948 /// Checks whether the provided \p VALU "consumes" the operand with a dest sel
949 /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
950 /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
951 static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
952                                             const MachineOperand *Dst,
953                                             const SIRegisterInfo *TRI) {
954   // We must consider implicit reads of the VALU. SDWA with dst_sel and
955   // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
956   // and we must account for that hazard.
957   // We also must account for WAW hazards. In particular, WAW with dest
958   // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
959   // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
960   // check for ECC. Without accounting for this hazard, the ECC will be
961   // wrong.
962   // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
963   // complete zeroesHigh16BitsOfDest)
964   for (auto &Operand : VALU->operands()) {
965     if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
966       return true;
967     }
968   }
969   return false;
970 }
971 
972 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
973   int WaitStatesNeeded = 0;
974 
975   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
976     const int TransDefWaitstates = 1;
977 
978     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
979       if (!SIInstrInfo::isTRANS(MI))
980         return false;
981       const SIRegisterInfo *TRI = ST.getRegisterInfo();
982       const SIInstrInfo *TII = ST.getInstrInfo();
983       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
984 
985       for (const MachineOperand &Use : VALU->explicit_uses()) {
986         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
987           return true;
988       }
989 
990       return false;
991     };
992 
993     int WaitStatesNeededForDef =
994         TransDefWaitstates -
995         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
996     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
997   }
998 
999   if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1000     const int Shift16DefWaitstates = 1;
1001 
1002     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1003       const SIRegisterInfo *TRI = ST.getRegisterInfo();
1004       const MachineOperand *ForwardedDst =
1005           getDstSelForwardingOperand(ProducerMI, ST);
1006       if (ForwardedDst) {
1007         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1008       }
1009 
1010       if (ProducerMI.isInlineAsm()) {
1011         // Assume inline asm has dst forwarding hazard
1012         for (auto &Def : ProducerMI.all_defs()) {
1013           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1014             return true;
1015         }
1016       }
1017 
1018       return false;
1019     };
1020 
1021     int WaitStatesNeededForDef =
1022         Shift16DefWaitstates -
1023         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1024     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1025   }
1026 
1027   if (ST.hasVDecCoExecHazard()) {
1028     const int VALUWriteSGPRVALUReadWaitstates = 2;
1029     const int VALUWriteEXECRWLane = 4;
1030     const int VALUWriteVGPRReadlaneRead = 1;
1031 
1032     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1033     const MachineRegisterInfo &MRI = MF.getRegInfo();
1034     Register UseReg;
1035     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1036       if (!SIInstrInfo::isVALU(MI))
1037         return false;
1038       return MI.modifiesRegister(UseReg, TRI);
1039     };
1040 
1041     for (const MachineOperand &Use : VALU->explicit_uses()) {
1042       if (!Use.isReg())
1043         continue;
1044 
1045       UseReg = Use.getReg();
1046       if (TRI->isSGPRReg(MRI, UseReg)) {
1047         int WaitStatesNeededForDef =
1048             VALUWriteSGPRVALUReadWaitstates -
1049             getWaitStatesSince(IsVALUDefSGPRFn,
1050                                VALUWriteSGPRVALUReadWaitstates);
1051         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1052       }
1053     }
1054 
1055     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1056       UseReg = AMDGPU::VCC;
1057       int WaitStatesNeededForDef =
1058           VALUWriteSGPRVALUReadWaitstates -
1059           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1060       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1061     }
1062 
1063     switch (VALU->getOpcode()) {
1064     case AMDGPU::V_READLANE_B32:
1065     case AMDGPU::V_READFIRSTLANE_B32: {
1066       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1067       UseReg = Src->getReg();
1068       int WaitStatesNeededForDef =
1069           VALUWriteVGPRReadlaneRead -
1070           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1071       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1072     }
1073       [[fallthrough]];
1074     case AMDGPU::V_WRITELANE_B32: {
1075       UseReg = AMDGPU::EXEC;
1076       int WaitStatesNeededForDef =
1077           VALUWriteEXECRWLane -
1078           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1079       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1080       break;
1081     }
1082     default:
1083       break;
1084     }
1085   }
1086 
1087   // This checks for the hazard where VMEM instructions that store more than
1088   // 8 bytes can have their store data overwritten by the next instruction.
1089   if (!ST.has12DWordStoreHazard())
1090     return WaitStatesNeeded;
1091 
1092   const MachineRegisterInfo &MRI = MF.getRegInfo();
1093 
1094   for (const MachineOperand &Def : VALU->defs()) {
1095     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1096   }
1097 
1098   return WaitStatesNeeded;
1099 }
1100 
1101 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1102   // This checks for hazards associated with inline asm statements.
1103   // Since inline asms can contain just about anything, we use this
1104   // to call/leverage other check*Hazard routines. Note that
1105   // this function doesn't attempt to address all possible inline asm
1106   // hazards (good luck), but is a collection of what has been
1107   // problematic thus far.
1108 
1109   // see checkVALUHazards()
1110   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1111       !ST.hasCvtScaleForwardingHazard())
1112     return 0;
1113 
1114   const MachineRegisterInfo &MRI = MF.getRegInfo();
1115   int WaitStatesNeeded = 0;
1116 
1117   for (const MachineOperand &Op :
1118        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1119     if (Op.isReg() && Op.isDef()) {
1120       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1121         continue;
1122 
1123       if (ST.has12DWordStoreHazard()) {
1124         WaitStatesNeeded =
1125             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1126       }
1127     }
1128   }
1129 
1130   if (ST.hasDstSelForwardingHazard()) {
1131     const int Shift16DefWaitstates = 1;
1132 
1133     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1134       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1135       // Assume inline asm reads the dst
1136       if (Dst)
1137         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1138                IA->readsRegister(Dst->getReg(), &TRI);
1139 
1140       if (ProducerMI.isInlineAsm()) {
1141         // If MI is inline asm, assume it has dst forwarding hazard
1142         for (auto &Def : ProducerMI.all_defs()) {
1143           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1144               IA->readsRegister(Def.getReg(), &TRI)) {
1145             return true;
1146           }
1147         }
1148       }
1149 
1150       return false;
1151     };
1152 
1153     int WaitStatesNeededForDef =
1154         Shift16DefWaitstates -
1155         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1156     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1157   }
1158 
1159   return WaitStatesNeeded;
1160 }
1161 
1162 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1163   const SIInstrInfo *TII = ST.getInstrInfo();
1164   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1165   const MachineRegisterInfo &MRI = MF.getRegInfo();
1166 
1167   const MachineOperand *LaneSelectOp =
1168       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1169 
1170   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1171     return 0;
1172 
1173   Register LaneSelectReg = LaneSelectOp->getReg();
1174   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1175 
1176   const int RWLaneWaitStates = 4;
1177   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1178                                               RWLaneWaitStates);
1179   return RWLaneWaitStates - WaitStatesSince;
1180 }
1181 
1182 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1183   if (!ST.hasRFEHazards())
1184     return 0;
1185 
1186   const SIInstrInfo *TII = ST.getInstrInfo();
1187 
1188   const int RFEWaitStates = 1;
1189 
1190   auto IsHazardFn = [TII](const MachineInstr &MI) {
1191     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1192   };
1193   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1194   return RFEWaitStates - WaitStatesNeeded;
1195 }
1196 
1197 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1198   const SIInstrInfo *TII = ST.getInstrInfo();
1199   const int ReadM0WaitStates = 1;
1200   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1201   return ReadM0WaitStates -
1202          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1203 }
1204 
1205 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1206   fixVMEMtoScalarWriteHazards(MI);
1207   fixVcmpxPermlaneHazards(MI);
1208   fixSMEMtoVectorWriteHazards(MI);
1209   fixVcmpxExecWARHazard(MI);
1210   fixLdsBranchVmemWARHazard(MI);
1211   if (ST.hasLdsDirect()) {
1212     fixLdsDirectVALUHazard(MI);
1213     fixLdsDirectVMEMHazard(MI);
1214   }
1215   fixVALUPartialForwardingHazard(MI);
1216   fixVALUTransUseHazard(MI);
1217   fixWMMAHazards(MI);
1218   fixShift64HighRegBug(MI);
1219   fixVALUMaskWriteHazard(MI);
1220   fixVALUReadSGPRHazard(MI);
1221   fixRequiredExportPriority(MI);
1222 }
1223 
1224 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1225                               const MachineInstr &MI) {
1226   return (TII.isVOPC(MI) ||
1227           (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1228          MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1229 }
1230 
1231 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1232   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1233     return false;
1234 
1235   const SIInstrInfo *TII = ST.getInstrInfo();
1236   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1237   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1238     return isVCmpXWritesExec(*TII, *TRI, MI);
1239   };
1240 
1241   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1242     unsigned Opc = MI.getOpcode();
1243     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1244            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1245   };
1246 
1247   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1248       std::numeric_limits<int>::max())
1249     return false;
1250 
1251   // V_NOP will be discarded by SQ.
1252   // Use V_MOV_B32 v?, v?. The register must be alive, so use src0 of
1253   // V_PERMLANE*, which is always a VGPR and is available.
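  // For example (illustrative):
  //   v_cmpx_le_f32    v0, v1          ; writes EXEC
  //   v_permlane16_b32 v2, v3, s0, s1  ; hazard: needs an intervening VALU
  // is mitigated by emitting "v_mov_b32 v3, v3" (src0 of the permlane) first.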
1254   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1255   Register Reg = Src0->getReg();
1256   bool IsUndef = Src0->isUndef();
1257   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1258           TII->get(AMDGPU::V_MOV_B32_e32))
1259     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1260     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1261 
1262   return true;
1263 }
1264 
1265 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1266   if (!ST.hasVMEMtoScalarWriteHazard())
1267     return false;
1268   assert(!ST.hasExtendedWaitCounts());
1269 
1270   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1271     return false;
1272 
1273   if (MI->getNumDefs() == 0)
1274     return false;
1275 
1276   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1277 
1278   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1279     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1280         !SIInstrInfo::isFLAT(I))
1281       return false;
1282 
1283     for (const MachineOperand &Def : MI->defs()) {
1284       const MachineOperand *Op =
1285           I.findRegisterUseOperand(Def.getReg(), TRI, false);
1286       if (!Op)
1287         continue;
1288       return true;
1289     }
1290     return false;
1291   };
1292 
1293   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1294     return SIInstrInfo::isVALU(MI) ||
1295            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1296             !MI.getOperand(0).getImm()) ||
1297            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1298             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1299   };
1300 
1301   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1302       std::numeric_limits<int>::max())
1303     return false;
1304 
1305   const SIInstrInfo *TII = ST.getInstrInfo();
1306   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1307           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1308       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1309   return true;
1310 }
1311 
1312 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1313   if (!ST.hasSMEMtoVectorWriteHazard())
1314     return false;
1315   assert(!ST.hasExtendedWaitCounts());
1316 
1317   if (!SIInstrInfo::isVALU(*MI))
1318     return false;
1319 
1320   unsigned SDSTName;
1321   switch (MI->getOpcode()) {
1322   case AMDGPU::V_READLANE_B32:
1323   case AMDGPU::V_READFIRSTLANE_B32:
1324     SDSTName = AMDGPU::OpName::vdst;
1325     break;
1326   default:
1327     SDSTName = AMDGPU::OpName::sdst;
1328     break;
1329   }
1330 
1331   const SIInstrInfo *TII = ST.getInstrInfo();
1332   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1333   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1334   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1335   if (!SDST) {
1336     for (const auto &MO : MI->implicit_operands()) {
1337       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1338         SDST = &MO;
1339         break;
1340       }
1341     }
1342   }
1343 
1344   if (!SDST)
1345     return false;
1346 
1347   const Register SDSTReg = SDST->getReg();
1348   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1349     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1350   };
1351 
1352   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1353     if (TII->isSALU(MI)) {
1354       switch (MI.getOpcode()) {
1355       case AMDGPU::S_SETVSKIP:
1356       case AMDGPU::S_VERSION:
1357       case AMDGPU::S_WAITCNT_VSCNT:
1358       case AMDGPU::S_WAITCNT_VMCNT:
1359       case AMDGPU::S_WAITCNT_EXPCNT:
1360         // These instructions cannot mitigate the hazard.
1361         return false;
1362       case AMDGPU::S_WAITCNT_LGKMCNT:
1363         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1364         return (MI.getOperand(1).getImm() == 0) &&
1365                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1366       case AMDGPU::S_WAITCNT: {
1367         const int64_t Imm = MI.getOperand(0).getImm();
1368         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1369         // DsCnt corresponds to LGKMCnt here.
1370         return (Decoded.DsCnt == 0);
1371       }
1372       default:
1373         // SOPP instructions cannot mitigate the hazard.
1374         if (TII->isSOPP(MI))
1375           return false;
1376         // At this point the SALU can be assumed to mitigate the hazard
1377         // because either:
1378         // (a) it is independent of the at risk SMEM (breaking chain),
1379         // or
1380         // (b) it is dependent on the SMEM, in which case an appropriate
1381         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1382         //     SMEM instruction.
1383         return true;
1384       }
1385     }
1386     return false;
1387   };
1388 
1389   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1390       std::numeric_limits<int>::max())
1391     return false;
1392 
1393   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1394           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1395       .addImm(0);
1396   return true;
1397 }
1398 
1399 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1400   if (!ST.hasVcmpxExecWARHazard())
1401     return false;
1402   assert(!ST.hasExtendedWaitCounts());
1403 
1404   if (!SIInstrInfo::isVALU(*MI))
1405     return false;
1406 
1407   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1408   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1409     return false;
1410 
1411   auto IsHazardFn = [TRI](const MachineInstr &I) {
1412     if (SIInstrInfo::isVALU(I))
1413       return false;
1414     return I.readsRegister(AMDGPU::EXEC, TRI);
1415   };
1416 
1417   const SIInstrInfo *TII = ST.getInstrInfo();
1418   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1419     if (SIInstrInfo::isVALU(MI)) {
1420       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1421         return true;
1422       for (auto MO : MI.implicit_operands())
1423         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1424           return true;
1425     }
1426     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1427         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1428       return true;
1429     return false;
1430   };
1431 
1432   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1433       std::numeric_limits<int>::max())
1434     return false;
1435 
1436   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1437           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1438       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1439   return true;
1440 }
1441 
1442 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1443                                                  const GCNSubtarget &ST) {
1444   if (!ST.hasLdsBranchVmemWARHazard())
1445     return false;
1446 
1447   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1448   // instructions need to appear in the same function.
1449   bool HasLds = false;
1450   bool HasVmem = false;
1451   for (auto &MBB : MF) {
1452     for (auto &MI : MBB) {
1453       HasLds |= SIInstrInfo::isDS(MI);
1454       HasVmem |=
1455           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1456       if (HasLds && HasVmem)
1457         return true;
1458     }
1459   }
1460   return false;
1461 }
1462 
1463 static bool isStoreCountWaitZero(const MachineInstr &I) {
1464   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1465          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1466          !I.getOperand(1).getImm();
1467 }
1468 
1469 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1470   if (!RunLdsBranchVmemWARHazardFixup)
1471     return false;
1472 
1473   assert(ST.hasLdsBranchVmemWARHazard());
1474   assert(!ST.hasExtendedWaitCounts());
1475 
1476   auto IsHazardInst = [](const MachineInstr &MI) {
1477     if (SIInstrInfo::isDS(MI))
1478       return 1;
1479     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1480       return 2;
1481     return 0;
1482   };
1483 
1484   auto InstType = IsHazardInst(*MI);
1485   if (!InstType)
1486     return false;
1487 
1488   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1489     return IsHazardInst(I) || isStoreCountWaitZero(I);
1490   };
1491 
1492   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1493     if (!I.isBranch())
1494       return false;
1495 
1496     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1497       auto InstType2 = IsHazardInst(I);
1498       return InstType2 && InstType != InstType2;
1499     };
1500 
1501     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1502       auto InstType2 = IsHazardInst(I);
1503       if (InstType == InstType2)
1504         return true;
1505 
1506       return isStoreCountWaitZero(I);
1507     };
1508 
1509     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1510            std::numeric_limits<int>::max();
1511   };
1512 
1513   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1514       std::numeric_limits<int>::max())
1515     return false;
1516 
1517   const SIInstrInfo *TII = ST.getInstrInfo();
1518   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1519           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1520     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1521     .addImm(0);
1522 
1523   return true;
1524 }
1525 
1526 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1527   if (!SIInstrInfo::isLDSDIR(*MI))
1528     return false;
1529 
1530   const int NoHazardWaitStates = 15;
1531   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1532   const Register VDSTReg = VDST->getReg();
1533 
1534   bool VisitedTrans = false;
1535   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1536     if (!SIInstrInfo::isVALU(I))
1537       return false;
1538     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1539     // Cover both WAR and WAW
1540     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1541   };
1542   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1543     if (WaitStates >= NoHazardWaitStates)
1544       return true;
1545     // Instructions which cause va_vdst==0 expire the hazard
1546     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1547            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1548   };
1549   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1550     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1551   };
1552 
1553   DenseSet<const MachineBasicBlock *> Visited;
1554   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1555                                     std::next(MI->getReverseIterator()), 0,
1556                                     IsExpiredFn, Visited, GetWaitStatesFn);
1557 
1558   // Transcendentals can execute in parallel with other VALUs.
1559   // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1560   if (VisitedTrans)
1561     Count = 0;
1562 
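  // Encode the distance (in VALU instructions) to the closest conflicting VALU
  // into the waitvdst field; clamping to NoHazardWaitStates (15) means no wait
  // is actually required.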
1563   MachineOperand *WaitVdstOp =
1564       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1565   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1566 
1567   return true;
1568 }
1569 
1570 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1571   if (!SIInstrInfo::isLDSDIR(*MI))
1572     return false;
1573 
1574   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1575   const Register VDSTReg = VDST->getReg();
1576 
1577   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1578     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1579         !SIInstrInfo::isDS(I))
1580       return false;
1581     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1582   };
1583   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1584   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1585   // according to the type of VMEM instruction.
1586   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1587     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1588            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1589            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1590             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1591            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1592             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1593   };
1594 
1595   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1596       std::numeric_limits<int>::max())
1597     return false;
1598 
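  // Two mitigation strategies: on targets where LDSDIR can wait on VM_VSRC
  // directly, encode the wait in the instruction's own waitvsrc field;
  // otherwise insert an explicit "s_waitcnt_depctr vm_vsrc(0)".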
1599   if (LdsdirCanWait) {
1600     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1601   } else {
1602     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1603             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1604         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1605   }
1606 
1607   return true;
1608 }
1609 
1610 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1611   if (!ST.hasVALUPartialForwardingHazard())
1612     return false;
1613   assert(!ST.hasExtendedWaitCounts());
1614 
1615   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1616     return false;
1617 
1618   SmallSetVector<Register, 4> SrcVGPRs;
1619 
1620   for (const MachineOperand &Use : MI->explicit_uses()) {
1621     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1622       SrcVGPRs.insert(Use.getReg());
1623   }
1624 
1625   // Only applies with >= 2 unique VGPR sources
1626   if (SrcVGPRs.size() <= 1)
1627     return false;
1628 
1629   // Look for the following pattern:
1630   //   Va <- VALU [PreExecPos]
1631   //   intv1
1632   //   Exec <- SALU [ExecPos]
1633   //   intv2
1634   //   Vb <- VALU [PostExecPos]
1635   //   intv3
1636   //   MI Va, Vb (WaitState = 0)
1637   //
1638   // Where:
1639   // intv1 + intv2 <= 2 VALUs
1640   // intv3 <= 4 VALUs
1641   //
1642   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
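  //
  // Illustrative (hypothetical) sequence matching the pattern, assuming wave64:
  //   v_mov_b32 v0, v2        ; Va <- VALU            [PreExecPos]
  //   s_mov_b64 exec, s[0:1]  ; Exec <- SALU          [ExecPos]
  //   v_mov_b32 v1, v3        ; Vb <- VALU            [PostExecPos]
  //   v_add_f32 v4, v0, v1    ; MI reads Va and Vb -> hazard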
1643 
1644   const int Intv1plus2MaxVALUs = 2;
1645   const int Intv3MaxVALUs = 4;
1646   const int IntvMaxVALUs = 6;
1647   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1648 
1649   struct StateType {
1650     SmallDenseMap<Register, int, 4> DefPos;
1651     int ExecPos = std::numeric_limits<int>::max();
1652     int VALUs = 0;
1653   };
1654 
1655   StateType State;
1656 
1657   // This lambda combines expiry testing with all of the hazard detection
1658   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1659     // Too many VALU states have passed
1660     if (State.VALUs > NoHazardVALUWaitStates)
1661       return HazardExpired;
1662 
1663     // Instructions which cause va_vdst==0 expire the hazard
1664     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1665         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1666         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1667          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1668       return HazardExpired;
1669 
1670     // Track register writes
1671     bool Changed = false;
1672     if (SIInstrInfo::isVALU(I)) {
1673       for (Register Src : SrcVGPRs) {
1674         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1675           State.DefPos[Src] = State.VALUs;
1676           Changed = true;
1677         }
1678       }
1679     } else if (SIInstrInfo::isSALU(I)) {
1680       if (State.ExecPos == std::numeric_limits<int>::max()) {
1681         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1682           State.ExecPos = State.VALUs;
1683           Changed = true;
1684         }
1685       }
1686     }
1687 
1688     // Early expiration: too many VALUs in intv3
1689     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1690       return HazardExpired;
1691 
1692     // Only evaluate state if something changed
1693     if (!Changed)
1694       return NoHazardFound;
1695 
1696     // Determine positions of VALUs pre/post exec change
1697     if (State.ExecPos == std::numeric_limits<int>::max())
1698       return NoHazardFound;
1699 
1700     int PreExecPos = std::numeric_limits<int>::max();
1701     int PostExecPos = std::numeric_limits<int>::max();
1702 
1703     for (auto Entry : State.DefPos) {
1704       int DefVALUs = Entry.second;
1705       if (DefVALUs != std::numeric_limits<int>::max()) {
1706         if (DefVALUs >= State.ExecPos)
1707           PreExecPos = std::min(PreExecPos, DefVALUs);
1708         else
1709           PostExecPos = std::min(PostExecPos, DefVALUs);
1710       }
1711     }
1712 
1713     // Need a VALU def after the exec change
1714     if (PostExecPos == std::numeric_limits<int>::max())
1715       return NoHazardFound;
1716 
1717     // Too many VALUs in intv3?
1718     int Intv3VALUs = PostExecPos;
1719     if (Intv3VALUs > Intv3MaxVALUs)
1720       return HazardExpired;
1721 
1722     // Too many VALUs in intv2?
1723     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1724     if (Intv2VALUs > Intv1plus2MaxVALUs)
1725       return HazardExpired;
1726 
1727     // Need a VALU def before the exec change
1728     if (PreExecPos == std::numeric_limits<int>::max())
1729       return NoHazardFound;
1730 
1731     // Too many VALUs in intv1?
1732     int Intv1VALUs = PreExecPos - State.ExecPos;
1733     if (Intv1VALUs > Intv1plus2MaxVALUs)
1734       return HazardExpired;
1735 
1736     // Too many VALUs in intv1 + intv2?
1737     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1738       return HazardExpired;
1739 
1740     return HazardFound;
1741   };
1742   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1743     if (SIInstrInfo::isVALU(MI))
1744       State.VALUs += 1;
1745   };
1746 
1747   DenseSet<const MachineBasicBlock *> Visited;
1748   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1749                             std::next(MI->getReverseIterator()), Visited))
1750     return false;
1751 
1752   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1753           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1754       .addImm(0x0fff);
1755 
1756   return true;
1757 }
1758 
1759 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1760   if (!ST.hasVALUTransUseHazard())
1761     return false;
1762   assert(!ST.hasExtendedWaitCounts());
1763 
1764   if (!SIInstrInfo::isVALU(*MI))
1765     return false;
1766 
1767   SmallSet<Register, 4> SrcVGPRs;
1768 
1769   for (const MachineOperand &Use : MI->explicit_uses()) {
1770     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1771       SrcVGPRs.insert(Use.getReg());
1772   }
1773 
1774   // Look for the following pattern:
1775   //   Va <- TRANS VALU
1776   //   intv
1777   //   MI Va (WaitState = 0)
1778   //
1779   // Where:
1780   // intv <= 5 VALUs / 1 TRANS
1781   //
1782   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
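  //
  // Illustrative (hypothetical) sequence matching the pattern:
  //   v_exp_f32 v0, v1      ; Va <- TRANS VALU
  //   v_add_f32 v2, v3, v4  ; intv: 1 VALU
  //   v_mul_f32 v5, v0, v6  ; MI reads Va -> hazard, wait on va_vdst needed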
1783 
1784   const int IntvMaxVALUs = 5;
1785   const int IntvMaxTRANS = 1;
1786 
1787   struct StateType {
1788     int VALUs = 0;
1789     int TRANS = 0;
1790   };
1791 
1792   StateType State;
1793 
1794   // This lambda combines expiry testing with all of the hazard detection
1795   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1796     // Too many VALU states have passed
1797     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1798       return HazardExpired;
1799 
1800     // Instructions which cause va_vdst==0 expire the hazard
1801     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1802         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1803         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1804          I.getOperand(0).getImm() == 0x0fff))
1805       return HazardExpired;
1806 
1807     // Track register writes
1808     if (SIInstrInfo::isTRANS(I)) {
1809       for (Register Src : SrcVGPRs) {
1810         if (I.modifiesRegister(Src, &TRI)) {
1811           return HazardFound;
1812         }
1813       }
1814     }
1815 
1816     return NoHazardFound;
1817   };
1818   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1819     if (SIInstrInfo::isVALU(MI))
1820       State.VALUs += 1;
1821     if (SIInstrInfo::isTRANS(MI))
1822       State.TRANS += 1;
1823   };
1824 
1825   DenseSet<const MachineBasicBlock *> Visited;
1826   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1827                             std::next(MI->getReverseIterator()), Visited))
1828     return false;
1829 
1830   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1831   // hazard is avoided.
1832   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1833           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1834       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1835 
1836   return true;
1837 }
1838 
1839 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1840   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1841     return false;
1842 
1843   const SIInstrInfo *TII = ST.getInstrInfo();
1844   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1845 
1846   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1847     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1848       return false;
1849 
1850     // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1851     // overlaps with the dest (matrix D) of the previous WMMA.
1852     const Register CurSrc0Reg =
1853         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1854     const Register CurSrc1Reg =
1855         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1856 
1857     const Register PrevDstReg =
1858         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1859 
1860     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1861         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1862       return true;
1863     }
1864 
1865     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1866     // but Index can't overlap with PrevDstReg.
1867     if (AMDGPU::isGFX12Plus(ST)) {
1868       if (SIInstrInfo::isSWMMAC(*MI)) {
1869         const Register CurIndex =
1870             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1871         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1872           return true;
1873       }
1874       return false;
1875     }
1876 
1877     return false;
1878   };
1879 
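  // Any VALU instruction between the two WMMA/SWMMAC operations resolves the
  // dependency, which is why inserting a single V_NOP below is sufficient.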
1880   auto IsExpiredFn = [](const MachineInstr &I, int) {
1881     return SIInstrInfo::isVALU(I);
1882   };
1883 
1884   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1885       std::numeric_limits<int>::max())
1886     return false;
1887 
1888   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1889 
1890   return true;
1891 }
1892 
1893 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1894   if (!ST.hasShift64HighRegBug())
1895     return false;
1896   assert(!ST.hasExtendedWaitCounts());
1897 
1898   switch (MI->getOpcode()) {
1899   default:
1900     return false;
1901   case AMDGPU::V_LSHLREV_B64_e64:
1902   case AMDGPU::V_LSHRREV_B64_e64:
1903   case AMDGPU::V_ASHRREV_I64_e64:
1904     break;
1905   }
1906 
1907   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1908   if (!Amt->isReg())
1909     return false;
1910 
1911   Register AmtReg = Amt->getReg();
1912   const MachineRegisterInfo &MRI = MF.getRegInfo();
1913   // Check if this is the last VGPR in the allocation block.
1914   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1915     return false;
1916 
1917   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1918     return false;
1919 
1920   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1921   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1922   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1923   bool Overlapped = OverlappedSrc || OverlappedDst;
1924 
1925   assert(!OverlappedDst || !OverlappedSrc ||
1926          Src1->getReg() == MI->getOperand(0).getReg());
1927   assert(ST.needsAlignedVGPRs());
1928   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1929 
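  // Scavenge a scratch VGPR (or an aligned VGPR pair when the shift amount
  // overlaps a 64-bit source or destination) that MI neither reads nor writes,
  // then exchange it with the problematic register via V_SWAP_B32.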
1930   Register NewReg;
1931   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1932                                    : AMDGPU::VGPR_32RegClass) {
1933     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1934       NewReg = Reg;
1935       break;
1936     }
1937   }
1938 
1939   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1940                                : NewReg;
1941   Register NewAmtLo;
1942 
1943   if (Overlapped)
1944     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1945 
1946   DebugLoc DL = MI->getDebugLoc();
1947   MachineBasicBlock *MBB = MI->getParent();
1948   // Insert a full wait count because the found register might be pending a wait.
1949   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1950       .addImm(0);
1951 
1952   // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
1953   if (Overlapped)
1954     runOnInstruction(
1955         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1956             .addDef(AmtReg - 1)
1957             .addReg(AmtReg - 1, RegState::Undef)
1958             .addReg(NewAmtLo, RegState::Undef));
1959   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1960                        .addDef(AmtReg)
1961                        .addReg(AmtReg, RegState::Undef)
1962                        .addReg(NewAmt, RegState::Undef));
1963 
1964   // Instructions emitted after the current instruction will be processed by the
1965   // parent loop of the hazard recognizer in a natural way.
1966   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1967           AmtReg)
1968       .addDef(NewAmt)
1969       .addReg(NewAmt)
1970       .addReg(AmtReg);
1971   if (Overlapped)
1972     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1973             AmtReg - 1)
1974         .addDef(NewAmtLo)
1975         .addReg(NewAmtLo)
1976         .addReg(AmtReg - 1);
1977 
1978   // Re-running the hazard recognizer on the modified instruction is not
1979   // necessary: the inserted V_SWAP_B32 has already both read and written the
1980   // new registers, so hazards related to these registers have been handled.
1981   Amt->setReg(NewAmt);
1982   Amt->setIsKill(false);
1983   // We do not update liveness, so the verifier may see it as undef.
1984   Amt->setIsUndef();
1985   if (OverlappedDst)
1986     MI->getOperand(0).setReg(NewReg);
1987   if (OverlappedSrc) {
1988     Src1->setReg(NewReg);
1989     Src1->setIsKill(false);
1990     Src1->setIsUndef();
1991   }
1992 
1993   return true;
1994 }
1995 
1996 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1997   int NSAtoVMEMWaitStates = 1;
1998 
1999   if (!ST.hasNSAtoVMEMBug())
2000     return 0;
2001 
2002   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
2003     return 0;
2004 
2005   const SIInstrInfo *TII = ST.getInstrInfo();
2006   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2007   if (!Offset || (Offset->getImm() & 6) == 0)
2008     return 0;
2009 
2010   auto IsHazardFn = [TII](const MachineInstr &I) {
2011     if (!SIInstrInfo::isMIMG(I))
2012       return false;
2013     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2014     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2015            TII->getInstSizeInBytes(I) >= 16;
2016   };
2017 
2018   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2019 }
2020 
2021 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2022   int FPAtomicToDenormModeWaitStates = 3;
2023 
2024   if (!ST.hasFPAtomicToDenormModeHazard())
2025     return 0;
2026   assert(!ST.hasExtendedWaitCounts());
2027 
2028   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2029     return 0;
2030 
2031   auto IsHazardFn = [](const MachineInstr &I) {
2032     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2033       return false;
2034     return SIInstrInfo::isFPAtomic(I);
2035   };
2036 
2037   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2038     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2039       return true;
2040 
2041     switch (MI.getOpcode()) {
2042     case AMDGPU::S_WAITCNT:
2043     case AMDGPU::S_WAITCNT_VSCNT:
2044     case AMDGPU::S_WAITCNT_VMCNT:
2045     case AMDGPU::S_WAITCNT_EXPCNT:
2046     case AMDGPU::S_WAITCNT_LGKMCNT:
2047     case AMDGPU::S_WAIT_IDLE:
2048       return true;
2049     default:
2050       break;
2051     }
2052 
2053     return false;
2054   };
2055 
2056   return FPAtomicToDenormModeWaitStates -
2057          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2058 }
2059 
2060 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2061   assert(SIInstrInfo::isMAI(*MI));
2062 
2063   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2064 }
2065 
2066 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2067   // Early exit if no padding is requested.
2068   if (MFMAPaddingRatio == 0)
2069     return 0;
2070 
2071   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2072   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2073     return 0;
2074 
2075   int NeighborMFMALatency = 0;
2076   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2077                             this](const MachineInstr &MI) {
2078     if (!SIInstrInfo::isMFMA(MI))
2079       return false;
2080 
2081     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2082     return true;
2083   };
2084 
2085   const int MaxMFMAPipelineWaitStates = 16;
2086   int WaitStatesSinceNeighborMFMA =
2087       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2088 
2089   int NeighborMFMAPaddingNeeded =
2090       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2091       WaitStatesSinceNeighborMFMA;
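  // Illustrative arithmetic (hypothetical values): with a neighbor latency of
  // 16 and a padding ratio of 50, the target padding is 16 * 50 / 100 = 8 wait
  // states, reduced by however many wait states have already elapsed.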
2092 
2093   return std::max(0, NeighborMFMAPaddingNeeded);
2094 }
2095 
2096 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2097   int WaitStatesNeeded = 0;
2098   unsigned Opc = MI->getOpcode();
2099 
2100   auto IsVALUFn = [](const MachineInstr &MI) {
2101     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2102   };
2103 
2104   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2105     const int LegacyVALUWritesVGPRWaitStates = 2;
2106     const int VALUWritesExecWaitStates = 4;
2107     const int MaxWaitStates = 4;
2108 
2109     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2110       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2111     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2112 
2113     if (WaitStatesNeeded < MaxWaitStates) {
2114       for (const MachineOperand &Use : MI->explicit_uses()) {
2115         const int MaxWaitStates = 2;
2116 
2117         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2118           continue;
2119 
2120         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2121           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2122         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2123 
2124         if (WaitStatesNeeded == MaxWaitStates)
2125           break;
2126       }
2127     }
2128   }
2129 
2130   for (const MachineOperand &Op : MI->explicit_operands()) {
2131     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2132       continue;
2133 
2134     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2135       continue;
2136 
2137     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2138     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2139     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2140     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2141     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2142     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2143     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2144     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2145     const int MaxWaitStates = 18;
2146     Register Reg = Op.getReg();
2147     unsigned HazardDefLatency = 0;
2148 
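    // The latency of the overlapping MFMA def (2, 8 or 16 passes) distinguishes
    // the 4x4, 16x16 and 32x32 variants referenced by the wait-state constants
    // above.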
2149     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2150                                this](const MachineInstr &MI) {
2151       if (!SIInstrInfo::isMFMA(MI))
2152         return false;
2153       Register DstReg = MI.getOperand(0).getReg();
2154       if (DstReg == Reg)
2155         return false;
2156       HazardDefLatency =
2157           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2158       return TRI.regsOverlap(DstReg, Reg);
2159     };
2160 
2161     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2162                                                    MaxWaitStates);
2163     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2164     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2165     int OpNo = Op.getOperandNo();
2166     if (OpNo == SrcCIdx) {
2167       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2168     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2169       switch (HazardDefLatency) {
2170       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2171                break;
2172       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2173                break;
2174       case 16: [[fallthrough]];
2175       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2176                break;
2177       }
2178     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2179       switch (HazardDefLatency) {
2180       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2181                break;
2182       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2183                break;
2184       case 16: [[fallthrough]];
2185       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2186                break;
2187       }
2188     }
2189 
2190     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2191     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2192 
2193     if (WaitStatesNeeded == MaxWaitStates)
2194       return WaitStatesNeeded; // Early exit.
2195 
2196     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2197       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2198         return false;
2199       Register DstReg = MI.getOperand(0).getReg();
2200       return TRI.regsOverlap(Reg, DstReg);
2201     };
2202 
2203     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2204     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2205     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2206     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2207     if (OpNo == SrcCIdx)
2208       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2209     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2210       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2211 
2212     WaitStatesNeededForUse = NeedWaitStates -
2213       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2214     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2215 
2216     if (WaitStatesNeeded == MaxWaitStates)
2217       return WaitStatesNeeded; // Early exit.
2218   }
2219 
2220   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2221     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2222     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2223     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2224     const int MaxWaitStates = 13;
2225     Register DstReg = MI->getOperand(0).getReg();
2226     unsigned HazardDefLatency = 0;
2227 
2228     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2229                          this](const MachineInstr &MI) {
2230       if (!SIInstrInfo::isMFMA(MI))
2231         return false;
2232       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2233       HazardDefLatency =
2234           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2235       return TRI.regsOverlap(Reg, DstReg);
2236     };
2237 
2238     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2239     int NeedWaitStates;
2240     switch (HazardDefLatency) {
2241     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2242              break;
2243     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2244              break;
2245     case 16: [[fallthrough]];
2246     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2247              break;
2248     }
2249 
2250     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2251     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2252   }
2253 
2254   // Pad neighboring MFMA with noops for better inter-wave performance.
2255   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2256 
2257   return WaitStatesNeeded;
2258 }
2259 
2260 static int
2261 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2262                                                               bool IsGFX950) {
2263   // xdl def cycles | gfx940 | gfx950
2264   // 2 pass         |   3    |   4
2265   // 4 pass         |   5    |   6
2266   // 8 pass         |   9    |  10
2267   // 16 pass        |  17    |  18
2268   return NumPasses + 1 + IsGFX950;
2269 }
2270 
2271 static int
2272 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2273                                                               bool IsGFX950) {
2274   // xdl def cycles | gfx940 | gfx950
2275   // 2 pass         |   3    |   3
2276   // 4 pass         |   5    |   6
2277   // 8 pass         |   9    |  10
2278   // 16 pass        |  17    |  18
2279   return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2280 }
2281 
2282 static int
2283 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2284   // 2 pass -> 2
2285   // 4 pass -> 4
2286   // 8 pass -> 8
2287   // 16 pass -> 16
2288   return NumPasses;
2289 }
2290 
2291 static int
2292 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2293   // 2 pass -> 4
2294   // 4 pass -> 6
2295   // 8 pass -> 10
2296   // 16 pass -> 18
2297   return NumPasses + 2;
2298 }
2299 
2300 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2301   // 2 pass -> 5
2302   // 4 pass -> 7
2303   // 8 pass -> 11
2304   // 16 pass -> 19
2305   return NumPasses + 3;
2306 }
2307 
2308 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2309   int WaitStatesNeeded = 0;
2310   unsigned Opc = MI->getOpcode();
2311 
2312   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2313     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2314   };
2315 
2316   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2317     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2318            !SIInstrInfo::isDOT(MI);
2319   };
2320 
2321   if (!SIInstrInfo::isMFMA(*MI))
2322     return WaitStatesNeeded;
2323 
2324   const int VALUWritesExecWaitStates = 4;
2325   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2326     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2327                           VALUWritesExecWaitStates);
2328   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2329 
2330   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
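  // Reads through src2 (matrix C) generally tolerate shorter waits than reads
  // of the A/B operands, so the src2 operand index is tracked separately below.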
2331 
2332   // This loop handles both DGEMM and S/HGEMM as the 2nd instruction.
2333   for (const MachineOperand &Use : MI->explicit_uses()) {
2334     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2335     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2336     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2337     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2338     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2339     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2340     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2341     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2342     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2343     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2344     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2345     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2346     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2347     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2348     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2349     const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2350     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2351     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2352     const int MaxWaitStates = 19;
2353 
2354     if (!Use.isReg())
2355       continue;
2356     Register Reg = Use.getReg();
2357     bool FullReg;
2358     const MachineInstr *MI1;
2359 
2360     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2361                                this](const MachineInstr &MI) {
2362       if (!SIInstrInfo::isMFMA(MI))
2363         return false;
2364       Register DstReg = MI.getOperand(0).getReg();
2365       FullReg = (DstReg == Reg);
2366       MI1 = &MI;
2367       return TRI.regsOverlap(DstReg, Reg);
2368     };
2369 
2370     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2371       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2372     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2373 
2374     int NumWaitStates =
2375         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2376     if (NumWaitStates == std::numeric_limits<int>::max())
2377       continue;
2378 
2379     int OpNo = Use.getOperandNo();
2380     unsigned Opc1 = MI1->getOpcode();
2381     int NeedWaitStates = 0;
2382     if (OpNo == SrcCIdx) {
2383       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2384         NeedWaitStates = 0;
2385       } else if (FullReg) {
2386         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2387              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2388             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2389              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2390           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2391         else if (ST.hasGFX940Insts() &&
2392                  TSchedModel.computeInstrLatency(MI1) == 2)
2393           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2394       } else {
2395         switch (Opc1) {
2396         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2397         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2398         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2399         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2400           if (!isXDL(ST, *MI))
2401             NeedWaitStates =
2402                 ST.hasGFX950Insts()
2403                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2404                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2405           break;
2406         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2407         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2408           if (!isXDL(ST, *MI))
2409             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2410           break;
2411         default:
2412           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2413           if (ST.hasGFX940Insts()) {
2414             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2415               break;
2416 
2417             NeedWaitStates =
2418                 isXDL(ST, *MI1)
2419                     ? (isXDL(ST, *MI)
2420                            ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2421                                  NumPasses, ST.hasGFX950Insts())
2422                            : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2423                                  NumPasses, ST.hasGFX950Insts()))
2424                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2425                           NumPasses);
2426             break;
2427           }
2428 
2429           switch (NumPasses) {
2430           case 2:
2431             NeedWaitStates =
2432                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2433                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2434             break;
2435           case 8:
2436             NeedWaitStates =
2437                 isDGEMM(Opc)
2438                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2439                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2440             break;
2441           case 16:
2442             NeedWaitStates =
2443                 isDGEMM(Opc)
2444                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2445                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2446             break;
2447           default:
2448             llvm_unreachable("unexpected number of passes");
2449           }
2450         }
2451       }
2452     } else {
2453       switch (Opc1) {
2454       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2455       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2456       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2457       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2458         NeedWaitStates =
2459             ST.hasGFX950Insts()
2460                 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2461                 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2462         break;
2463       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2464       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2465         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2466         break;
2467       default:
2468         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2469 
2470         if (ST.hasGFX940Insts()) {
2471           NeedWaitStates =
2472               isXDL(ST, *MI1)
2473                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2474                         NumPasses)
2475                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2476                         NumPasses);
2477           break;
2478         }
2479 
2480         switch (NumPasses) {
2481         case 2:
2482           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2483           break;
2484         case 4:
2485           llvm_unreachable("unexpected number of passes for mfma");
2486         case 8:
2487           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2488           break;
2489         case 16:
2490         default:
2491           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2492         }
2493       }
2494     }
2495     if (WaitStatesNeeded >= NeedWaitStates)
2496       continue;
2497 
2498     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2499     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2500 
2501     if (WaitStatesNeeded == MaxWaitStates)
2502       break;
2503   }
2504 
2505   // Pad neighboring MFMA with noops for better inter-wave performance.
2506   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2507 
2508   return WaitStatesNeeded;
2509 }
2510 
2511 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2512   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2513   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2514     return 0;
2515 
2516   int WaitStatesNeeded = 0;
2517 
2518   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2519     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2520   };
2521 
2522   for (const MachineOperand &Op : MI->explicit_uses()) {
2523     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2524       continue;
2525 
2526     Register Reg = Op.getReg();
2527 
2528     const int AccVgprReadLdStWaitStates = 2;
2529     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2530     const int MaxWaitStates = 2;
2531 
2532     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2533       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2534     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2535 
2536     if (WaitStatesNeeded == MaxWaitStates)
2537       return WaitStatesNeeded; // Early exit.
2538 
2539     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2540       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2541           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2542         return false;
2543       auto IsVALUFn = [](const MachineInstr &MI) {
2544         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2545       };
2546       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2547              std::numeric_limits<int>::max();
2548     };
2549 
2550     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2551       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2552     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553   }
2554 
2555   return WaitStatesNeeded;
2556 }
2557 
2558 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2559   assert(!ST.hasVcmpxPermlaneHazard() &&
2560          "this is a different vcmpx+permlane hazard");
2561   const SIRegisterInfo *TRI = ST.getRegisterInfo();
2562   const SIInstrInfo *TII = ST.getInstrInfo();
2563 
2564   auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2565     return isVCmpXWritesExec(*TII, *TRI, MI);
2566   };
2567 
2568   auto IsVALUFn = [](const MachineInstr &MI) {
2569     return SIInstrInfo::isVALU(MI);
2570   };
2571 
2572   const int VCmpXWritesExecWaitStates = 4;
2573   const int VALUWritesVDstWaitStates = 2;
2574   int WaitStatesNeeded = 0;
2575 
2576   for (const MachineOperand &Op : MI->explicit_uses()) {
2577     if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2578       continue;
2579     Register Reg = Op.getReg();
2580 
2581     int WaitStatesSinceDef =
2582         VALUWritesVDstWaitStates -
2583         getWaitStatesSinceDef(Reg, IsVALUFn,
2584                               /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2585     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2586     if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2587       break;
2588   }
2589 
2590   int VCmpXHazardWaits =
2591       VCmpXWritesExecWaitStates -
2592       getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2593 
2594   WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2595   return WaitStatesNeeded;
2596 }
2597 
2598 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2599   // 2 pass -> 4
2600   // 4 pass -> 6
2601   // 8 pass -> 10
2602   // 16 pass -> 18
2603   return NumPasses + 2;
2604 }
2605 
2606 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2607   // 2 pass -> 5
2608   // 4 pass -> 7
2609   // 8 pass -> 11
2610   // 16 pass -> 19
2611   return NumPasses + 3;
2612 }
2613 
2614 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2615   // 2 pass -> 5
2616   // 4 pass -> 7
2617   // 8 pass -> 11
2618   // 16 pass -> 19
2619   return NumPasses + 3;
2620 }
2621 
2622 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2623   // 2 pass -> 4
2624   // 4 pass -> 6
2625   // 8 pass -> 10
2626   // 16 pass -> 18
2627   return NumPasses + 2;
2628 }
2629 
2630 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2631   if (!ST.hasGFX90AInsts())
2632     return 0;
2633 
2634   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2635     return isDGEMM(MI.getOpcode());
2636   };
2637 
2638   // This is checked in checkMAIHazards90A()
2639   if (SIInstrInfo::isMFMA(*MI))
2640     return 0;
2641 
2642   const MachineRegisterInfo &MRI = MF.getRegInfo();
2643 
2644   int WaitStatesNeeded = 0;
2645 
2646   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2647                SIInstrInfo::isFLAT(*MI) ||
2648                SIInstrInfo::isDS(*MI);
2649   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2650   bool IsVALU = SIInstrInfo::isVALU(*MI);
2651 
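  // Three groups of checks follow: MFMA/DOT results feeding this instruction's
  // uses (RAW), DGEMM results feeding 64-bit FMA, and MFMA writes or src2
  // reads that conflict with this instruction's defs (WAW/WAR).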
2652   const MachineInstr *MFMA = nullptr;
2653   unsigned Reg;
2654   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2655     if (!SIInstrInfo::isMFMA(MI) ||
2656         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2657       return false;
2658     MFMA = &MI;
2659     return true;
2660   };
2661 
2662   const MachineInstr *DOT = nullptr;
2663   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2664     if (!SIInstrInfo::isDOT(MI) ||
2665         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2666       return false;
2667     DOT = &MI;
2668     return true;
2669   };
2670 
2671   bool DGEMMAfterVALUWrite = false;
2672   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2673     // Found DGEMM on reverse traversal to def.
2674     if (isDGEMM(MI.getOpcode()))
2675       DGEMMAfterVALUWrite = true;
2676 
2677     // Only a hazard if the register is defined by a VALU and a DGEMM is found
2678     // after the def.
2679     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2680       return false;
2681 
2682     return true;
2683   };
2684 
2685   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2686                                            AMDGPU::OpName::src2);
2687 
2688   if (IsMemOrExport || IsVALU) {
2689     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2690     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2691     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2692     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2693     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2694     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2695     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2696     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2697     const int DotWriteSameDotReadSrcAB = 3;
2698     const int DotWriteDifferentVALURead = 3;
2699     const int DMFMABetweenVALUWriteVMEMRead = 2;
2700     const int MaxWaitStates = 19;
2701 
2702     for (const MachineOperand &Use : MI->explicit_uses()) {
2703       if (!Use.isReg())
2704         continue;
2705       Reg = Use.getReg();
2706 
2707       DOT = nullptr;
2708       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2709                                                      MaxWaitStates);
2710       if (DOT) {
2711         int NeedWaitStates = 0;
2712         if (DOT->getOpcode() == MI->getOpcode()) {
2713           if (&Use - &MI->getOperand(0) != SrcCIdx)
2714             NeedWaitStates = DotWriteSameDotReadSrcAB;
2715         } else {
2716           NeedWaitStates = DotWriteDifferentVALURead;
2717         }
2718 
2719         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2720         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2721       }
2722 
2723       // Workaround for a HW data hazard bug observed only on GFX90A. When there
2724       // is a DGEMM instruction in between a VALU and a VMEM instruction, it
2725       // causes the SQ to incorrectly omit the two wait states between the two
2726       // instructions that are needed to avoid the data hazard.
2727       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2728         DGEMMAfterVALUWrite = false;
2729         if (TRI.isVectorRegister(MRI, Reg)) {
2730           int WaitStatesNeededForUse =
2731                 DMFMABetweenVALUWriteVMEMRead -
2732                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2733                                       DMFMABetweenVALUWriteVMEMRead);
2734 
2735           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2736         }
2737       }
2738 
2739       MFMA = nullptr;
2740       WaitStatesSinceDef =
2741           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2742       if (!MFMA)
2743         continue;
2744 
2745       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2746       int NumPasses = HazardDefLatency;
2747       int NeedWaitStates = MaxWaitStates;
2748 
2749       if (isDGEMM(MFMA->getOpcode())) {
2750         switch (HazardDefLatency) {
2751         case 4:
2752           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2753                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2754           break;
2755         case 8:
2756         case 16:
2757           NeedWaitStates =
2758               IsMemOrExport
2759                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2760                   : (ST.hasGFX950Insts()
2761                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2762                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2763           break;
2764         default:
2765           llvm_unreachable("unexpected dgemm");
2766         }
2767       } else if (ST.hasGFX940Insts()) {
2768         NeedWaitStates =
2769             isXDL(ST, *MFMA)
2770                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2771                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2772                       NumPasses);
2773       } else {
2774         switch (HazardDefLatency) {
2775         case 2:
2776           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2777           break;
2778         case 8:
2779           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2780           break;
2781         case 16:
2782           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2783           break;
2784         default:
2785           llvm_unreachable("unexpected number of passes for mfma");
2786         }
2787       }
2788 
2789       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2790       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2791 
2792       if (WaitStatesNeeded == MaxWaitStates)
2793         break;
2794     }
2795   }
2796 
2797   unsigned Opc = MI->getOpcode();
2798   const int DMFMAToFMA64WaitStates = 2;
2799   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2800        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2801        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2802       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2803     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2804       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2805     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2806   }
2807 
2808   if (!IsVALU && !IsMemOrExport)
2809     return WaitStatesNeeded;
2810 
2811   for (const MachineOperand &Def : MI->defs()) {
2812     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2813     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2814     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2815     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2816     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2817     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2818     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2819     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2820     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2821     const int DotWriteDifferentVALUWrite = 3;
2822     const int MaxWaitStates = 19;
2823     const int MaxWarWaitStates = 15;
2824 
2825     Reg = Def.getReg();
2826 
2827     DOT = nullptr;
2828     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2829                                                    MaxWaitStates);
2830     if (DOT && DOT->getOpcode() != MI->getOpcode())
2831       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2832                                                     WaitStatesSinceDef);
2833 
2834     MFMA = nullptr;
2835     WaitStatesSinceDef =
2836         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2837     if (MFMA) {
2838       int NeedWaitStates = MaxWaitStates;
2839       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2840 
2841       if (isDGEMM(MFMA->getOpcode())) {
2842         switch (NumPasses) {
2843         case 4:
2844           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2845           break;
2846         case 8:
2847         case 16:
2848           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2849           break;
2850         default:
2851           llvm_unreachable("unexpected number of cycles for dgemm");
2852         }
2853       } else if (ST.hasGFX940Insts()) {
2854         NeedWaitStates =
2855             isXDL(ST, *MFMA)
2856                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2857                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2858       } else {
2859         switch (NumPasses) {
2860         case 2:
2861           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2862           break;
2863         case 8:
2864           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2865           break;
2866         case 16:
2867           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2868           break;
2869         default:
2870           llvm_unreachable("Unexpected number of passes for mfma");
2871         }
2872       }
2873 
2874       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2875       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2876 
2877       if (WaitStatesNeeded == MaxWaitStates)
2878         break;
2879     }
2880 
2881     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2882       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2883           !MI.readsRegister(Reg, &TRI))
2884         return false;
2885 
2886       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2887         return false;
2888 
2889       const MachineOperand *SrcC =
2890           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2891       assert(SrcC);
2892       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2893         return false;
2894 
2895       MFMA = &MI;
2896       return true;
2897     };
2898 
2899     MFMA = nullptr;
2900     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2901                                                 MaxWarWaitStates);
2902     if (!MFMA)
2903       continue;
2904 
2905     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2906     int NeedWaitStates = MaxWaitStates;
2907     switch (HazardDefLatency) {
2908     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2909              break;
2910     case 4:  assert(ST.hasGFX940Insts());
2911              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2912              break;
2913     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2914              break;
2915     case 16: [[fallthrough]];
2916     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2917              break;
2918     }
2919 
2920     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2921     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2922   }
2923 
2924   return WaitStatesNeeded;
2925 }
2926 
2927 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2928   if (!SU->isInstr())
2929     return false;
2930 
2931   const MachineInstr *MAI = nullptr;
2932 
2933   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2934     MAI = nullptr;
2935     if (SIInstrInfo::isMFMA(MI))
2936       MAI = &MI;
2937     return MAI != nullptr;
2938   };
2939 
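  // Prefer scheduling a different SU while a previously issued MFMA may still
  // be executing, i.e. when fewer wait states have elapsed than that MFMA's
  // latency.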
2940   MachineInstr *MI = SU->getInstr();
2941   if (IsMFMAFn(*MI)) {
2942     int W = getWaitStatesSince(IsMFMAFn, 16);
2943     if (MAI)
2944       return W < (int)TSchedModel.computeInstrLatency(MAI);
2945   }
2946 
2947   return false;
2948 }
2949 
2950 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2951 // insertion of a new instruction.
2952 static void updateGetPCBundle(MachineInstr *NewMI) {
2953   if (!NewMI->isBundled())
2954     return;
2955 
2956   // Find start of bundle.
2957   auto I = NewMI->getIterator();
2958   while (I->isBundledWithPred())
2959     I--;
2960   if (I->isBundle())
2961     I++;
2962 
2963   // Bail if this is not an S_GETPC bundle.
2964   if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2965     return;
2966 
2967   // Update offsets of any references in the bundle.
2968   const unsigned NewBytes = 4;
2969   assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2970          "Unexpected instruction insertion in bundle");
2971   auto NextMI = std::next(NewMI->getIterator());
2972   auto End = NewMI->getParent()->end();
2973   while (NextMI != End && NextMI->isBundledWithPred()) {
2974     for (auto &Operand : NextMI->operands()) {
2975       if (Operand.isGlobal())
2976         Operand.setOffset(Operand.getOffset() + NewBytes);
2977     }
2978     NextMI++;
2979   }
2980 }
2981 
2982 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2983   if (!ST.hasVALUMaskWriteHazard())
2984     return false;
2985   assert(!ST.hasExtendedWaitCounts());
2986 
2987   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2988     return false;
2989 
2990   // The hazard sequence is three instructions:
2991   //   1. VALU reads SGPR as mask
2992   //   2. SALU writes SGPR
2993   //   3. SALU reads SGPR
2994   // The hazard can expire if the distance between 2 and 3 is sufficient.
2995   // In practice this expiry happens <10% of the time, so to avoid searching
2996   // we always assume the hazard exists whenever 1 and 2 are present.
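       //
       // A hypothetical wave64 sequence (mnemonics for illustration only):
       //   v_cndmask_b32_e64 v0, v1, v2, s[2:3]   ; 1. VALU reads s[2:3] as mask
       //   s_mov_b64         s[2:3], exec         ; 2. SALU writes s[2:3]
       //   s_cmp_lg_u64      s[2:3], 0            ; 3. SALU reads s[2:3]
       // The fix inserts "s_waitcnt_depctr sa_sdst(0)" immediately after 2.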
2997 
2998   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2999   if (!SDSTOp || !SDSTOp->isReg())
3000     return false;
3001 
3002   const Register HazardReg = SDSTOp->getReg();
3003   if (HazardReg == AMDGPU::EXEC ||
3004       HazardReg == AMDGPU::EXEC_LO ||
3005       HazardReg == AMDGPU::EXEC_HI ||
3006       HazardReg == AMDGPU::M0)
3007     return false;
3008 
3009   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
3010     switch (I.getOpcode()) {
3011     case AMDGPU::V_ADDC_U32_e32:
3012     case AMDGPU::V_ADDC_U32_dpp:
3013     case AMDGPU::V_CNDMASK_B16_fake16_e32:
3014     case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3015     case AMDGPU::V_CNDMASK_B32_e32:
3016     case AMDGPU::V_CNDMASK_B32_dpp:
3017     case AMDGPU::V_DIV_FMAS_F32_e64:
3018     case AMDGPU::V_DIV_FMAS_F64_e64:
3019     case AMDGPU::V_SUBB_U32_e32:
3020     case AMDGPU::V_SUBB_U32_dpp:
3021     case AMDGPU::V_SUBBREV_U32_e32:
3022     case AMDGPU::V_SUBBREV_U32_dpp:
3023       // These implicitly read VCC as mask source.
3024       return HazardReg == AMDGPU::VCC ||
3025              HazardReg == AMDGPU::VCC_LO ||
3026              HazardReg == AMDGPU::VCC_HI;
3027     case AMDGPU::V_ADDC_U32_e64:
3028     case AMDGPU::V_ADDC_U32_e64_dpp:
3029     case AMDGPU::V_CNDMASK_B16_fake16_e64:
3030     case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3031     case AMDGPU::V_CNDMASK_B32_e64:
3032     case AMDGPU::V_CNDMASK_B32_e64_dpp:
3033     case AMDGPU::V_SUBB_U32_e64:
3034     case AMDGPU::V_SUBB_U32_e64_dpp:
3035     case AMDGPU::V_SUBBREV_U32_e64:
3036     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3037       // Only check mask register overlaps.
3038       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3039       assert(SSRCOp);
3040       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3041     }
3042     default:
3043       return false;
3044     }
3045   };
3046 
3047   const MachineRegisterInfo &MRI = MF.getRegInfo();
3048   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3049     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3050     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3051         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3052       return true;
3053 
3054     // VALU access to any SGPR or literal constant other than HazardReg
3055     // mitigates hazard. No need to check HazardReg here as this will
3056     // only be called when !IsHazardFn.
3057     if (!SIInstrInfo::isVALU(I))
3058       return false;
3059     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3060       const MachineOperand &Op = I.getOperand(OpNo);
3061       if (Op.isReg()) {
3062         Register OpReg = Op.getReg();
3063         // Only consider uses
3064         if (!Op.isUse())
3065           continue;
3066         // Ignore EXEC
3067         if (OpReg == AMDGPU::EXEC ||
3068             OpReg == AMDGPU::EXEC_LO ||
3069             OpReg == AMDGPU::EXEC_HI)
3070           continue;
3071         // Ignore all implicit uses except VCC
3072         if (Op.isImplicit()) {
3073           if (OpReg == AMDGPU::VCC ||
3074               OpReg == AMDGPU::VCC_LO ||
3075               OpReg == AMDGPU::VCC_HI)
3076             return true;
3077           continue;
3078         }
3079         if (TRI.isSGPRReg(MRI, OpReg))
3080           return true;
3081       } else {
3082         const MCInstrDesc &InstDesc = I.getDesc();
3083         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3084         if (!TII.isInlineConstant(Op, OpInfo))
3085           return true;
3086       }
3087     }
3088     return false;
3089   };
3090 
3091   // Check for hazard
3092   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3093       std::numeric_limits<int>::max())
3094     return false;
3095 
3096   auto NextMI = std::next(MI->getIterator());
3097 
3098   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3099   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3100                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3101                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3102 
3103   // SALU write may be s_getpc in a bundle.
3104   updateGetPCBundle(NewMI);
3105 
3106   return true;
3107 }
3108 
3109 // Return the numeric ID 0-63 of the 64-bit SGPR pair for a given SGPR,
3110 // e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
3111 static std::optional<unsigned> sgprPairNumber(Register Reg,
3112                                               const SIRegisterInfo &TRI) {
3113   switch (Reg) {
3114   case AMDGPU::M0:
3115   case AMDGPU::EXEC:
3116   case AMDGPU::EXEC_LO:
3117   case AMDGPU::EXEC_HI:
3118   case AMDGPU::SGPR_NULL:
3119   case AMDGPU::SGPR_NULL64:
3120     return {};
3121   default:
3122     break;
3123   }
3124   unsigned RegN = TRI.getEncodingValue(Reg);
3125   if (RegN > 127)
3126     return {};
3127   return (RegN >> 1) & 0x3f;
3128 }
3129 
3130 // For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3131 void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3132   assert(MMF == &MF);
3133 
3134   // Assume non-empty vector means it has already been computed.
3135   if (!VALUReadHazardSGPRs.empty())
3136     return;
3137 
3138   auto CallingConv = MF.getFunction().getCallingConv();
3139   bool IsCallFree =
3140       AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3141 
3142   // Exhaustive search is only viable in functions which neither make calls
3143   // nor are callees, where VALUs will be exposed to the hazard recognizer.
3144   UseVALUReadHazardExhaustiveSearch =
3145       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3146       MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
3147 
3148   // Consider all SGPRs to be hazards if the shader uses function calls or is a callee.
3149   bool UseVALUUseCache =
3150       IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3151   VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3152   if (!UseVALUUseCache)
3153     return;
3154 
3155   // Perform a post-order scan, walking each block's instructions in reverse,
3156   // to find VALUs which read an SGPR before a SALU write to the same SGPR.
3157   // Compared to a linear scan, this reduces hazard insertion when all VALU
3158   // accesses to an SGPR occur after its last SALU write.
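       // In the reverse scan, a VALU read of a pair is only marked as a hazard
       // source if a SALU write and a further read of that pair have already
       // been seen, i.e. both occur later in the function (or the block is in
       // a cycle, which is handled conservatively below).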
3159   const MachineRegisterInfo &MRI = MF.getRegInfo();
3160   BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3161   MachineCycleInfo CI;
3162   CI.compute(*MMF);
3163 
3164   for (auto *MBB : post_order(&MF)) {
3165     bool InCycle = CI.getCycle(MBB) != nullptr;
3166     for (auto &MI : reverse(MBB->instrs())) {
3167       bool IsVALU = SIInstrInfo::isVALU(MI);
3168       bool IsSALU = SIInstrInfo::isSALU(MI);
3169       if (!IsVALU && !IsSALU)
3170         continue;
3171 
3172       for (const MachineOperand &Op : MI.operands()) {
3173         if (!Op.isReg())
3174           continue;
3175         Register Reg = Op.getReg();
3176         assert(!Op.getSubReg());
3177         // Only consider implicit operands of VCC.
3178         if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3179                                  Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3180           continue;
3181         if (!TRI.isSGPRReg(MRI, Reg))
3182           continue;
3183         auto RegN = sgprPairNumber(Reg, TRI);
3184         if (!RegN)
3185           continue;
3186         if (IsVALU && Op.isUse()) {
3187           // Note: any access within a cycle must be considered a hazard.
3188           if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3189             VALUReadHazardSGPRs.set(*RegN);
3190           ReadSGPRs.set(*RegN);
3191         } else if (IsSALU) {
3192           if (Op.isDef())
3193             SALUWriteSGPRs.set(*RegN);
3194           else
3195             ReadSGPRs.set(*RegN);
3196         }
3197       }
3198     }
3199   }
3200 }
3201 
3202 bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3203   if (!ST.hasVALUReadSGPRHazard())
3204     return false;
3205 
3206   // The hazard sequence is fundamentally three instructions:
3207   //   1. VALU reads SGPR
3208   //   2. SALU writes SGPR
3209   //   3. VALU/SALU reads SGPR
3210   // Try to avoid searching for (1) because the expiry point of the hazard is
3211   // indeterminate; however, the hazard between (2) and (3) can expire if the
3212   // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3213   // Note: SGPRs must be considered as 64-bit pairs, since the hazard exists
3214   // even if only individual 32-bit SGPRs of a pair are accessed.
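       //
       // A hypothetical sequence (mnemonics for illustration only):
       //   v_add_f32 v0, s4, v1   ; (1) VALU reads s4, i.e. pair s[4:5]
       //   s_mov_b32 s5, 0        ; (2) SALU writes the other half of the pair
       //   s_add_u32 s6, s5, 1    ; (3) SALU reads the pair again
       // Unless enough unrelated SALUs separate (2) and (3), the fix inserts
       // "s_wait_alu sa_sdst(0)" (S_WAITCNT_DEPCTR) before (3).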
3215 
3216   bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3217   bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3218   if (!(MIIsSALU || MIIsVALU))
3219     return false;
3220 
3221   // Avoid the expensive search when compile time is the priority by
3222   // mitigating every SALU which writes an SGPR.
3223   if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3224     if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3225       return false;
3226 
3227     const MachineOperand *SDSTOp =
3228         TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3229     if (!SDSTOp || !SDSTOp->isReg())
3230       return false;
3231 
3232     const Register HazardReg = SDSTOp->getReg();
3233     if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3234         HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3235       return false;
3236 
3237     // Add s_wait_alu sa_sdst(0) after SALU write.
3238     auto NextMI = std::next(MI->getIterator());
3239     auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3240                          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3241                      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3242 
3243     // SALU write may be s_getpc in a bundle.
3244     updateGetPCBundle(NewMI);
3245 
3246     return true;
3247   }
3248 
3249   // Pre-compute set of SGPR pairs read by VALUs.
3250   // Note: pass mutable pointer to MachineFunction for CycleInfo.
3251   computeVALUHazardSGPRs(MI->getMF());
3252 
3253   // If there are no VALU-read hazard SGPRs then there is nothing to do.
3254   if (VALUReadHazardSGPRs.none())
3255     return false;
3256 
3257   // All SGPR writes before a call/return must be flushed as the callee/caller
3258   // will not see the hazard chain, i.e. (2) to (3) described above.
3259   const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3260                        !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3261                          MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3262 
3263   // Collect all SGPR sources for MI which are read by a VALU.
3264   const MachineRegisterInfo &MRI = MF.getRegInfo();
3265   SmallSet<Register, 4> SGPRsUsed;
3266 
3267   if (!IsSetPC) {
3268     for (const MachineOperand &Op : MI->all_uses()) {
3269       Register OpReg = Op.getReg();
3270 
3271       // Only consider VCC implicit uses on VALUs.
3272       // The only expected SALU implicit access is SCC, which is not a hazard.
3273       if (MIIsSALU && Op.isImplicit())
3274         continue;
3275 
3276       if (!TRI.isSGPRReg(MRI, OpReg))
3277         continue;
3278 
3279       auto RegN = sgprPairNumber(OpReg, TRI);
3280       if (!RegN)
3281         continue;
3282 
3283       if (!VALUReadHazardSGPRs[*RegN])
3284         continue;
3285 
3286       SGPRsUsed.insert(OpReg);
3287     }
3288 
3289     // No SGPRs -> nothing to do.
3290     if (SGPRsUsed.empty())
3291       return false;
3292   }
3293 
3294   // A hazard is any SALU which writes one of the SGPRs read by MI.
3295   auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3296     if (!SIInstrInfo::isSALU(I))
3297       return false;
3298     // Ensure SGPR flush before call/return by conservatively assuming every
3299     // SALU writes an SGPR.
3300     if (IsSetPC && I.getNumDefs() > 0)
3301       return true;
3302     // Check for any register writes.
3303     return any_of(SGPRsUsed, [this, &I](Register Reg) {
3304       return I.modifiesRegister(Reg, &TRI);
3305     });
3306   };
3307 
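       // The hazard expires once enough unrelated SALUs separate (2) and (3);
       // the required count differs slightly for SALU and VALU consumers.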
3308   const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3309   auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3310     if (Count >= SALUExpiryCount)
3311       return true;
3312     // s_wait_alu sa_sdst(0) on path mitigates hazard.
3313     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3314         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3315       return true;
3316     return false;
3317   };
3318 
3319   auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3320     // Only count true SALUs as wait states.
3321     if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3322       return 0;
3323     // SALU must be unrelated to any hazard registers.
3324     if (any_of(SGPRsUsed,
3325                [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3326       return 0;
3327     return 1;
3328   };
3329 
3330   // Check for the hazard.
3331   DenseSet<const MachineBasicBlock *> Visited;
3332   int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3333                                         std::next(MI->getReverseIterator()), 0,
3334                                         IsExpiredFn, Visited, WaitStatesFn);
3335 
3336   if (WaitStates >= SALUExpiryCount)
3337     return false;
3338 
3339   // Validate hazard through an exhaustive search.
3340   if (UseVALUReadHazardExhaustiveSearch) {
3341     // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3342     // This is searching for (1) in the hazard description.
3343     auto hazardPair = [this](Register Reg) {
3344       if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3345         return Register(AMDGPU::VCC);
3346       auto RegN = sgprPairNumber(Reg, TRI);
3347       return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3348     };
3349     auto SearchHazardFn = [this, hazardPair,
3350                            &SGPRsUsed](const MachineInstr &I) {
3351       if (!SIInstrInfo::isVALU(I))
3352         return false;
3353       // Check for any register reads.
3354       return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3355         return I.readsRegister(hazardPair(Reg), &TRI);
3356       });
3357     };
3358     auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3359       return false;
3360     };
3361     if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3362         std::numeric_limits<int>::max())
3363       return false;
3364   }
3365 
3366   // Add s_wait_alu sa_sdst(0) before SALU read.
3367   auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3368                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3369                    .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3370 
3371   // SALU read may be after s_getpc in a bundle.
3372   updateGetPCBundle(NewMI);
3373 
3374   return true;
3375 }
3376 
3377 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3378                                const SIInstrInfo &TII) {
3379   MachineBasicBlock &EntryMBB = MF->front();
3380   if (EntryMBB.begin() != EntryMBB.end()) {
3381     auto &EntryMI = *EntryMBB.begin();
3382     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3383         EntryMI.getOperand(0).getImm() >= Priority)
3384       return false;
3385   }
3386 
3387   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3388       .addImm(Priority);
3389   return true;
3390 }
3391 
3392 bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3393   if (!ST.hasRequiredExportPriority())
3394     return false;
3395 
3396   // Assume the following shader types will never have exports,
3397   // and avoid adding or adjusting S_SETPRIO.
3398   MachineBasicBlock *MBB = MI->getParent();
3399   MachineFunction *MF = MBB->getParent();
3400   auto CC = MF->getFunction().getCallingConv();
3401   switch (CC) {
3402   case CallingConv::AMDGPU_CS:
3403   case CallingConv::AMDGPU_CS_Chain:
3404   case CallingConv::AMDGPU_CS_ChainPreserve:
3405   case CallingConv::AMDGPU_KERNEL:
3406     return false;
3407   default:
3408     break;
3409   }
3410 
3411   const int MaxPriority = 3;
3412   const int NormalPriority = 2;
3413   const int PostExportPriority = 0;
3414 
3415   auto It = MI->getIterator();
3416   switch (MI->getOpcode()) {
3417   case AMDGPU::S_ENDPGM:
3418   case AMDGPU::S_ENDPGM_SAVED:
3419   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3420   case AMDGPU::SI_RETURN_TO_EPILOG:
3421     // Ensure a shader with calls raises priority at entry so that the
3422     // priority is correct if exports exist in a callee.
3423     if (MF->getFrameInfo().hasCalls())
3424       return ensureEntrySetPrio(MF, NormalPriority, TII);
3425     return false;
3426   case AMDGPU::S_SETPRIO: {
3427     // Raise minimum priority unless in workaround.
3428     auto &PrioOp = MI->getOperand(0);
3429     int Prio = PrioOp.getImm();
3430     bool InWA = (Prio == PostExportPriority) &&
3431                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3432     if (InWA || Prio >= NormalPriority)
3433       return false;
3434     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3435     return true;
3436   }
3437   default:
3438     if (!TII.isEXP(*MI))
3439       return false;
3440     break;
3441   }
3442 
3443   // Check entry priority at each export (as there will only be a few).
3444   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3445   bool Changed = false;
3446   if (CC != CallingConv::AMDGPU_Gfx)
3447     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3448 
3449   auto NextMI = std::next(It);
3450   bool EndOfShader = false;
3451   if (NextMI != MBB->end()) {
3452     // Only need WA at end of sequence of exports.
3453     if (TII.isEXP(*NextMI))
3454       return Changed;
3455     // Assume appropriate S_SETPRIO after export means WA already applied.
3456     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3457         NextMI->getOperand(0).getImm() == PostExportPriority)
3458       return Changed;
3459     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3460   }
3461 
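       // The workaround emitted below is, in sketch form:
       //   s_setprio 0                  ; lower priority after the export(s)
       //   s_waitcnt_expcnt null, 0x0   ; omitted at end of shader
       //   s_nop 0
       //   s_nop 0
       //   s_setprio 2                  ; restore priority, omitted at end of shader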
3462   const DebugLoc &DL = MI->getDebugLoc();
3463 
3464   // Lower priority.
3465   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3466       .addImm(PostExportPriority);
3467 
3468   if (!EndOfShader) {
3469     // Wait for exports to complete.
3470     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3471         .addReg(AMDGPU::SGPR_NULL)
3472         .addImm(0);
3473   }
3474 
3475   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3476   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3477 
3478   if (!EndOfShader) {
3479     // Return to normal (higher) priority.
3480     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3481         .addImm(NormalPriority);
3482   }
3483 
3484   return true;
3485 }
3486