1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/ScheduleDAG.h"
19 #include "llvm/Support/TargetParser.h"
20 
21 using namespace llvm;
22 
23 namespace {
24 
25 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27 
28   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29     if (Arg.getAsInteger(0, Value))
30       return O.error("'" + Arg + "' value invalid for uint argument!");
31 
32     if (Value > 100)
33       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34 
35     return false;
36   }
37 };
38 
39 } // end anonymous namespace
40 
41 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43                      cl::desc("Fill a percentage of the latency between "
44                               "neighboring MFMA with s_nops."));
45 
46 //===----------------------------------------------------------------------===//
47 // Hazard Recognizer Implementation
48 //===----------------------------------------------------------------------===//
49 
50 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51                                                  const GCNSubtarget &ST);
52 
53 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54   IsHazardRecognizerMode(false),
55   CurrCycleInstr(nullptr),
56   MF(MF),
57   ST(MF.getSubtarget<GCNSubtarget>()),
58   TII(*ST.getInstrInfo()),
59   TRI(TII.getRegisterInfo()),
60   ClauseUses(TRI.getNumRegUnits()),
61   ClauseDefs(TRI.getNumRegUnits()) {
62   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63   TSchedModel.init(&ST);
64   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65 }
66 
67 void GCNHazardRecognizer::Reset() {
68   EmittedInstrs.clear();
69 }
70 
71 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72   EmitInstruction(SU->getInstr());
73 }
74 
75 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76   CurrCycleInstr = MI;
77 }
78 
79 static bool isDivFMas(unsigned Opcode) {
80   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81 }
82 
83 static bool isSGetReg(unsigned Opcode) {
84   return Opcode == AMDGPU::S_GETREG_B32;
85 }
86 
87 static bool isSSetReg(unsigned Opcode) {
88   switch (Opcode) {
89   case AMDGPU::S_SETREG_B32:
90   case AMDGPU::S_SETREG_B32_mode:
91   case AMDGPU::S_SETREG_IMM32_B32:
92   case AMDGPU::S_SETREG_IMM32_B32_mode:
93     return true;
94   }
95   return false;
96 }
97 
98 static bool isRWLane(unsigned Opcode) {
99   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100 }
101 
102 static bool isRFE(unsigned Opcode) {
103   return Opcode == AMDGPU::S_RFE_B64;
104 }
105 
106 static bool isSMovRel(unsigned Opcode) {
107   switch (Opcode) {
108   case AMDGPU::S_MOVRELS_B32:
109   case AMDGPU::S_MOVRELS_B64:
110   case AMDGPU::S_MOVRELD_B32:
111   case AMDGPU::S_MOVRELD_B64:
112     return true;
113   default:
114     return false;
115   }
116 }
117 
118 static bool isDGEMM(unsigned Opcode) {
119   return AMDGPU::getMAIIsDGEMM(Opcode);
120 }
121 
122 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123   unsigned Opcode = MI.getOpcode();
124 
125   if (!SIInstrInfo::isMAI(MI) ||
126       isDGEMM(Opcode) ||
127       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129     return false;
130 
131   if (!ST.hasGFX940Insts())
132     return true;
133 
134   return AMDGPU::getMAIIsGFX940XDL(Opcode);
135 }
136 
137 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138                                     const MachineInstr &MI) {
139   if (TII.isAlwaysGDS(MI.getOpcode()))
140     return true;
141 
142   switch (MI.getOpcode()) {
143   case AMDGPU::S_SENDMSG:
144   case AMDGPU::S_SENDMSGHALT:
145   case AMDGPU::S_TTRACEDATA:
146     return true;
147   // These DS opcodes don't support GDS.
148   case AMDGPU::DS_NOP:
149   case AMDGPU::DS_PERMUTE_B32:
150   case AMDGPU::DS_BPERMUTE_B32:
151     return false;
152   default:
153     if (TII.isDS(MI.getOpcode())) {
154       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155                                            AMDGPU::OpName::gds);
156       if (MI.getOperand(GDS).getImm())
157         return true;
158     }
159     return false;
160   }
161 }
162 
163 static bool isPermlane(const MachineInstr &MI) {
164   unsigned Opcode = MI.getOpcode();
165   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
167 }
168 
169 static bool isLdsDma(const MachineInstr &MI) {
170   return SIInstrInfo::isVALU(MI) &&
171          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
172 }
173 
174 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
175   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
176                                                      AMDGPU::OpName::simm16);
177   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
178 }
179 
180 ScheduleHazardRecognizer::HazardType
181 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
182   MachineInstr *MI = SU->getInstr();
183   // If we are not in "HazardRecognizerMode" and are therefore being run from
184   // the scheduler, track possible stalls from hazards but don't insert noops.
185   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
186 
187   if (MI->isBundle())
188    return NoHazard;
189 
190   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
191     return HazardType;
192 
193   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
194     return HazardType;
195 
196   if (checkFPAtomicToDenormModeHazard(MI) > 0)
197     return HazardType;
198 
199   if (ST.hasNoDataDepHazard())
200     return NoHazard;
201 
202   // FIXME: Should flat be considered vmem?
203   if ((SIInstrInfo::isVMEM(*MI) ||
204        SIInstrInfo::isFLAT(*MI))
205       && checkVMEMHazards(MI) > 0)
206     return HazardType;
207 
208   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
209     return HazardType;
210 
211   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
212     return HazardType;
213 
214   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
215     return HazardType;
216 
217   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
218     return HazardType;
219 
220   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
221        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
222        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
223     return HazardType;
224 
225   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
226     return HazardType;
227 
228   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
229     return HazardType;
230 
231   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
232     return HazardType;
233 
234   if (((ST.hasReadM0MovRelInterpHazard() &&
235         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
236        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
237        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
238        (ST.hasReadM0LdsDirectHazard() &&
239         MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
240       checkReadM0Hazards(MI) > 0)
241     return HazardType;
242 
243   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
244     return HazardType;
245 
246   if ((SIInstrInfo::isVMEM(*MI) ||
247        SIInstrInfo::isFLAT(*MI) ||
248        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
249     return HazardType;
250 
251   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
252     return HazardType;
253 
254   return NoHazard;
255 }
256 
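// Insert S_NOP instructions before \p MI covering \p Quantity wait states,
// splitting into multiple nops since a single S_NOP covers at most 8.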
257 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
258                                 unsigned Quantity) {
259   while (Quantity > 0) {
260     unsigned Arg = std::min(Quantity, 8u);
261     Quantity -= Arg;
262     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
263         .addImm(Arg - 1);
264   }
265 }
266 
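// Returns the pipeline wait states for an MFMA, taken from the cycle count of
// the first write resource of its resolved scheduling class.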
267 unsigned
268 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
269   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
270   assert(TSchedModel.getWriteProcResBegin(SC) !=
271          TSchedModel.getWriteProcResEnd(SC));
272   return TSchedModel.getWriteProcResBegin(SC)->Cycles;
273 }
274 
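// Check each instruction inside the current BUNDLE for hazards, insert any
// required noops when running in hazard recognizer mode, and record the
// instructions (plus wait-state placeholders) in EmittedInstrs.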
275 void GCNHazardRecognizer::processBundle() {
276   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
277   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
278   // Check bundled MachineInstr's for hazards.
279   for (; MI != E && MI->isInsideBundle(); ++MI) {
280     CurrCycleInstr = &*MI;
281     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
282 
283     if (IsHazardRecognizerMode) {
284       fixHazards(CurrCycleInstr);
285 
286       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
287     }
288 
289     // It's unnecessary to track more than MaxLookAhead instructions. Since we
290     // include the bundled MI directly after, only add a maximum of
291     // (MaxLookAhead - 1) noops to EmittedInstrs.
292     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
293       EmittedInstrs.push_front(nullptr);
294 
295     EmittedInstrs.push_front(CurrCycleInstr);
296     EmittedInstrs.resize(MaxLookAhead);
297   }
298   CurrCycleInstr = nullptr;
299 }
300 
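// Compute the number of noops required before \p MI and apply the instruction
// rewriting hazard fixups; runs with hazard recognizer mode enabled.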
301 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
302   IsHazardRecognizerMode = true;
303   CurrCycleInstr = MI;
304   unsigned W = PreEmitNoopsCommon(MI);
305   fixHazards(MI);
306   CurrCycleInstr = nullptr;
307   return W;
308 }
309 
310 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
311   if (MI->isBundle())
312     return 0;
313 
314   int WaitStates = 0;
315 
316   if (SIInstrInfo::isSMRD(*MI))
317     return std::max(WaitStates, checkSMRDHazards(MI));
318 
319   if (ST.hasNSAtoVMEMBug())
320     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
321 
322   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
323 
324   if (ST.hasNoDataDepHazard())
325     return WaitStates;
326 
327   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
328     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
329 
330   if (SIInstrInfo::isVALU(*MI))
331     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
332 
333   if (SIInstrInfo::isDPP(*MI))
334     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
335 
336   if (isDivFMas(MI->getOpcode()))
337     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
338 
339   if (isRWLane(MI->getOpcode()))
340     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
341 
342   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
343        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
344        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
345     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
346 
347   if (MI->isInlineAsm())
348     return std::max(WaitStates, checkInlineAsmHazards(MI));
349 
350   if (isSGetReg(MI->getOpcode()))
351     return std::max(WaitStates, checkGetRegHazards(MI));
352 
353   if (isSSetReg(MI->getOpcode()))
354     return std::max(WaitStates, checkSetRegHazards(MI));
355 
356   if (isRFE(MI->getOpcode()))
357     return std::max(WaitStates, checkRFEHazards(MI));
358 
359   if ((ST.hasReadM0MovRelInterpHazard() &&
360        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
361       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
362       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
363       (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
364     return std::max(WaitStates, checkReadM0Hazards(MI));
365 
366   if (SIInstrInfo::isMAI(*MI))
367     return std::max(WaitStates, checkMAIHazards(MI));
368 
369   if (SIInstrInfo::isVMEM(*MI) ||
370       SIInstrInfo::isFLAT(*MI) ||
371       SIInstrInfo::isDS(*MI))
372     return std::max(WaitStates, checkMAILdStHazards(MI));
373 
374   return WaitStates;
375 }
376 
377 void GCNHazardRecognizer::EmitNoop() {
378   EmittedInstrs.push_front(nullptr);
379 }
380 
381 void GCNHazardRecognizer::AdvanceCycle() {
382   // When the scheduler detects a stall, it will call AdvanceCycle() without
383   // emitting any instructions.
384   if (!CurrCycleInstr) {
385     EmittedInstrs.push_front(nullptr);
386     return;
387   }
388 
389   if (CurrCycleInstr->isBundle()) {
390     processBundle();
391     return;
392   }
393 
394   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
395   if (!NumWaitStates) {
396     CurrCycleInstr = nullptr;
397     return;
398   }
399 
400   // Keep track of emitted instructions
401   EmittedInstrs.push_front(CurrCycleInstr);
402 
403   // Add a nullptr for each additional wait state after the first.  Make sure
404   // not to add more than getMaxLookAhead() items to the list, since we
405   // truncate the list to that size right after this loop.
406   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
407        i < e; ++i) {
408     EmittedInstrs.push_front(nullptr);
409   }
410 
411   // getMaxLookAhead() is the largest number of wait states we will ever need
412   // to insert, so there is no point in keeping track of more than that many
413   // wait states.
414   EmittedInstrs.resize(getMaxLookAhead());
415 
416   CurrCycleInstr = nullptr;
417 }
418 
419 void GCNHazardRecognizer::RecedeCycle() {
420   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
421 }
422 
423 //===----------------------------------------------------------------------===//
424 // Helper Functions
425 //===----------------------------------------------------------------------===//
426 
427 typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
428 
429 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
430 
431 // Search for a hazard in a block and its predecessors.
432 template <typename StateT>
433 static bool
434 hasHazard(StateT State,
435           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
436           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
437           const MachineBasicBlock *MBB,
438           MachineBasicBlock::const_reverse_instr_iterator I,
439           DenseSet<const MachineBasicBlock *> &Visited) {
440   for (auto E = MBB->instr_rend(); I != E; ++I) {
441     // No need to look at parent BUNDLE instructions.
442     if (I->isBundle())
443       continue;
444 
445     switch (IsHazard(State, *I)) {
446     case HazardFound:
447       return true;
448     case HazardExpired:
449       return false;
450     default:
451       // Continue search
452       break;
453     }
454 
455     if (I->isInlineAsm() || I->isMetaInstruction())
456       continue;
457 
458     UpdateState(State, *I);
459   }
460 
461   for (MachineBasicBlock *Pred : MBB->predecessors()) {
462     if (!Visited.insert(Pred).second)
463       continue;
464 
465     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
466                   Visited))
467       return true;
468   }
469 
470   return false;
471 }
472 
473 // Returns the minimum number of wait states since \p I, walking all predecessors.
474 // Only scans until \p IsExpired returns true.
475 // Can only be run in hazard recognizer mode.
476 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
477                               const MachineBasicBlock *MBB,
478                               MachineBasicBlock::const_reverse_instr_iterator I,
479                               int WaitStates, IsExpiredFn IsExpired,
480                               DenseSet<const MachineBasicBlock *> &Visited) {
481   for (auto E = MBB->instr_rend(); I != E; ++I) {
482     // Don't add WaitStates for parent BUNDLE instructions.
483     if (I->isBundle())
484       continue;
485 
486     if (IsHazard(*I))
487       return WaitStates;
488 
489     if (I->isInlineAsm())
490       continue;
491 
492     WaitStates += SIInstrInfo::getNumWaitStates(*I);
493 
494     if (IsExpired(*I, WaitStates))
495       return std::numeric_limits<int>::max();
496   }
497 
498   int MinWaitStates = std::numeric_limits<int>::max();
499   for (MachineBasicBlock *Pred : MBB->predecessors()) {
500     if (!Visited.insert(Pred).second)
501       continue;
502 
503     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
504                                WaitStates, IsExpired, Visited);
505 
506     MinWaitStates = std::min(MinWaitStates, W);
507   }
508 
509   return MinWaitStates;
510 }
511 
512 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
513                               const MachineInstr *MI, IsExpiredFn IsExpired) {
514   DenseSet<const MachineBasicBlock *> Visited;
515   return getWaitStatesSince(IsHazard, MI->getParent(),
516                             std::next(MI->getReverseIterator()),
517                             0, IsExpired, Visited);
518 }
519 
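// Returns the number of wait states since the most recent instruction matching
// \p IsHazard, looking back at most \p Limit wait states. In hazard recognizer
// mode this walks the CFG; otherwise it scans the EmittedInstrs queue.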
520 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
521   if (IsHazardRecognizerMode) {
522     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
523       return WaitStates >= Limit;
524     };
525     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
526   }
527 
528   int WaitStates = 0;
529   for (MachineInstr *MI : EmittedInstrs) {
530     if (MI) {
531       if (IsHazard(*MI))
532         return WaitStates;
533 
534       if (MI->isInlineAsm())
535         continue;
536     }
537     ++WaitStates;
538 
539     if (WaitStates >= Limit)
540       break;
541   }
542   return std::numeric_limits<int>::max();
543 }
544 
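// Returns the number of wait states since \p Reg was last modified by an
// instruction matching \p IsHazardDef, looking back at most \p Limit states.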
545 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
546                                                IsHazardFn IsHazardDef,
547                                                int Limit) {
548   const SIRegisterInfo *TRI = ST.getRegisterInfo();
549 
550   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
551     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
552   };
553 
554   return getWaitStatesSince(IsHazardFn, Limit);
555 }
556 
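// Returns the number of wait states since the most recent s_setreg matching
// \p IsHazard, looking back at most \p Limit wait states.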
557 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
558                                                   int Limit) {
559   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
560     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
561   };
562 
563   return getWaitStatesSince(IsHazardFn, Limit);
564 }
565 
566 //===----------------------------------------------------------------------===//
567 // No-op Hazard Detection
568 //===----------------------------------------------------------------------===//
569 
570 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
571                         MCRegister Reg) {
572   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
573     BV.set(*RUI);
574 }
575 
576 static void addRegsToSet(const SIRegisterInfo &TRI,
577                          iterator_range<MachineInstr::const_mop_iterator> Ops,
578                          BitVector &Set) {
579   for (const MachineOperand &Op : Ops) {
580     if (Op.isReg())
581       addRegUnits(TRI, Set, Op.getReg().asMCReg());
582   }
583 }
584 
585 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
586   // XXX: Do we need to worry about implicit operands?
587   addRegsToSet(TRI, MI.defs(), ClauseDefs);
588   addRegsToSet(TRI, MI.uses(), ClauseUses);
589 }
590 
591 static bool breaksSMEMSoftClause(MachineInstr *MI) {
592   return !SIInstrInfo::isSMRD(*MI);
593 }
594 
595 static bool breaksVMEMSoftClause(MachineInstr *MI) {
596   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
597 }
598 
599 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
600   // SMEM soft clauses are only present on VI+, and only matter if XNACK is
601   // enabled.
602   if (!ST.isXNACKEnabled())
603     return 0;
604 
605   bool IsSMRD = TII.isSMRD(*MEM);
606 
607   resetClause();
608 
609   // A soft-clause is any group of consecutive SMEM instructions.  The
610   // instructions in this group may return out of order and/or may be
611   // replayed (i.e. the same instruction issued more than once).
612   //
613   // In order to handle these situations correctly we need to make sure that
614   // when a clause has more than one instruction, no instruction in the clause
615   // writes to a register that is read by another instruction in the clause
616   // (including itself). If we encounter this situation, we need to break the
617   // clause by inserting a non-SMEM instruction.
618 
619   for (MachineInstr *MI : EmittedInstrs) {
620     // When we hit a non-SMEM instruction then we have passed the start of the
621     // clause and we can stop.
622     if (!MI)
623       break;
624 
625     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
626       break;
627 
628     addClauseInst(*MI);
629   }
630 
631   if (ClauseDefs.none())
632     return 0;
633 
634   // We need to make sure not to put loads and stores in the same clause if they
635   // use the same address. For now, just start a new clause whenever we see a
636   // store.
637   if (MEM->mayStore())
638     return 1;
639 
640   addClauseInst(*MEM);
641 
642   // If the set of defs and uses intersect then we cannot add this instruction
643   // to the clause, so we have a hazard.
644   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
645 }
646 
647 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
648   int WaitStatesNeeded = 0;
649 
650   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
651 
652   // This SMRD hazard only affects SI.
653   if (!ST.hasSMRDReadVALUDefHazard())
654     return WaitStatesNeeded;
655 
656   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
657   // SGPR was written by a VALU instruction.
658   int SmrdSgprWaitStates = 4;
659   auto IsHazardDefFn = [this](const MachineInstr &MI) {
660     return TII.isVALU(MI);
661   };
662   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
663     return TII.isSALU(MI);
664   };
665 
666   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
667 
668   for (const MachineOperand &Use : SMRD->uses()) {
669     if (!Use.isReg())
670       continue;
671     int WaitStatesNeededForUse =
672         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
673                                                    SmrdSgprWaitStates);
674     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
675 
676     // This fixes what appears to be undocumented hardware behavior in SI where
677     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
678     // need some number of nops in between. We don't know how many we need, but
679     // let's use 4. This wasn't discovered before probably because the only
680     // case when this happens is when we expand a 64-bit pointer into a full
681     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
682     // probably never encountered in closed-source land.
683     if (IsBufferSMRD) {
684       int WaitStatesNeededForUse =
685         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
686                                                    IsBufferHazardDefFn,
687                                                    SmrdSgprWaitStates);
688       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
689     }
690   }
691 
692   return WaitStatesNeeded;
693 }
694 
695 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
696   if (!ST.hasVMEMReadSGPRVALUDefHazard())
697     return 0;
698 
699   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
700 
701   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
702   // SGPR was written by a VALU instruction.
703   const int VmemSgprWaitStates = 5;
704   auto IsHazardDefFn = [this](const MachineInstr &MI) {
705     return TII.isVALU(MI);
706   };
707   for (const MachineOperand &Use : VMEM->uses()) {
708     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
709       continue;
710 
711     int WaitStatesNeededForUse =
712         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
713                                                    VmemSgprWaitStates);
714     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
715   }
716   return WaitStatesNeeded;
717 }
718 
719 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
720   const SIRegisterInfo *TRI = ST.getRegisterInfo();
721   const SIInstrInfo *TII = ST.getInstrInfo();
722 
723   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
724   int DppVgprWaitStates = 2;
725   int DppExecWaitStates = 5;
726   int WaitStatesNeeded = 0;
727   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
728     return TII->isVALU(MI);
729   };
730 
731   for (const MachineOperand &Use : DPP->uses()) {
732     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
733       continue;
734     int WaitStatesNeededForUse =
735         DppVgprWaitStates - getWaitStatesSinceDef(
736                                 Use.getReg(),
737                                 [](const MachineInstr &) { return true; },
738                                 DppVgprWaitStates);
739     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
740   }
741 
742   WaitStatesNeeded = std::max(
743       WaitStatesNeeded,
744       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
745                                                 DppExecWaitStates));
746 
747   return WaitStatesNeeded;
748 }
749 
750 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
751   const SIInstrInfo *TII = ST.getInstrInfo();
752 
753   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
754   // instruction.
755   const int DivFMasWaitStates = 4;
756   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
757     return TII->isVALU(MI);
758   };
759   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
760                                                DivFMasWaitStates);
761 
762   return DivFMasWaitStates - WaitStatesNeeded;
763 }
764 
765 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
766   const SIInstrInfo *TII = ST.getInstrInfo();
767   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
768 
769   const int GetRegWaitStates = 2;
770   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
771     return GetRegHWReg == getHWReg(TII, MI);
772   };
773   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
774 
775   return GetRegWaitStates - WaitStatesNeeded;
776 }
777 
778 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
779   const SIInstrInfo *TII = ST.getInstrInfo();
780   unsigned HWReg = getHWReg(TII, *SetRegInstr);
781 
782   const int SetRegWaitStates = ST.getSetRegWaitStates();
783   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
784     return HWReg == getHWReg(TII, MI);
785   };
786   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
787   return SetRegWaitStates - WaitStatesNeeded;
788 }
789 
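// Returns the index of the store-data operand if \p MI is a VMEM store that
// can create the VALU write hazard (store data wider than 64 bits), or -1 if
// no hazard is possible.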
790 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
791   if (!MI.mayStore())
792     return -1;
793 
794   const SIInstrInfo *TII = ST.getInstrInfo();
795   unsigned Opcode = MI.getOpcode();
796   const MCInstrDesc &Desc = MI.getDesc();
797 
798   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
799   int VDataRCID = -1;
800   if (VDataIdx != -1)
801     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
802 
803   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
804     // There is no hazard if the instruction does not use vector regs
805     // (like wbinvl1)
806     if (VDataIdx == -1)
807       return -1;
808     // For MUBUF/MTBUF instructions this hazard only exists if the
809     // instruction is not using a register in the soffset field.
810     const MachineOperand *SOffset =
811         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
812     // If we have no soffset operand, then assume this field has been
813     // hardcoded to zero.
814     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
815         (!SOffset || !SOffset->isReg()))
816       return VDataIdx;
817   }
818 
819   // MIMG instructions create a hazard if they don't use a 256-bit T# and
820   // the store size is greater than 8 bytes and they have more than two bits
821   // of their dmask set.
822   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
823   if (TII->isMIMG(MI)) {
824     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
825     assert(SRsrcIdx != -1 &&
826            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
827     (void)SRsrcIdx;
828   }
829 
830   if (TII->isFLAT(MI)) {
831     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
832     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
833       return DataIdx;
834   }
835 
836   return -1;
837 }
838 
839 int
840 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
841                                             const MachineRegisterInfo &MRI) {
842   // Helper to check for the hazard where VMEM instructions that store more than
843   // 8 bytes can have their store data overwritten by the next instruction.
844   const SIRegisterInfo *TRI = ST.getRegisterInfo();
845 
846   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
847   int WaitStatesNeeded = 0;
848 
849   if (!TRI->isVectorRegister(MRI, Def.getReg()))
850     return WaitStatesNeeded;
851   Register Reg = Def.getReg();
852   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
853     int DataIdx = createsVALUHazard(MI);
854     return DataIdx >= 0 &&
855            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
856   };
857   int WaitStatesNeededForDef =
858     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
859   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
860 
861   return WaitStatesNeeded;
862 }
863 
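// Check hazards on a VALU instruction: results forwarded from TRANS ops,
// results written with a non-dword dst_sel/op_sel, VALU writes to
// SGPR/VCC/EXEC consumed here, and the 12-dword VMEM store data hazard.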
864 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
865   int WaitStatesNeeded = 0;
866 
867   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
868     const int TransDefWaitstates = 1;
869 
870     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
871       if (!SIInstrInfo::isTRANS(MI))
872         return false;
873       const SIRegisterInfo *TRI = ST.getRegisterInfo();
874       const SIInstrInfo *TII = ST.getInstrInfo();
875       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
876 
877       for (const MachineOperand &Use : VALU->explicit_uses()) {
878         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
879           return true;
880       }
881 
882       return false;
883     };
884 
885     int WaitStatesNeededForDef =
886         TransDefWaitstates -
887         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
888     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
889   }
890 
891   if (ST.hasDstSelForwardingHazard()) {
892     const int Shift16DefWaitstates = 1;
893 
894     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
895       if (!SIInstrInfo::isVALU(MI))
896         return false;
897       const SIInstrInfo *TII = ST.getInstrInfo();
898       if (SIInstrInfo::isSDWA(MI)) {
899         if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
900           if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
901             return false;
902       } else {
903         if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
904                                         AMDGPU::OpName::op_sel) == -1) ||
905             !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
906                   ->getImm() &
907               SISrcMods::DST_OP_SEL))
908           return false;
909       }
910       const SIRegisterInfo *TRI = ST.getRegisterInfo();
911       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
912         Register Def = Dst->getReg();
913 
914         for (const MachineOperand &Use : VALU->explicit_uses()) {
915           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
916             return true;
917         }
918       }
919 
920       return false;
921     };
922 
923     int WaitStatesNeededForDef =
924         Shift16DefWaitstates -
925         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
926     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
927   }
928 
929   if (ST.hasVDecCoExecHazard()) {
930     const int VALUWriteSGPRVALUReadWaitstates = 2;
931     const int VALUWriteEXECRWLane = 4;
932     const int VALUWriteVGPRReadlaneRead = 1;
933 
934     const SIRegisterInfo *TRI = ST.getRegisterInfo();
935     const MachineRegisterInfo &MRI = MF.getRegInfo();
936     Register UseReg;
937     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
938       if (!SIInstrInfo::isVALU(MI))
939         return false;
940       return MI.modifiesRegister(UseReg, TRI);
941     };
942 
943     for (const MachineOperand &Use : VALU->explicit_uses()) {
944       if (!Use.isReg())
945         continue;
946 
947       UseReg = Use.getReg();
948       if (TRI->isSGPRReg(MRI, UseReg)) {
949         int WaitStatesNeededForDef =
950             VALUWriteSGPRVALUReadWaitstates -
951             getWaitStatesSince(IsVALUDefSGPRFn,
952                                VALUWriteSGPRVALUReadWaitstates);
953         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
954       }
955     }
956 
957     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
958       UseReg = AMDGPU::VCC;
959       int WaitStatesNeededForDef =
960           VALUWriteSGPRVALUReadWaitstates -
961           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
962       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
963     }
964 
965     switch (VALU->getOpcode()) {
966     case AMDGPU::V_READLANE_B32:
967     case AMDGPU::V_READFIRSTLANE_B32: {
968       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
969       UseReg = Src->getReg();
970       int WaitStatesNeededForDef =
971           VALUWriteVGPRReadlaneRead -
972           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
973       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
974     }
975       LLVM_FALLTHROUGH;
976     case AMDGPU::V_WRITELANE_B32: {
977       UseReg = AMDGPU::EXEC;
978       int WaitStatesNeededForDef =
979           VALUWriteEXECRWLane -
980           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
981       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
982       break;
983     }
984     default:
985       break;
986     }
987   }
988 
989   // This checks for the hazard where VMEM instructions that store more than
990   // 8 bytes can have their store data overwritten by the next instruction.
991   if (!ST.has12DWordStoreHazard())
992     return WaitStatesNeeded;
993 
994   const MachineRegisterInfo &MRI = MF.getRegInfo();
995 
996   for (const MachineOperand &Def : VALU->defs()) {
997     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
998   }
999 
1000   return WaitStatesNeeded;
1001 }
1002 
1003 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1004   // This checks for hazards associated with inline asm statements.
1005   // Since inline asms can contain just about anything, we use this
1006   // to call/leverage other check*Hazard routines. Note that
1007   // this function doesn't attempt to address all possible inline asm
1008   // hazards (good luck), but is a collection of what has been
1009   // problematic thus far.
1010 
1011   // see checkVALUHazards()
1012   if (!ST.has12DWordStoreHazard())
1013     return 0;
1014 
1015   const MachineRegisterInfo &MRI = MF.getRegInfo();
1016   int WaitStatesNeeded = 0;
1017 
1018   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
1019        I != E; ++I) {
1020     const MachineOperand &Op = IA->getOperand(I);
1021     if (Op.isReg() && Op.isDef()) {
1022       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1023     }
1024   }
1025 
1026   return WaitStatesNeeded;
1027 }
1028 
1029 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1030   const SIInstrInfo *TII = ST.getInstrInfo();
1031   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1032   const MachineRegisterInfo &MRI = MF.getRegInfo();
1033 
1034   const MachineOperand *LaneSelectOp =
1035       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1036 
1037   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1038     return 0;
1039 
1040   Register LaneSelectReg = LaneSelectOp->getReg();
1041   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1042 
1043   const int RWLaneWaitStates = 4;
1044   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1045                                               RWLaneWaitStates);
1046   return RWLaneWaitStates - WaitStatesSince;
1047 }
1048 
1049 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1050   if (!ST.hasRFEHazards())
1051     return 0;
1052 
1053   const SIInstrInfo *TII = ST.getInstrInfo();
1054 
1055   const int RFEWaitStates = 1;
1056 
1057   auto IsHazardFn = [TII](const MachineInstr &MI) {
1058     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1059   };
1060   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1061   return RFEWaitStates - WaitStatesNeeded;
1062 }
1063 
1064 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1065   const SIInstrInfo *TII = ST.getInstrInfo();
1066   const int ReadM0WaitStates = 1;
1067   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1068   return ReadM0WaitStates -
1069          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1070 }
1071 
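// Run all hazard fixups that rewrite the instruction stream around \p MI.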
1072 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1073   fixVMEMtoScalarWriteHazards(MI);
1074   fixVcmpxPermlaneHazards(MI);
1075   fixSMEMtoVectorWriteHazards(MI);
1076   fixVcmpxExecWARHazard(MI);
1077   fixLdsBranchVmemWARHazard(MI);
1078   fixVALUPartialForwardingHazard(MI);
1079   fixVALUTransUseHazard(MI);
1080 }
1081 
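// If an EXEC-writing compare (v_cmpx) precedes a permlane with no intervening
// non-nop VALU, insert a V_MOV_B32 of the permlane's src0 onto itself to break
// the hazard.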
1082 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1083   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1084     return false;
1085 
1086   const SIInstrInfo *TII = ST.getInstrInfo();
1087   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1088   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1089     return (TII->isVOPC(MI) ||
1090             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1091            MI.modifiesRegister(AMDGPU::EXEC, TRI);
1092   };
1093 
1094   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1095     unsigned Opc = MI.getOpcode();
1096     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1097            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1098   };
1099 
1100   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1101       std::numeric_limits<int>::max())
1102     return false;
1103 
1104   // V_NOP will be discarded by SQ.
1105   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1106   // which is always a VGPR and available.
1107   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1108   Register Reg = Src0->getReg();
1109   bool IsUndef = Src0->isUndef();
1110   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1111           TII->get(AMDGPU::V_MOV_B32_e32))
1112     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1113     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1114 
1115   return true;
1116 }
1117 
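// If this SALU/SMEM instruction overwrites a register still used by an
// outstanding VMEM/DS/FLAT access, insert "s_waitcnt_depctr 0xffe3" unless an
// intervening VALU or zero waitcnt already resolves the hazard.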
1118 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1119   if (!ST.hasVMEMtoScalarWriteHazard())
1120     return false;
1121 
1122   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1123     return false;
1124 
1125   if (MI->getNumDefs() == 0)
1126     return false;
1127 
1128   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1129 
1130   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1131     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1132         !SIInstrInfo::isFLAT(I))
1133       return false;
1134 
1135     for (const MachineOperand &Def : MI->defs()) {
1136       const MachineOperand *Op =
1137           I.findRegisterUseOperand(Def.getReg(), false, TRI);
1138       if (!Op)
1139         continue;
1140       return true;
1141     }
1142     return false;
1143   };
1144 
1145   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1146     return SIInstrInfo::isVALU(MI) ||
1147            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1148             !MI.getOperand(0).getImm()) ||
1149            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1150             MI.getOperand(0).getImm() == 0xffe3);
1151   };
1152 
1153   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1154       std::numeric_limits<int>::max())
1155     return false;
1156 
1157   const SIInstrInfo *TII = ST.getInstrInfo();
1158   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1159           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1160       .addImm(0xffe3);
1161   return true;
1162 }
1163 
1164 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1165   if (!ST.hasSMEMtoVectorWriteHazard())
1166     return false;
1167 
1168   if (!SIInstrInfo::isVALU(*MI))
1169     return false;
1170 
1171   unsigned SDSTName;
1172   switch (MI->getOpcode()) {
1173   case AMDGPU::V_READLANE_B32:
1174   case AMDGPU::V_READFIRSTLANE_B32:
1175     SDSTName = AMDGPU::OpName::vdst;
1176     break;
1177   default:
1178     SDSTName = AMDGPU::OpName::sdst;
1179     break;
1180   }
1181 
1182   const SIInstrInfo *TII = ST.getInstrInfo();
1183   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1184   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1185   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1186   if (!SDST) {
1187     for (const auto &MO : MI->implicit_operands()) {
1188       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
1189         SDST = &MO;
1190         break;
1191       }
1192     }
1193   }
1194 
1195   if (!SDST)
1196     return false;
1197 
1198   const Register SDSTReg = SDST->getReg();
1199   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1200     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1201   };
1202 
1203   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1204     if (TII->isSALU(MI)) {
1205       switch (MI.getOpcode()) {
1206       case AMDGPU::S_SETVSKIP:
1207       case AMDGPU::S_VERSION:
1208       case AMDGPU::S_WAITCNT_VSCNT:
1209       case AMDGPU::S_WAITCNT_VMCNT:
1210       case AMDGPU::S_WAITCNT_EXPCNT:
1211         // These instructions cannot mitigate the hazard.
1212         return false;
1213       case AMDGPU::S_WAITCNT_LGKMCNT:
1214         // Reducing lgkmcnt to 0 always mitigates the hazard.
1215         return (MI.getOperand(1).getImm() == 0) &&
1216                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1217       case AMDGPU::S_WAITCNT: {
1218         const int64_t Imm = MI.getOperand(0).getImm();
1219         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1220         return (Decoded.LgkmCnt == 0);
1221       }
1222       default:
1223         // SOPP instructions cannot mitigate the hazard.
1224         if (TII->isSOPP(MI))
1225           return false;
1226         // At this point the SALU can be assumed to mitigate the hazard
1227         // because either:
1228         // (a) it is independent of the at risk SMEM (breaking chain),
1229         // or
1230         // (b) it is dependent on the SMEM, in which case an appropriate
1231         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1232         //     SMEM instruction.
1233         return true;
1234       }
1235     }
1236     return false;
1237   };
1238 
1239   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1240       std::numeric_limits<int>::max())
1241     return false;
1242 
1243   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1244           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1245       .addImm(0);
1246   return true;
1247 }
1248 
1249 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1250   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1251     return false;
1252 
1253   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1254   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1255     return false;
1256 
1257   auto IsHazardFn = [TRI](const MachineInstr &I) {
1258     if (SIInstrInfo::isVALU(I))
1259       return false;
1260     return I.readsRegister(AMDGPU::EXEC, TRI);
1261   };
1262 
1263   const SIInstrInfo *TII = ST.getInstrInfo();
1264   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1265     if (SIInstrInfo::isVALU(MI)) {
1266       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1267         return true;
1268       for (auto MO : MI.implicit_operands())
1269         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1270           return true;
1271     }
1272     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1273         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1274       return true;
1275     return false;
1276   };
1277 
1278   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1279       std::numeric_limits<int>::max())
1280     return false;
1281 
1282   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1283           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1284     .addImm(0xfffe);
1285   return true;
1286 }
1287 
1288 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1289                                                  const GCNSubtarget &ST) {
1290   if (!ST.hasLdsBranchVmemWARHazard())
1291     return false;
1292 
1293   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1294   // instructions need to appear in the same function.
1295   bool HasLds = false;
1296   bool HasVmem = false;
1297   for (auto &MBB : MF) {
1298     for (auto &MI : MBB) {
1299       HasLds |= SIInstrInfo::isDS(MI);
1300       HasVmem |=
1301           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1302       if (HasLds && HasVmem)
1303         return true;
1304     }
1305   }
1306   return false;
1307 }
1308 
1309 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1310   if (!RunLdsBranchVmemWARHazardFixup)
1311     return false;
1312 
1313   assert(ST.hasLdsBranchVmemWARHazard());
1314 
1315   auto IsHazardInst = [](const MachineInstr &MI) {
1316     if (SIInstrInfo::isDS(MI))
1317       return 1;
1318     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1319       return 2;
1320     return 0;
1321   };
1322 
1323   auto InstType = IsHazardInst(*MI);
1324   if (!InstType)
1325     return false;
1326 
1327   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1328     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1329                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1330                                !I.getOperand(1).getImm());
1331   };
1332 
1333   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1334     if (!I.isBranch())
1335       return false;
1336 
1337     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1338       auto InstType2 = IsHazardInst(I);
1339       return InstType2 && InstType != InstType2;
1340     };
1341 
1342     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1343       auto InstType2 = IsHazardInst(I);
1344       if (InstType == InstType2)
1345         return true;
1346 
1347       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1348              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1349              !I.getOperand(1).getImm();
1350     };
1351 
1352     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1353            std::numeric_limits<int>::max();
1354   };
1355 
1356   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1357       std::numeric_limits<int>::max())
1358     return false;
1359 
1360   const SIInstrInfo *TII = ST.getInstrInfo();
1361   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1362           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1363     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1364     .addImm(0);
1365 
1366   return true;
1367 }
1368 
1369 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1370   if (!ST.isWave64())
1371     return false;
1372   if (!ST.hasVALUPartialForwardingHazard())
1373     return false;
1374   if (!SIInstrInfo::isVALU(*MI))
1375     return false;
1376 
1377   SmallSetVector<Register, 4> SrcVGPRs;
1378 
1379   for (const MachineOperand &Use : MI->explicit_uses()) {
1380     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1381       SrcVGPRs.insert(Use.getReg());
1382   }
1383 
1384   // Only applies with >= 2 unique VGPR sources
1385   if (SrcVGPRs.size() <= 1)
1386     return false;
1387 
1388   // Look for the following pattern:
1389   //   Va <- VALU [PreExecPos]
1390   //   intv1
1391   //   Exec <- SALU [ExecPos]
1392   //   intv2
1393   //   Vb <- VALU [PostExecPos]
1394   //   intv3
1395   //   MI Va, Vb (WaitState = 0)
1396   //
1397   // Where:
1398   // intv1 + intv2 <= 2 VALUs
1399   // intv3 <= 4 VALUs
1400   //
1401   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1402 
1403   const int Intv1plus2MaxVALUs = 2;
1404   const int Intv3MaxVALUs = 4;
1405   const int IntvMaxVALUs = 6;
1406   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1407 
1408   struct StateType {
1409     SmallDenseMap<Register, int, 4> DefPos;
1410     int ExecPos = std::numeric_limits<int>::max();
1411     int VALUs = 0;
1412   };
1413 
1414   StateType State;
1415 
1416   // This callback combines hazard detection with expiry testing.
1417   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1418     // Too many VALU states have passed
1419     if (State.VALUs > NoHazardVALUWaitStates)
1420       return HazardExpired;
1421 
1422     // Instructions which cause va_vdst==0 expire the hazard
1423     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1424         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1425         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1426          I.getOperand(0).getImm() == 0x0fff))
1427       return HazardExpired;
1428 
1429     // Track register writes
1430     bool Changed = false;
1431     if (SIInstrInfo::isVALU(I)) {
1432       for (Register Src : SrcVGPRs) {
1433         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1434           State.DefPos[Src] = State.VALUs;
1435           Changed = true;
1436         }
1437       }
1438     } else if (SIInstrInfo::isSALU(I)) {
1439       if (State.ExecPos == std::numeric_limits<int>::max()) {
1440         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1441           State.ExecPos = State.VALUs;
1442           Changed = true;
1443         }
1444       }
1445     }
1446 
1447     // Early expiration: too many VALUs in intv3
1448     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1449       return HazardExpired;
1450 
1451     // Only evaluate state if something changed
1452     if (!Changed)
1453       return NoHazardFound;
1454 
1455     // Determine positions of VALUs pre/post exec change
1456     if (State.ExecPos == std::numeric_limits<int>::max())
1457       return NoHazardFound;
1458 
1459     int PreExecPos = std::numeric_limits<int>::max();
1460     int PostExecPos = std::numeric_limits<int>::max();
1461 
1462     for (auto Entry : State.DefPos) {
1463       int DefVALUs = Entry.second;
1464       if (DefVALUs != std::numeric_limits<int>::max()) {
1465         if (DefVALUs >= State.ExecPos)
1466           PreExecPos = std::min(PreExecPos, DefVALUs);
1467         else if (DefVALUs < State.ExecPos)
1468           PostExecPos = std::min(PostExecPos, DefVALUs);
1469       }
1470     }
1471 
1472     // Need a VALU def post exec change
1473     if (PostExecPos == std::numeric_limits<int>::max())
1474       return NoHazardFound;
1475 
1476     // Too many VALUs in intv3?
1477     int Intv3VALUs = PostExecPos;
1478     if (Intv3VALUs > Intv3MaxVALUs)
1479       return HazardExpired;
1480 
1481     // Too many VALUs in intv2?
1482     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1483     if (Intv2VALUs > Intv1plus2MaxVALUs)
1484       return HazardExpired;
1485 
1486     // Need a VALU def pre exec change
1487     if (PreExecPos == std::numeric_limits<int>::max())
1488       return NoHazardFound;
1489 
1490     // Too many VALUs in intv1?
1491     int Intv1VALUs = PreExecPos - State.ExecPos;
1492     if (Intv1VALUs > Intv1plus2MaxVALUs)
1493       return HazardExpired;
1494 
1495     // Too many VALUs in intv1 + intv2
1496     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1497       return HazardExpired;
1498 
1499     return HazardFound;
1500   };
1501   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1502     if (SIInstrInfo::isVALU(MI))
1503       State.VALUs += 1;
1504   };
1505 
1506   DenseSet<const MachineBasicBlock *> Visited;
1507   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1508                             std::next(MI->getReverseIterator()), Visited))
1509     return false;
1510 
1511   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1512           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1513       .addImm(0x0fff);
1514 
1515   return true;
1516 }
1517 
1518 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1519   if (!ST.hasVALUTransUseHazard())
1520     return false;
1521   if (!SIInstrInfo::isVALU(*MI))
1522     return false;
1523 
1524   SmallSet<Register, 4> SrcVGPRs;
1525 
1526   for (const MachineOperand &Use : MI->explicit_uses()) {
1527     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1528       SrcVGPRs.insert(Use.getReg());
1529   }
1530 
1531   // Look for the following pattern:
1532   //   Va <- TRANS VALU
1533   //   intv
1534   //   MI Va (WaitState = 0)
1535   //
1536   // Where:
1537   // intv <= 5 VALUs / 1 TRANS
1538   //
1539   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1540 
1541   const int IntvMaxVALUs = 5;
1542   const int IntvMaxTRANS = 1;
1543 
1544   struct StateType {
1545     int VALUs = 0;
1546     int TRANS = 0;
1547   };
1548 
1549   StateType State;
1550 
1551   // This callback combines hazard detection with expiry testing.
1552   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1553     // Too many VALU states have passed
1554     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1555       return HazardExpired;
1556 
1557     // Instructions which force va_vdst==0 expire the hazard
1558     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1559         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1560         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1561          I.getOperand(0).getImm() == 0x0fff))
1562       return HazardExpired;
1563 
1564     // Track writes to the source registers
1565     if (SIInstrInfo::isTRANS(I)) {
1566       for (Register Src : SrcVGPRs) {
1567         if (I.modifiesRegister(Src, &TRI)) {
1568           return HazardFound;
1569         }
1570       }
1571     }
1572 
1573     return NoHazardFound;
1574   };
1575   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1576     if (SIInstrInfo::isVALU(MI))
1577       State.VALUs += 1;
1578     if (SIInstrInfo::isTRANS(MI))
1579       State.TRANS += 1;
1580   };
1581 
1582   DenseSet<const MachineBasicBlock *> Visited;
1583   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1584                             std::next(MI->getReverseIterator()), Visited))
1585     return false;
1586 
1587   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1588   // hazard is avoided (a mask of 0x0fff achieves this).
1589   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1590           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1591       .addImm(0x0fff);
1592 
1593   return true;
1594 }
1595 
1596 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1597   int NSAtoVMEMWaitStates = 1;
1598 
1599   if (!ST.hasNSAtoVMEMBug())
1600     return 0;
1601 
1602   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1603     return 0;
1604 
1605   const SIInstrInfo *TII = ST.getInstrInfo();
1606   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1607   if (!Offset || (Offset->getImm() & 6) == 0)
1608     return 0;
1609 
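       // Look back (at most one wait state) for a gfx10 NSA-encoded image op of
       // at least 16 bytes; only such an instruction can create the hazard.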
1610   auto IsHazardFn = [TII](const MachineInstr &I) {
1611     if (!SIInstrInfo::isMIMG(I))
1612       return false;
1613     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1614     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1615            TII->getInstSizeInBytes(I) >= 16;
1616   };
1617 
1618   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1619 }
1620 
1621 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1622   int FPAtomicToDenormModeWaitStates = 3;
1623 
1624   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1625     return 0;
1626 
1627   auto IsHazardFn = [](const MachineInstr &I) {
1628     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1629       return false;
1630     return SIInstrInfo::isFPAtomic(I);
1631   };
1632 
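       // The hazard expires after 3 wait states, or once any VALU or wait-count
       // instruction appears between the FP atomic and s_denorm_mode.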
1633   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1634     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1635       return true;
1636 
1637     switch (MI.getOpcode()) {
1638     case AMDGPU::S_WAITCNT:
1639     case AMDGPU::S_WAITCNT_VSCNT:
1640     case AMDGPU::S_WAITCNT_VMCNT:
1641     case AMDGPU::S_WAITCNT_EXPCNT:
1642     case AMDGPU::S_WAITCNT_LGKMCNT:
1643     case AMDGPU::S_WAIT_IDLE:
1644       return true;
1645     default:
1646       break;
1647     }
1648 
1649     return false;
1650   };
1651 
1652   return FPAtomicToDenormModeWaitStates -
1653          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1654 }
1655 
1656 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1657   assert(SIInstrInfo::isMAI(*MI));
1658 
1659   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1660 }
1661 
1662 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1663   // Early exit if no padding is requested.
1664   if (MFMAPaddingRatio == 0)
1665     return 0;
1666 
1667   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1668   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1669     return 0;
1670 
1671   int NeighborMFMALatency = 0;
1672   auto IsNeighboringMFMA = [&NeighborMFMALatency,
1673                             this](const MachineInstr &MI) {
1674     if (!SIInstrInfo::isMFMA(MI))
1675       return false;
1676 
1677     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1678     return true;
1679   };
1680 
1681   const int MaxMFMAPipelineWaitStates = 16;
1682   int WaitStatesSinceNeighborMFMA =
1683       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1684 
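       // Request padding proportional to the neighbor's latency: for example,
       // a hypothetical 16-cycle neighboring MFMA with a padding ratio of 50
       // asks for 8 wait states, minus whatever has already elapsed since it.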
1685   int NeighborMFMAPaddingNeeded =
1686       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1687       WaitStatesSinceNeighborMFMA;
1688 
1689   return std::max(0, NeighborMFMAPaddingNeeded);
1690 }
1691 
1692 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1693   int WaitStatesNeeded = 0;
1694   unsigned Opc = MI->getOpcode();
1695 
1696   auto IsVALUFn = [](const MachineInstr &MI) {
1697     return SIInstrInfo::isVALU(MI);
1698   };
1699 
1700   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1701     const int LegacyVALUWritesVGPRWaitStates = 2;
1702     const int VALUWritesExecWaitStates = 4;
1703     const int MaxWaitStates = 4;
1704 
1705     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1706       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1707     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1708 
1709     if (WaitStatesNeeded < MaxWaitStates) {
1710       for (const MachineOperand &Use : MI->explicit_uses()) {
1711         const int MaxWaitStates = 2;
1712 
1713         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1714           continue;
1715 
1716         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1717           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1718         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1719 
1720         if (WaitStatesNeeded == MaxWaitStates)
1721           break;
1722       }
1723     }
1724   }
1725 
1726   for (const MachineOperand &Op : MI->explicit_operands()) {
1727     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1728       continue;
1729 
1730     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1731       continue;
1732 
1733     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1734     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1735     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1736     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1737     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1738     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1739     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1740     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1741     const int MaxWaitStates = 18;
1742     Register Reg = Op.getReg();
1743     unsigned HazardDefLatency = 0;
1744 
1745     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
1746                                this](const MachineInstr &MI) {
1747       if (!SIInstrInfo::isMFMA(MI))
1748         return false;
1749       Register DstReg = MI.getOperand(0).getReg();
1750       if (DstReg == Reg)
1751         return false;
1752       HazardDefLatency =
1753           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1754       return TRI.regsOverlap(DstReg, Reg);
1755     };
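         // The producer's pipeline latency (2, 8 or 16 cycles, i.e. the 4x4,
         // 16x16 and 32x32 MFMA variants) selects which of the wait-state
         // constants above applies.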
1756 
1757     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1758                                                    MaxWaitStates);
1759     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1760     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1761     int OpNo = MI->getOperandNo(&Op);
1762     if (OpNo == SrcCIdx) {
1763       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1764     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1765       switch (HazardDefLatency) {
1766       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1767                break;
1768       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1769                break;
1770       case 16: LLVM_FALLTHROUGH;
1771       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1772                break;
1773       }
1774     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1775       switch (HazardDefLatency) {
1776       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1777                break;
1778       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1779                break;
1780       case 16: LLVM_FALLTHROUGH;
1781       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1782                break;
1783       }
1784     }
1785 
1786     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1787     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1788 
1789     if (WaitStatesNeeded == MaxWaitStates)
1790       return WaitStatesNeeded; // Early exit.
1791 
1792     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1793       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1794         return false;
1795       Register DstReg = MI.getOperand(0).getReg();
1796       return TRI.regsOverlap(Reg, DstReg);
1797     };
1798 
1799     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1800     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1801     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1802     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1803     if (OpNo == SrcCIdx)
1804       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1805     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1806       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1807 
1808     WaitStatesNeededForUse = NeedWaitStates -
1809       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1810     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1811 
1812     if (WaitStatesNeeded == MaxWaitStates)
1813       return WaitStatesNeeded; // Early exit.
1814   }
1815 
1816   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1817     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1818     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1819     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1820     const int MaxWaitStates = 13;
1821     Register DstReg = MI->getOperand(0).getReg();
1822     unsigned HazardDefLatency = 0;
1823 
1824     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
1825                          this](const MachineInstr &MI) {
1826       if (!SIInstrInfo::isMFMA(MI))
1827         return false;
1828       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1829       HazardDefLatency =
1830           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1831       return TRI.regsOverlap(Reg, DstReg);
1832     };
1833 
1834     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1835     int NeedWaitStates;
1836     switch (HazardDefLatency) {
1837     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1838              break;
1839     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1840              break;
1841     case 16: LLVM_FALLTHROUGH;
1842     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1843              break;
1844     }
1845 
1846     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1847     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1848   }
1849 
1850   // Pad neighboring MFMA with noops for better inter-wave performance.
1851   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
1852 
1853   return WaitStatesNeeded;
1854 }
1855 
1856 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1857   int WaitStatesNeeded = 0;
1858   unsigned Opc = MI->getOpcode();
1859 
1860   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
1861     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
1862   };
1863 
1864   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
1865     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
1866            !SIInstrInfo::isDOT(MI);
1867   };
1868 
1869   if (!SIInstrInfo::isMFMA(*MI))
1870     return WaitStatesNeeded;
1871 
1872   const int VALUWritesExecWaitStates = 4;
1873   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1874     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
1875                           VALUWritesExecWaitStates);
1876   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1877 
1878   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1879 
1880   // Loop handling MI as the 2nd instruction of both DGEMM and S/HGEMM pairs.
1881   for (const MachineOperand &Use : MI->explicit_uses()) {
1882     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
1883     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
1884     const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
1885     const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
1886     const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
1887     const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
1888     const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
1889     const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
1890     const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
1891     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
1892     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
1893     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
1894     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
1895     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
1896     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
1897     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
1898     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
1899     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
1900     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
1901     const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
1902     const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
1903     const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
1904     const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
1905     const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
1906     const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
1907     const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
1908     const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
1909     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
1910     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
1911     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
1912     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
1913     const int MaxWaitStates = 19;
1914 
1915     if (!Use.isReg())
1916       continue;
1917     Register Reg = Use.getReg();
1918     bool FullReg;
1919     const MachineInstr *MI1;
1920 
1921     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
1922                                this](const MachineInstr &MI) {
1923       if (!SIInstrInfo::isMFMA(MI))
1924         return false;
1925       Register DstReg = MI.getOperand(0).getReg();
1926       FullReg = (DstReg == Reg);
1927       MI1 = &MI;
1928       return TRI.regsOverlap(DstReg, Reg);
1929     };
1930 
1931     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
1932       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
1933     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1934 
1935     int NumWaitStates =
1936         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
1937     if (NumWaitStates == std::numeric_limits<int>::max())
1938       continue;
1939 
1940     int OpNo = MI->getOperandNo(&Use);
1941     unsigned Opc1 = MI1->getOpcode();
1942     int NeedWaitStates = 0;
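         // Pick the wait-state constant from the tables above based on whether
         // this use is src2 (the accumulator) or srcA/srcB, the producing
         // MFMA's latency, and whether either instruction is DGEMM or a GFX940
         // XDL operation.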
1943     if (OpNo == SrcCIdx) {
1944       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
1945         NeedWaitStates = 0;
1946       } else if (FullReg) {
1947         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1948              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
1949             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1950              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
1951           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
1952         else if (ST.hasGFX940Insts() &&
1953                  TSchedModel.computeInstrLatency(MI1) == 2)
1954           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
1955       } else {
1956         switch (Opc1) {
1957         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1958         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1959         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
1960         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
1961           if (!isXDL(ST, *MI))
1962             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
1963           break;
1964         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1965         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1966           if (!isXDL(ST, *MI))
1967             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
1968           break;
1969         default:
1970           if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
1971             break;
1972           switch (TSchedModel.computeInstrLatency(MI1)) {
1973           case 2:
1974             NeedWaitStates = ST.hasGFX940Insts()
1975               ? isXDL(ST, *MI1)
1976                 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
1977                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
1978               : isDGEMM(Opc)
1979                 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
1980                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
1981             break;
1982           case 4:
1983             assert(ST.hasGFX940Insts());
1984             NeedWaitStates = isXDL(ST, *MI1)
1985               ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
1986               : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
1987             break;
1988           case 8:
1989             NeedWaitStates = ST.hasGFX940Insts()
1990               ? isXDL(ST, *MI1)
1991                 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
1992                 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
1993               : isDGEMM(Opc)
1994                 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
1995                 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
1996             break;
1997           case 16: LLVM_FALLTHROUGH;
1998           default:
1999             NeedWaitStates = ST.hasGFX940Insts()
2000               ? isXDL(ST, *MI1)
2001                 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2002                 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2003               : isDGEMM(Opc)
2004                 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2005                 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2006           }
2007         }
2008       }
2009     } else {
2010       switch (Opc1) {
2011       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2012       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2013       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2014       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2015         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2016         break;
2017       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2018       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2019         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2020         break;
2021       default:
2022         switch (TSchedModel.computeInstrLatency(MI1)) {
2023         case 2:
2024           NeedWaitStates = ST.hasGFX940Insts()
2025             ? isXDL(ST, *MI1)
2026               ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2027               : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2028             : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2029           break;
2030         case 4:
2031           assert(ST.hasGFX940Insts());
2032           NeedWaitStates = isXDL(ST, *MI1)
2033             ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2034             : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2035           break;
2036         case 8:
2037           NeedWaitStates = ST.hasGFX940Insts()
2038             ? isXDL(ST, *MI1)
2039               ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2040               : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2041             : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2042           break;
2043         case 16: LLVM_FALLTHROUGH;
2044         default:
2045           NeedWaitStates = ST.hasGFX940Insts()
2046             ? isXDL(ST, *MI1)
2047               ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2048               : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2049             : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2050         }
2051       }
2052     }
2053     if (WaitStatesNeeded >= NeedWaitStates)
2054       continue;
2055 
2056     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2057     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2058 
2059     if (WaitStatesNeeded == MaxWaitStates)
2060       break;
2061   }
2062 
2063   return WaitStatesNeeded;
2064 }
2065 
2066 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2067   // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
2068   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2069     return 0;
2070 
2071   int WaitStatesNeeded = 0;
2072 
2073   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2074     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2075   };
2076 
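       // Two sub-checks per VGPR use: a recent v_accvgpr_read producing the
       // data (2 wait states), and an accvgpr read/write that itself depends
       // on a recent plain VALU def (1 wait state).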
2077   for (const MachineOperand &Op : MI->explicit_uses()) {
2078     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2079       continue;
2080 
2081     Register Reg = Op.getReg();
2082 
2083     const int AccVgprReadLdStWaitStates = 2;
2084     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2085     const int MaxWaitStates = 2;
2086 
2087     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2088       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2089     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2090 
2091     if (WaitStatesNeeded == MaxWaitStates)
2092       return WaitStatesNeeded; // Early exit.
2093 
2094     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2095       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2096           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2097         return false;
2098       auto IsVALUFn = [](const MachineInstr &MI) {
2099         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2100       };
2101       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2102              std::numeric_limits<int>::max();
2103     };
2104 
2105     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2106       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2107     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2108   }
2109 
2110   return WaitStatesNeeded;
2111 }
2112 
2113 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2114   if (!ST.hasGFX90AInsts())
2115     return 0;
2116 
2117   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2118     return isDGEMM(MI.getOpcode());
2119   };
2120 
2121   // This is checked in checkMAIHazards90A()
2122   if (SIInstrInfo::isMFMA(*MI))
2123     return 0;
2124 
2125   int WaitStatesNeeded = 0;
2126 
2127   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
2128                        SIInstrInfo::isFLAT(*MI) ||
2129                        SIInstrInfo::isDS(*MI) ||
2130                        SIInstrInfo::isEXP(*MI);
2131   bool IsVALU = SIInstrInfo::isVALU(*MI);
2132 
2133   const MachineInstr *MFMA = nullptr;
2134   unsigned Reg;
2135   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2136     if (!SIInstrInfo::isMFMA(MI) ||
2137         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2138       return false;
2139     MFMA = &MI;
2140     return true;
2141   };
2142 
2143   const MachineInstr *DOT = nullptr;
2144   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2145     if (!SIInstrInfo::isDOT(MI) ||
2146         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2147       return false;
2148     DOT = &MI;
2149     return true;
2150   };
2151 
2152   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2153                                            AMDGPU::OpName::src2);
2154 
2155   if (IsMemOrExport || IsVALU) {
2156     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2157     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2158     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2159     const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2160     const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2161     const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2162     const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2163     const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2164     const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2165     const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2166     const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2167     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2168     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2169     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2170     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2171     const int DotWriteSameDotReadSrcAB = 3;
2172     const int DotWriteDifferentVALURead = 3;
2173     const int MaxWaitStates = 19;
2174 
2175     for (const MachineOperand &Use : MI->explicit_uses()) {
2176       if (!Use.isReg())
2177         continue;
2178       Reg = Use.getReg();
2179 
2180       DOT = nullptr;
2181       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2182                                                      MaxWaitStates);
2183       if (DOT) {
2184         int NeedWaitStates = 0;
2185         if (DOT->getOpcode() == MI->getOpcode()) {
2186           if (&Use - &MI->getOperand(0) != SrcCIdx)
2187             NeedWaitStates = DotWriteSameDotReadSrcAB;
2188         } else {
2189           NeedWaitStates = DotWriteDifferentVALURead;
2190         }
2191 
2192         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2193         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2194       }
2195 
2196       MFMA = nullptr;
2197       WaitStatesSinceDef =
2198           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2199       if (!MFMA)
2200         continue;
2201 
2202       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2203       int NeedWaitStates = MaxWaitStates;
2204       switch (HazardDefLatency) {
2205       case 2:
2206         NeedWaitStates =
2207           ST.hasGFX940Insts()
2208             ? isXDL(ST, *MFMA)
2209               ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2210               : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2211             : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2212         break;
2213       case 4:
2214         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2215         NeedWaitStates =
2216           isDGEMM(MFMA->getOpcode())
2217             ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2218                             : DMFMA4x4WriteVgprVALUReadWaitStates
2219             : isXDL(ST, *MFMA)
2220               ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2221               : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2222         break;
2223       case 8:
2224         NeedWaitStates =
2225           ST.hasGFX940Insts()
2226             ? isXDL(ST, *MFMA)
2227               ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2228               : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2229             : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2230         break;
2231       case 16: LLVM_FALLTHROUGH;
2232       default:
2233         NeedWaitStates =
2234           isDGEMM(MFMA->getOpcode())
2235             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2236                             : DMFMA16x16WriteVgprVALUReadWaitStates
2237             : ST.hasGFX940Insts()
2238               ? isXDL(ST, *MFMA)
2239                 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2240                 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2241               : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2242         break;
2243       }
2244 
2245       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2246       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2247 
2248       if (WaitStatesNeeded == MaxWaitStates)
2249         break;
2250     }
2251   }
2252 
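       // A double-precision FMA issued within 2 wait states of any DGEMM also
       // needs padding, regardless of register overlap.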
2253   unsigned Opc = MI->getOpcode();
2254   const int DMFMAToFMA64WaitStates = 2;
2255   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2256        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2257        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2258       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2259     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2260       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2261     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2262   }
2263 
2264   if (!IsVALU && !IsMemOrExport)
2265     return WaitStatesNeeded;
2266 
2267   for (const MachineOperand &Def : MI->defs()) {
2268     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2269     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2270     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2271     const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2272     const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2273     const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2274     const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2275     const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2276     const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2277     const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2278     const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2279     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2280     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2281     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2282     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2283     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2284     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2285     const int DotWriteDifferentVALUWrite = 3;
2286     const int MaxWaitStates = 19;
2287     const int MaxWarWaitStates = 15;
2288 
2289     Reg = Def.getReg();
2290 
2291     DOT = nullptr;
2292     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2293                                                    MaxWaitStates);
2294     if (DOT && DOT->getOpcode() != MI->getOpcode())
2295       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2296                                                     WaitStatesSinceDef);
2297 
2298     MFMA = nullptr;
2299     WaitStatesSinceDef =
2300         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2301     if (MFMA) {
2302       int NeedWaitStates = MaxWaitStates;
2303       switch (TSchedModel.computeInstrLatency(MFMA)) {
2304       case 2:
2305         NeedWaitStates = ST.hasGFX940Insts()
2306           ? isXDL(ST, *MFMA)
2307             ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2308             : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2309           : SMFMA4x4WriteVgprVALUWawWaitStates;
2310         break;
2311       case 4:
2312         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2313         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2314             ? DMFMA4x4WriteVgprVALUWriteWaitStates
2315             : isXDL(ST, *MFMA)
2316               ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2317               : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2318         break;
2319       case 8:
2320         NeedWaitStates = ST.hasGFX940Insts()
2321           ? isXDL(ST, *MFMA)
2322             ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2323             : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2324           : SMFMA16x16WriteVgprVALUWawWaitStates;
2325         break;
2326       case 16: LLVM_FALLTHROUGH;
2327       default:
2328         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2329                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
2330                    : ST.hasGFX940Insts()
2331                      ? isXDL(ST, *MFMA)
2332                        ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2333                        : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2334                    : SMFMA32x32WriteVgprVALUWawWaitStates;
2335         break;
2336       }
2337 
2338       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2339       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2340 
2341       if (WaitStatesNeeded == MaxWaitStates)
2342         break;
2343     }
2344 
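         // Also handle the WAR case: MI writes a VGPR that an in-flight SMFMA
         // (or GFX940 XDL) op is still reading as its src2 accumulator.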
2345     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2346       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2347           !MI.readsRegister(Reg, &TRI))
2348         return false;
2349 
2350       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2351         return false;
2352 
2353       const MachineOperand *SrcC =
2354           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2355       assert(SrcC);
2356       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2357         return false;
2358 
2359       MFMA = &MI;
2360       return true;
2361     };
2362 
2363     MFMA = nullptr;
2364     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2365                                                 MaxWarWaitStates);
2366     if (!MFMA)
2367       continue;
2368 
2369     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2370     int NeedWaitStates = MaxWaitStates;
2371     switch (HazardDefLatency) {
2372     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2373              break;
2374     case 4:  assert(ST.hasGFX940Insts());
2375              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2376              break;
2377     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2378              break;
2379     case 16: LLVM_FALLTHROUGH;
2380     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2381              break;
2382     }
2383 
2384     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2385     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2386   }
2387 
2388   return WaitStatesNeeded;
2389 }
2390 
2391 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2392   if (!SU->isInstr())
2393     return false;
2394 
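       // Advise the scheduler not to issue this MFMA while a previously issued
       // MFMA may still be in flight (i.e. closer than that MFMA's latency).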
2395   const MachineInstr *MAI = nullptr;
2396 
2397   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2398     MAI = nullptr;
2399     if (SIInstrInfo::isMFMA(MI))
2400       MAI = &MI;
2401     return MAI != nullptr;
2402   };
2403 
2404   MachineInstr *MI = SU->getInstr();
2405   if (IsMFMAFn(*MI)) {
2406     int W = getWaitStatesSince(IsMFMAFn, 16);
2407     if (MAI)
2408       return W < (int)TSchedModel.computeInstrLatency(MAI);
2409   }
2410 
2411   return false;
2412 }
2413