xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision d1f45ed58f3a7bdca69b93f548682754dd67d80f)
1 //===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/CodeGen/MachineFunction.h"
17 #include "llvm/CodeGen/ScheduleDAG.h"
18 #include "llvm/Support/TargetParser.h"
19 
20 using namespace llvm;
21 
22 //===----------------------------------------------------------------------===//
23 // Hazard Recognizer Implementation
24 //===----------------------------------------------------------------------===//
25 
26 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
27                                                  const GCNSubtarget &ST);
28 
29 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
30   IsHazardRecognizerMode(false),
31   CurrCycleInstr(nullptr),
32   MF(MF),
33   ST(MF.getSubtarget<GCNSubtarget>()),
34   TII(*ST.getInstrInfo()),
35   TRI(TII.getRegisterInfo()),
36   ClauseUses(TRI.getNumRegUnits()),
37   ClauseDefs(TRI.getNumRegUnits()) {
38   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
39   TSchedModel.init(&ST);
40   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
41 }
42 
43 void GCNHazardRecognizer::Reset() {
44   EmittedInstrs.clear();
45 }
46 
47 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
48   EmitInstruction(SU->getInstr());
49 }
50 
51 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
52   CurrCycleInstr = MI;
53 }
54 
55 static bool isDivFMas(unsigned Opcode) {
56   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
57 }
58 
59 static bool isSGetReg(unsigned Opcode) {
60   return Opcode == AMDGPU::S_GETREG_B32;
61 }
62 
63 static bool isSSetReg(unsigned Opcode) {
64   switch (Opcode) {
65   case AMDGPU::S_SETREG_B32:
66   case AMDGPU::S_SETREG_B32_mode:
67   case AMDGPU::S_SETREG_IMM32_B32:
68   case AMDGPU::S_SETREG_IMM32_B32_mode:
69     return true;
70   }
71   return false;
72 }
73 
74 static bool isRWLane(unsigned Opcode) {
75   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
76 }
77 
78 static bool isRFE(unsigned Opcode) {
79   return Opcode == AMDGPU::S_RFE_B64;
80 }
81 
82 static bool isSMovRel(unsigned Opcode) {
83   switch (Opcode) {
84   case AMDGPU::S_MOVRELS_B32:
85   case AMDGPU::S_MOVRELS_B64:
86   case AMDGPU::S_MOVRELD_B32:
87   case AMDGPU::S_MOVRELD_B64:
88     return true;
89   default:
90     return false;
91   }
92 }
93 
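// Returns true for the double-precision (DGEMM) MFMA opcodes.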
94 static bool isDGEMM(unsigned Opcode) {
95   return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
96          Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
97          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
98          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
99 }
100 
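// Any MAI instruction other than a DGEMM or an accvgpr read/write is treated
// as an XDL (matrix arithmetic) operation for hazard purposes.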
101 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
102   unsigned Opcode = MI.getOpcode();
103 
104   if (!SIInstrInfo::isMAI(MI) ||
105       isDGEMM(Opcode) ||
106       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
107       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
108     return false;
109 
110   return true;
111 }
112 
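// Returns true if \p MI sends a message, emits thread trace data, or accesses
// GDS; all of these implicitly read M0.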
113 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
114                                     const MachineInstr &MI) {
115   if (TII.isAlwaysGDS(MI.getOpcode()))
116     return true;
117 
118   switch (MI.getOpcode()) {
119   case AMDGPU::S_SENDMSG:
120   case AMDGPU::S_SENDMSGHALT:
121   case AMDGPU::S_TTRACEDATA:
122     return true;
123   // These DS opcodes don't support GDS.
124   case AMDGPU::DS_NOP:
125   case AMDGPU::DS_PERMUTE_B32:
126   case AMDGPU::DS_BPERMUTE_B32:
127     return false;
128   default:
129     if (TII.isDS(MI.getOpcode())) {
130       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
131                                            AMDGPU::OpName::gds);
132       if (MI.getOperand(GDS).getImm())
133         return true;
134     }
135     return false;
136   }
137 }
138 
139 static bool isPermlane(const MachineInstr &MI) {
140   unsigned Opcode = MI.getOpcode();
141   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
142          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
143 }
144 
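// Extracts the hardware register ID from the simm16 operand of an
// s_getreg/s_setreg instruction.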
145 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
146   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
147                                                      AMDGPU::OpName::simm16);
148   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
149 }
150 
151 ScheduleHazardRecognizer::HazardType
152 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
153   MachineInstr *MI = SU->getInstr();
154   // If we are not in "HazardRecognizerMode" we are being run from the
155   // scheduler; track possible stalls from hazards but don't insert noops.
156   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
157 
158   if (MI->isBundle())
159    return NoHazard;
160 
161   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
162     return HazardType;
163 
164   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
165     return HazardType;
166 
167   if (checkFPAtomicToDenormModeHazard(MI) > 0)
168     return HazardType;
169 
170   if (ST.hasNoDataDepHazard())
171     return NoHazard;
172 
173   // FIXME: Should flat be considered vmem?
174   if ((SIInstrInfo::isVMEM(*MI) ||
175        SIInstrInfo::isFLAT(*MI))
176       && checkVMEMHazards(MI) > 0)
177     return HazardType;
178 
179   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
180     return HazardType;
181 
182   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
183     return HazardType;
184 
185   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
186     return HazardType;
187 
188   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
189     return HazardType;
190 
191   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
192        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
193        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
194     return HazardType;
195 
196   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
197     return HazardType;
198 
199   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
200     return HazardType;
201 
202   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
203     return HazardType;
204 
205   if (ST.hasReadM0MovRelInterpHazard() &&
206       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
207       checkReadM0Hazards(MI) > 0)
208     return HazardType;
209 
210   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
211       checkReadM0Hazards(MI) > 0)
212     return HazardType;
213 
214   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
215     return HazardType;
216 
217   if ((SIInstrInfo::isVMEM(*MI) ||
218        SIInstrInfo::isFLAT(*MI) ||
219        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
220     return HazardType;
221 
222   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
223     return HazardType;
224 
225   return NoHazard;
226 }
227 
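// Insert s_nop instructions before \p MI; each s_nop covers up to 8 wait
// states.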
228 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
229                                 unsigned Quantity) {
230   while (Quantity > 0) {
231     unsigned Arg = std::min(Quantity, 8u);
232     Quantity -= Arg;
233     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
234         .addImm(Arg - 1);
235   }
236 }
237 
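// Run the hazard checks over every instruction inside the current BUNDLE and,
// when in hazard recognizer mode, insert the required fixups and noops.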
238 void GCNHazardRecognizer::processBundle() {
239   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
240   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
241   // Check bundled MachineInstr's for hazards.
242   for (; MI != E && MI->isInsideBundle(); ++MI) {
243     CurrCycleInstr = &*MI;
244     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
245 
246     if (IsHazardRecognizerMode) {
247       fixHazards(CurrCycleInstr);
248 
249       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
250     }
251 
252     // It's unnecessary to track more than MaxLookAhead instructions. Since we
253     // include the bundled MI directly after, only add a maximum of
254     // (MaxLookAhead - 1) noops to EmittedInstrs.
255     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
256       EmittedInstrs.push_front(nullptr);
257 
258     EmittedInstrs.push_front(CurrCycleInstr);
259     EmittedInstrs.resize(MaxLookAhead);
260   }
261   CurrCycleInstr = nullptr;
262 }
263 
264 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
265   IsHazardRecognizerMode = true;
266   CurrCycleInstr = MI;
267   unsigned W = PreEmitNoopsCommon(MI);
268   fixHazards(MI);
269   CurrCycleInstr = nullptr;
270   return W;
271 }
272 
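// Compute the number of wait states \p MI needs before it can issue, based on
// the hazard checks that apply to its instruction class.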
273 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
274   if (MI->isBundle())
275     return 0;
276 
277   int WaitStates = 0;
278 
279   if (SIInstrInfo::isSMRD(*MI))
280     return std::max(WaitStates, checkSMRDHazards(MI));
281 
282   if (ST.hasNSAtoVMEMBug())
283     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
284 
285   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
286 
287   if (ST.hasNoDataDepHazard())
288     return WaitStates;
289 
290   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
291     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
292 
293   if (SIInstrInfo::isVALU(*MI))
294     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
295 
296   if (SIInstrInfo::isDPP(*MI))
297     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
298 
299   if (isDivFMas(MI->getOpcode()))
300     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
301 
302   if (isRWLane(MI->getOpcode()))
303     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
304 
305   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
306        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
307        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
308     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
309 
310   if (MI->isInlineAsm())
311     return std::max(WaitStates, checkInlineAsmHazards(MI));
312 
313   if (isSGetReg(MI->getOpcode()))
314     return std::max(WaitStates, checkGetRegHazards(MI));
315 
316   if (isSSetReg(MI->getOpcode()))
317     return std::max(WaitStates, checkSetRegHazards(MI));
318 
319   if (isRFE(MI->getOpcode()))
320     return std::max(WaitStates, checkRFEHazards(MI));
321 
322   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
323                                            isSMovRel(MI->getOpcode())))
324     return std::max(WaitStates, checkReadM0Hazards(MI));
325 
326   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
327     return std::max(WaitStates, checkReadM0Hazards(MI));
328 
329   if (SIInstrInfo::isMAI(*MI))
330     return std::max(WaitStates, checkMAIHazards(MI));
331 
332   if (SIInstrInfo::isVMEM(*MI) ||
333       SIInstrInfo::isFLAT(*MI) ||
334       SIInstrInfo::isDS(*MI))
335     return std::max(WaitStates, checkMAILdStHazards(MI));
336 
337   return WaitStates;
338 }
339 
340 void GCNHazardRecognizer::EmitNoop() {
341   EmittedInstrs.push_front(nullptr);
342 }
343 
344 void GCNHazardRecognizer::AdvanceCycle() {
345   // When the scheduler detects a stall, it will call AdvanceCycle() without
346   // emitting any instructions.
347   if (!CurrCycleInstr) {
348     EmittedInstrs.push_front(nullptr);
349     return;
350   }
351 
352   if (CurrCycleInstr->isBundle()) {
353     processBundle();
354     return;
355   }
356 
357   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
358   if (!NumWaitStates) {
359     CurrCycleInstr = nullptr;
360     return;
361   }
362 
363   // Keep track of emitted instructions
364   EmittedInstrs.push_front(CurrCycleInstr);
365 
366   // Add a nullptr for each additional wait state after the first.  Make sure
367   // not to add more than getMaxLookAhead() items to the list, since we
368   // truncate the list to that size right after this loop.
369   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
370        i < e; ++i) {
371     EmittedInstrs.push_front(nullptr);
372   }
373 
374   // getMaxLookAhead() is the largest number of wait states we will ever need
375   // to insert, so there is no point in keeping track of more than that many
376   // wait states.
377   EmittedInstrs.resize(getMaxLookAhead());
378 
379   CurrCycleInstr = nullptr;
380 }
381 
382 void GCNHazardRecognizer::RecedeCycle() {
383   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
384 }
385 
386 //===----------------------------------------------------------------------===//
387 // Helper Functions
388 //===----------------------------------------------------------------------===//
389 
390 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
391 
392 // Returns the minimum number of wait states since \p I, walking all predecessors.
393 // Only scans until \p IsExpired returns true.
394 // Can only be run in a hazard recognizer mode.
395 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
396                               const MachineBasicBlock *MBB,
397                               MachineBasicBlock::const_reverse_instr_iterator I,
398                               int WaitStates, IsExpiredFn IsExpired,
399                               DenseSet<const MachineBasicBlock *> &Visited) {
400   for (auto E = MBB->instr_rend(); I != E; ++I) {
401     // Don't add WaitStates for parent BUNDLE instructions.
402     if (I->isBundle())
403       continue;
404 
405     if (IsHazard(*I))
406       return WaitStates;
407 
408     if (I->isInlineAsm())
409       continue;
410 
411     WaitStates += SIInstrInfo::getNumWaitStates(*I);
412 
413     if (IsExpired(*I, WaitStates))
414       return std::numeric_limits<int>::max();
415   }
416 
417   int MinWaitStates = std::numeric_limits<int>::max();
418   for (MachineBasicBlock *Pred : MBB->predecessors()) {
419     if (!Visited.insert(Pred).second)
420       continue;
421 
422     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
423                                WaitStates, IsExpired, Visited);
424 
425     MinWaitStates = std::min(MinWaitStates, W);
426   }
427 
428   return MinWaitStates;
429 }
430 
431 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
432                               const MachineInstr *MI, IsExpiredFn IsExpired) {
433   DenseSet<const MachineBasicBlock *> Visited;
434   return getWaitStatesSince(IsHazard, MI->getParent(),
435                             std::next(MI->getReverseIterator()),
436                             0, IsExpired, Visited);
437 }
438 
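// Returns the number of wait states since the most recent instruction matching
// \p IsHazard, or INT_MAX if no such instruction is found within \p Limit.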
439 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
440   if (IsHazardRecognizerMode) {
441     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
442       return WaitStates >= Limit;
443     };
444     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
445   }
446 
447   int WaitStates = 0;
448   for (MachineInstr *MI : EmittedInstrs) {
449     if (MI) {
450       if (IsHazard(*MI))
451         return WaitStates;
452 
453       if (MI->isInlineAsm())
454         continue;
455     }
456     ++WaitStates;
457 
458     if (WaitStates >= Limit)
459       break;
460   }
461   return std::numeric_limits<int>::max();
462 }
463 
464 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
465                                                IsHazardFn IsHazardDef,
466                                                int Limit) {
467   const SIRegisterInfo *TRI = ST.getRegisterInfo();
468 
469   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
470     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
471   };
472 
473   return getWaitStatesSince(IsHazardFn, Limit);
474 }
475 
476 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
477                                                   int Limit) {
478   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
479     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
480   };
481 
482   return getWaitStatesSince(IsHazardFn, Limit);
483 }
484 
485 //===----------------------------------------------------------------------===//
486 // No-op Hazard Detection
487 //===----------------------------------------------------------------------===//
488 
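// Set the bits in \p BV corresponding to all register units of \p Reg.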
489 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
490                         MCRegister Reg) {
491   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
492     BV.set(*RUI);
493 }
494 
495 static void addRegsToSet(const SIRegisterInfo &TRI,
496                          iterator_range<MachineInstr::const_mop_iterator> Ops,
497                          BitVector &Set) {
498   for (const MachineOperand &Op : Ops) {
499     if (Op.isReg())
500       addRegUnits(TRI, Set, Op.getReg().asMCReg());
501   }
502 }
503 
504 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
505   // XXX: Do we need to worry about implicit operands?
506   addRegsToSet(TRI, MI.defs(), ClauseDefs);
507   addRegsToSet(TRI, MI.uses(), ClauseUses);
508 }
509 
510 static bool breaksSMEMSoftClause(MachineInstr *MI) {
511   return !SIInstrInfo::isSMRD(*MI);
512 }
513 
514 static bool breaksVMEMSoftClause(MachineInstr *MI) {
515   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
516 }
517 
518 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
519   // SMEM soft clauses are only present on VI+, and only matter if xnack is
520   // enabled.
521   if (!ST.isXNACKEnabled())
522     return 0;
523 
524   bool IsSMRD = TII.isSMRD(*MEM);
525 
526   resetClause();
527 
528   // A soft-clause is any group of consecutive SMEM instructions.  The
529   // instructions in this group may return out of order and/or may be
530   // replayed (i.e. the same instruction issued more than once).
531   //
532   // In order to handle these situations correctly we need to make sure that
533   // when a clause has more than one instruction, no instruction in the clause
534   // writes to a register that is read by another instruction in the clause
535   // (including itself). If we encounter this situation, we need to break the
536   // clause by inserting a non-SMEM instruction.
537 
538   for (MachineInstr *MI : EmittedInstrs) {
539     // When we hit a non-SMEM instruction then we have passed the start of the
540     // clause and we can stop.
541     if (!MI)
542       break;
543 
544     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
545       break;
546 
547     addClauseInst(*MI);
548   }
549 
550   if (ClauseDefs.none())
551     return 0;
552 
553   // We need to make sure not to put loads and stores in the same clause if they
554   // use the same address. For now, just start a new clause whenever we see a
555   // store.
556   if (MEM->mayStore())
557     return 1;
558 
559   addClauseInst(*MEM);
560 
561   // If the set of defs and uses intersect then we cannot add this instruction
562   // to the clause, so we have a hazard.
563   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
564 }
565 
566 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
567   int WaitStatesNeeded = 0;
568 
569   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
570 
571   // This SMRD hazard only affects SI.
572   if (!ST.hasSMRDReadVALUDefHazard())
573     return WaitStatesNeeded;
574 
575   // A read of an SGPR by SMRD instruction requires 4 wait states when the
576   // SGPR was written by a VALU instruction.
577   int SmrdSgprWaitStates = 4;
578   auto IsHazardDefFn = [this](const MachineInstr &MI) {
579     return TII.isVALU(MI);
580   };
581   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
582     return TII.isSALU(MI);
583   };
584 
585   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
586 
587   for (const MachineOperand &Use : SMRD->uses()) {
588     if (!Use.isReg())
589       continue;
590     int WaitStatesNeededForUse =
591         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
592                                                    SmrdSgprWaitStates);
593     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
594 
595     // This fixes what appears to be undocumented hardware behavior in SI where
596     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
597     // need some number of nops in between. We don't know how many we need, but
598     // let's use 4. This wasn't discovered before probably because the only
599     // case when this happens is when we expand a 64-bit pointer into a full
600     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
601     // probably never encountered in the closed-source land.
602     if (IsBufferSMRD) {
603       int WaitStatesNeededForUse =
604         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
605                                                    IsBufferHazardDefFn,
606                                                    SmrdSgprWaitStates);
607       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
608     }
609   }
610 
611   return WaitStatesNeeded;
612 }
613 
614 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
615   if (!ST.hasVMEMReadSGPRVALUDefHazard())
616     return 0;
617 
618   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
619 
620   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
621   // SGPR was written by a VALU Instruction.
622   const int VmemSgprWaitStates = 5;
623   auto IsHazardDefFn = [this](const MachineInstr &MI) {
624     return TII.isVALU(MI);
625   };
626   for (const MachineOperand &Use : VMEM->uses()) {
627     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
628       continue;
629 
630     int WaitStatesNeededForUse =
631         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
632                                                    VmemSgprWaitStates);
633     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
634   }
635   return WaitStatesNeeded;
636 }
637 
638 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
639   const SIRegisterInfo *TRI = ST.getRegisterInfo();
640   const SIInstrInfo *TII = ST.getInstrInfo();
641 
642   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
643   int DppVgprWaitStates = 2;
644   int DppExecWaitStates = 5;
645   int WaitStatesNeeded = 0;
646   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
647     return TII->isVALU(MI);
648   };
649 
650   for (const MachineOperand &Use : DPP->uses()) {
651     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
652       continue;
653     int WaitStatesNeededForUse =
654         DppVgprWaitStates - getWaitStatesSinceDef(
655                                 Use.getReg(),
656                                 [](const MachineInstr &) { return true; },
657                                 DppVgprWaitStates);
658     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
659   }
660 
661   WaitStatesNeeded = std::max(
662       WaitStatesNeeded,
663       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
664                                                 DppExecWaitStates));
665 
666   return WaitStatesNeeded;
667 }
668 
669 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
670   const SIInstrInfo *TII = ST.getInstrInfo();
671 
672   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
673   // instruction.
674   const int DivFMasWaitStates = 4;
675   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
676     return TII->isVALU(MI);
677   };
678   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
679                                                DivFMasWaitStates);
680 
681   return DivFMasWaitStates - WaitStatesNeeded;
682 }
683 
684 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
685   const SIInstrInfo *TII = ST.getInstrInfo();
686   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
687 
688   const int GetRegWaitStates = 2;
689   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
690     return GetRegHWReg == getHWReg(TII, MI);
691   };
692   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
693 
694   return GetRegWaitStates - WaitStatesNeeded;
695 }
696 
697 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
698   const SIInstrInfo *TII = ST.getInstrInfo();
699   unsigned HWReg = getHWReg(TII, *SetRegInstr);
700 
701   const int SetRegWaitStates = ST.getSetRegWaitStates();
702   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
703     return HWReg == getHWReg(TII, MI);
704   };
705   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
706   return SetRegWaitStates - WaitStatesNeeded;
707 }
708 
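// If \p MI is a store that can have its store data overwritten by a following
// VALU write (see checkVALUHazards), return the index of its data operand;
// otherwise return -1.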
709 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
710   if (!MI.mayStore())
711     return -1;
712 
713   const SIInstrInfo *TII = ST.getInstrInfo();
714   unsigned Opcode = MI.getOpcode();
715   const MCInstrDesc &Desc = MI.getDesc();
716 
717   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
718   int VDataRCID = -1;
719   if (VDataIdx != -1)
720     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
721 
722   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
723     // There is no hazard if the instruction does not use vector regs
724     // (like wbinvl1)
725     if (VDataIdx == -1)
726       return -1;
727     // For MUBUF/MTBUF instructions this hazard only exists if the
728     // instruction is not using a register in the soffset field.
729     const MachineOperand *SOffset =
730         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
731     // If we have no soffset operand, then assume this field has been
732     // hardcoded to zero.
733     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
734         (!SOffset || !SOffset->isReg()))
735       return VDataIdx;
736   }
737 
738   // MIMG instructions create a hazard if they don't use a 256-bit T# and
739   // the store size is greater than 8 bytes and they have more than two bits
740   // of their dmask set.
741   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
742   if (TII->isMIMG(MI)) {
743     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
744     assert(SRsrcIdx != -1 &&
745            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
746     (void)SRsrcIdx;
747   }
748 
749   if (TII->isFLAT(MI)) {
750     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
751     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
752       return DataIdx;
753   }
754 
755   return -1;
756 }
757 
758 int
759 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
760                                             const MachineRegisterInfo &MRI) {
761   // Helper to check for the hazard where VMEM instructions that store more than
762   // 8 bytes can have their store data overwritten by the next instruction.
763   const SIRegisterInfo *TRI = ST.getRegisterInfo();
764 
765   const int VALUWaitStates = 1;
766   int WaitStatesNeeded = 0;
767 
768   if (!TRI->isVectorRegister(MRI, Def.getReg()))
769     return WaitStatesNeeded;
770   Register Reg = Def.getReg();
771   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
772     int DataIdx = createsVALUHazard(MI);
773     return DataIdx >= 0 &&
774            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
775   };
776   int WaitStatesNeededForDef =
777     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
778   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
779 
780   return WaitStatesNeeded;
781 }
782 
783 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
784   // This checks for the hazard where VMEM instructions that store more than
785   // 8 bytes can have their store data overwritten by the next instruction.
786   if (!ST.has12DWordStoreHazard())
787     return 0;
788 
789   const MachineRegisterInfo &MRI = MF.getRegInfo();
790   int WaitStatesNeeded = 0;
791 
792   for (const MachineOperand &Def : VALU->defs()) {
793     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
794   }
795 
796   return WaitStatesNeeded;
797 }
798 
799 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
800   // This checks for hazards associated with inline asm statements.
801   // Since inline asms can contain just about anything, we use this
802   // to call/leverage other check*Hazard routines. Note that
803   // this function doesn't attempt to address all possible inline asm
804   // hazards (good luck), but is a collection of what has been
805   // problematic thus far.
806 
807   // see checkVALUHazards()
808   if (!ST.has12DWordStoreHazard())
809     return 0;
810 
811   const MachineRegisterInfo &MRI = MF.getRegInfo();
812   int WaitStatesNeeded = 0;
813 
814   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
815        I != E; ++I) {
816     const MachineOperand &Op = IA->getOperand(I);
817     if (Op.isReg() && Op.isDef()) {
818       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
819     }
820   }
821 
822   return WaitStatesNeeded;
823 }
824 
825 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
826   const SIInstrInfo *TII = ST.getInstrInfo();
827   const SIRegisterInfo *TRI = ST.getRegisterInfo();
828   const MachineRegisterInfo &MRI = MF.getRegInfo();
829 
830   const MachineOperand *LaneSelectOp =
831       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
832 
833   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
834     return 0;
835 
836   Register LaneSelectReg = LaneSelectOp->getReg();
837   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
838 
839   const int RWLaneWaitStates = 4;
840   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
841                                               RWLaneWaitStates);
842   return RWLaneWaitStates - WaitStatesSince;
843 }
844 
845 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
846   if (!ST.hasRFEHazards())
847     return 0;
848 
849   const SIInstrInfo *TII = ST.getInstrInfo();
850 
851   const int RFEWaitStates = 1;
852 
853   auto IsHazardFn = [TII](const MachineInstr &MI) {
854     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
855   };
856   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
857   return RFEWaitStates - WaitStatesNeeded;
858 }
859 
860 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
861   const SIInstrInfo *TII = ST.getInstrInfo();
862   const int SMovRelWaitStates = 1;
863   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
864   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
865                                                    SMovRelWaitStates);
866 }
867 
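// Hazards that are mitigated by inserting a specific instruction (e.g.
// s_waitcnt_depctr or a dependency-breaking v_mov/s_mov) rather than by plain
// s_nop wait states.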
868 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
869   fixVMEMtoScalarWriteHazards(MI);
870   fixVcmpxPermlaneHazards(MI);
871   fixSMEMtoVectorWriteHazards(MI);
872   fixVcmpxExecWARHazard(MI);
873   fixLdsBranchVmemWARHazard(MI);
874 }
875 
876 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
877   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
878     return false;
879 
880   const SIInstrInfo *TII = ST.getInstrInfo();
881   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
882 
883   auto IsExpiredFn = [](const MachineInstr &MI, int) {
884     unsigned Opc = MI.getOpcode();
885     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
886            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
887   };
888 
889   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
890       std::numeric_limits<int>::max())
891     return false;
892 
893   // V_NOP will be discarded by SQ.
894   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
895   // which is always a VGPR and available.
896   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
897   Register Reg = Src0->getReg();
898   bool IsUndef = Src0->isUndef();
899   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
900           TII->get(AMDGPU::V_MOV_B32_e32))
901     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
902     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
903 
904   return true;
905 }
906 
907 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
908   if (!ST.hasVMEMtoScalarWriteHazard())
909     return false;
910 
911   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
912     return false;
913 
914   if (MI->getNumDefs() == 0)
915     return false;
916 
917   const SIRegisterInfo *TRI = ST.getRegisterInfo();
918 
919   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
920     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
921         !SIInstrInfo::isFLAT(I))
922       return false;
923 
924     for (const MachineOperand &Def : MI->defs()) {
925       const MachineOperand *Op =
926           I.findRegisterUseOperand(Def.getReg(), false, TRI);
927       if (!Op)
928         continue;
929       return true;
930     }
931     return false;
932   };
933 
934   auto IsExpiredFn = [](const MachineInstr &MI, int) {
935     return SIInstrInfo::isVALU(MI) ||
936            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
937             !MI.getOperand(0).getImm()) ||
938            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
939             MI.getOperand(0).getImm() == 0xffe3);
940   };
941 
942   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
943       std::numeric_limits<int>::max())
944     return false;
945 
946   const SIInstrInfo *TII = ST.getInstrInfo();
947   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
948           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
949       .addImm(0xffe3);
950   return true;
951 }
952 
953 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
954   if (!ST.hasSMEMtoVectorWriteHazard())
955     return false;
956 
957   if (!SIInstrInfo::isVALU(*MI))
958     return false;
959 
960   unsigned SDSTName;
961   switch (MI->getOpcode()) {
962   case AMDGPU::V_READLANE_B32:
963   case AMDGPU::V_READFIRSTLANE_B32:
964     SDSTName = AMDGPU::OpName::vdst;
965     break;
966   default:
967     SDSTName = AMDGPU::OpName::sdst;
968     break;
969   }
970 
971   const SIInstrInfo *TII = ST.getInstrInfo();
972   const SIRegisterInfo *TRI = ST.getRegisterInfo();
973   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
974   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
975   if (!SDST) {
976     for (const auto &MO : MI->implicit_operands()) {
977       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
978         SDST = &MO;
979         break;
980       }
981     }
982   }
983 
984   if (!SDST)
985     return false;
986 
987   const Register SDSTReg = SDST->getReg();
988   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
989     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
990   };
991 
992   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
993     if (TII->isSALU(MI)) {
994       switch (MI.getOpcode()) {
995       case AMDGPU::S_SETVSKIP:
996       case AMDGPU::S_VERSION:
997       case AMDGPU::S_WAITCNT_VSCNT:
998       case AMDGPU::S_WAITCNT_VMCNT:
999       case AMDGPU::S_WAITCNT_EXPCNT:
1000         // These instructions cannot mitigate the hazard.
1001         return false;
1002       case AMDGPU::S_WAITCNT_LGKMCNT:
1003         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1004         return (MI.getOperand(1).getImm() == 0) &&
1005                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1006       case AMDGPU::S_WAITCNT: {
1007         const int64_t Imm = MI.getOperand(0).getImm();
1008         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1009         return (Decoded.LgkmCnt == 0);
1010       }
1011       default:
1012         // SOPP instructions cannot mitigate the hazard.
1013         if (TII->isSOPP(MI))
1014           return false;
1015         // At this point the SALU can be assumed to mitigate the hazard
1016         // because either:
1017         // (a) it is independent of the at risk SMEM (breaking chain),
1018         // or
1019         // (b) it is dependent on the SMEM, in which case an appropriate
1020         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1021         //     SMEM instruction.
1022         return true;
1023       }
1024     }
1025     return false;
1026   };
1027 
1028   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1029       std::numeric_limits<int>::max())
1030     return false;
1031 
1032   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1033           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1034       .addImm(0);
1035   return true;
1036 }
1037 
1038 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1039   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1040     return false;
1041 
1042   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1043   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1044     return false;
1045 
1046   auto IsHazardFn = [TRI](const MachineInstr &I) {
1047     if (SIInstrInfo::isVALU(I))
1048       return false;
1049     return I.readsRegister(AMDGPU::EXEC, TRI);
1050   };
1051 
1052   const SIInstrInfo *TII = ST.getInstrInfo();
1053   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1054     if (SIInstrInfo::isVALU(MI)) {
1055       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1056         return true;
1057       for (auto MO : MI.implicit_operands())
1058         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1059           return true;
1060     }
1061     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1062         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1063       return true;
1064     return false;
1065   };
1066 
1067   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1068       std::numeric_limits<int>::max())
1069     return false;
1070 
1071   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1072           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1073     .addImm(0xfffe);
1074   return true;
1075 }
1076 
1077 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1078                                                  const GCNSubtarget &ST) {
1079   if (!ST.hasLdsBranchVmemWARHazard())
1080     return false;
1081 
1082   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1083   // instructions need to appear in the same function.
1084   bool HasLds = false;
1085   bool HasVmem = false;
1086   for (auto &MBB : MF) {
1087     for (auto &MI : MBB) {
1088       HasLds |= SIInstrInfo::isDS(MI);
1089       HasVmem |=
1090           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1091       if (HasLds && HasVmem)
1092         return true;
1093     }
1094   }
1095   return false;
1096 }
1097 
1098 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1099   if (!RunLdsBranchVmemWARHazardFixup)
1100     return false;
1101 
1102   assert(ST.hasLdsBranchVmemWARHazard());
1103 
1104   auto IsHazardInst = [](const MachineInstr &MI) {
1105     if (SIInstrInfo::isDS(MI))
1106       return 1;
1107     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1108       return 2;
1109     return 0;
1110   };
1111 
1112   auto InstType = IsHazardInst(*MI);
1113   if (!InstType)
1114     return false;
1115 
1116   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1117     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1118                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1119                                !I.getOperand(1).getImm());
1120   };
1121 
1122   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1123     if (!I.isBranch())
1124       return false;
1125 
1126     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1127       auto InstType2 = IsHazardInst(I);
1128       return InstType2 && InstType != InstType2;
1129     };
1130 
1131     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1132       auto InstType2 = IsHazardInst(I);
1133       if (InstType == InstType2)
1134         return true;
1135 
1136       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1137              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1138              !I.getOperand(1).getImm();
1139     };
1140 
1141     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1142            std::numeric_limits<int>::max();
1143   };
1144 
1145   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1146       std::numeric_limits<int>::max())
1147     return false;
1148 
1149   const SIInstrInfo *TII = ST.getInstrInfo();
1150   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1151           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1152     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1153     .addImm(0);
1154 
1155   return true;
1156 }
1157 
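// A MUBUF/MTBUF access whose offset immediate has either of bits 1-2 set needs
// a wait state after an NSA-encoded MIMG instruction of at least 16 bytes.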
1158 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1159   int NSAtoVMEMWaitStates = 1;
1160 
1161   if (!ST.hasNSAtoVMEMBug())
1162     return 0;
1163 
1164   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1165     return 0;
1166 
1167   const SIInstrInfo *TII = ST.getInstrInfo();
1168   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1169   if (!Offset || (Offset->getImm() & 6) == 0)
1170     return 0;
1171 
1172   auto IsHazardFn = [TII](const MachineInstr &I) {
1173     if (!SIInstrInfo::isMIMG(I))
1174       return false;
1175     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1176     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1177            TII->getInstSizeInBytes(I) >= 16;
1178   };
1179 
1180   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1181 }
1182 
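// s_denorm_mode requires 3 wait states after an FP atomic VMEM/FLAT
// instruction; an intervening VALU or waitcnt-style instruction clears the
// hazard.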
1183 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1184   int FPAtomicToDenormModeWaitStates = 3;
1185 
1186   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1187     return 0;
1188 
1189   auto IsHazardFn = [](const MachineInstr &I) {
1190     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1191       return false;
1192     return SIInstrInfo::isFPAtomic(I);
1193   };
1194 
1195   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1196     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1197       return true;
1198 
1199     switch (MI.getOpcode()) {
1200     case AMDGPU::S_WAITCNT:
1201     case AMDGPU::S_WAITCNT_VSCNT:
1202     case AMDGPU::S_WAITCNT_VMCNT:
1203     case AMDGPU::S_WAITCNT_EXPCNT:
1204     case AMDGPU::S_WAITCNT_LGKMCNT:
1205     case AMDGPU::S_WAIT_IDLE:
1206       return true;
1207     default:
1208       break;
1209     }
1210 
1211     return false;
1212   };
1213 
1214   return FPAtomicToDenormModeWaitStates -
1215          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1216 }
1217 
1218 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1219   assert(SIInstrInfo::isMAI(*MI));
1220 
1221   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1222 }
1223 
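// Wait-state requirements on gfx908 between MFMA/accvgpr instructions and the
// earlier instructions that define their operands.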
1224 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1225   int WaitStatesNeeded = 0;
1226   unsigned Opc = MI->getOpcode();
1227 
1228   auto IsVALUFn = [](const MachineInstr &MI) {
1229     return SIInstrInfo::isVALU(MI);
1230   };
1231 
1232   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1233     const int LegacyVALUWritesVGPRWaitStates = 2;
1234     const int VALUWritesExecWaitStates = 4;
1235     const int MaxWaitStates = 4;
1236 
1237     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1238       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1239     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1240 
1241     if (WaitStatesNeeded < MaxWaitStates) {
1242       for (const MachineOperand &Use : MI->explicit_uses()) {
1243         const int MaxWaitStates = 2;
1244 
1245         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1246           continue;
1247 
1248         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1249           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1250         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1251 
1252         if (WaitStatesNeeded == MaxWaitStates)
1253           break;
1254       }
1255     }
1256   }
1257 
1258   auto IsMFMAFn = [](const MachineInstr &MI) {
1259     return SIInstrInfo::isMAI(MI) &&
1260            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1261            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1262   };
1263 
1264   for (const MachineOperand &Op : MI->explicit_operands()) {
1265     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1266       continue;
1267 
1268     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1269       continue;
1270 
1271     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1272     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1273     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1274     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1275     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1276     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1277     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1278     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1279     const int MaxWaitStates = 18;
1280     Register Reg = Op.getReg();
1281     unsigned HazardDefLatency = 0;
1282 
1283     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1284                                this](const MachineInstr &MI) {
1285       if (!IsMFMAFn(MI))
1286         return false;
1287       Register DstReg = MI.getOperand(0).getReg();
1288       if (DstReg == Reg)
1289         return false;
1290       HazardDefLatency =
1291           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1292       return TRI.regsOverlap(DstReg, Reg);
1293     };
1294 
1295     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1296                                                    MaxWaitStates);
1297     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1298     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1299     int OpNo = MI->getOperandNo(&Op);
1300     if (OpNo == SrcCIdx) {
1301       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1302     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1303       switch (HazardDefLatency) {
1304       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1305                break;
1306       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1307                break;
1308       case 16: LLVM_FALLTHROUGH;
1309       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1310                break;
1311       }
1312     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1313       switch (HazardDefLatency) {
1314       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1315                break;
1316       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1317                break;
1318       case 16: LLVM_FALLTHROUGH;
1319       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1320                break;
1321       }
1322     }
1323 
1324     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1325     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1326 
1327     if (WaitStatesNeeded == MaxWaitStates)
1328       return WaitStatesNeeded; // Early exit.
1329 
1330     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1331       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1332         return false;
1333       Register DstReg = MI.getOperand(0).getReg();
1334       return TRI.regsOverlap(Reg, DstReg);
1335     };
1336 
1337     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1338     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1339     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1340     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1341     if (OpNo == SrcCIdx)
1342       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1343     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1344       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1345 
1346     WaitStatesNeededForUse = NeedWaitStates -
1347       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1348     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1349 
1350     if (WaitStatesNeeded == MaxWaitStates)
1351       return WaitStatesNeeded; // Early exit.
1352   }
1353 
1354   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1355     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1356     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1357     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1358     const int MaxWaitStates = 13;
1359     Register DstReg = MI->getOperand(0).getReg();
1360     unsigned HazardDefLatency = 0;
1361 
1362     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
1363                          this](const MachineInstr &MI) {
1364       if (!IsMFMAFn(MI))
1365         return false;
1366       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1367       HazardDefLatency =
1368           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1369       return TRI.regsOverlap(Reg, DstReg);
1370     };
1371 
1372     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1373     int NeedWaitStates;
1374     switch (HazardDefLatency) {
1375     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1376              break;
1377     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1378              break;
1379     case 16: LLVM_FALLTHROUGH;
1380     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1381              break;
1382     }
1383 
1384     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1385     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1386   }
1387 
1388   return WaitStatesNeeded;
1389 }
1390 
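// Wait-state requirements for MFMA instructions on gfx90a, where MFMAs can
// read and write VGPRs as well as AGPRs.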
1391 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1392   int WaitStatesNeeded = 0;
1393   unsigned Opc = MI->getOpcode();
1394 
1395   auto IsMFMAFn = [](const MachineInstr &MI) {
1396     return SIInstrInfo::isMAI(MI) &&
1397            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1398            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1399   };
1400 
1401   auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
1402     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
1403   };
1404 
1405   auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
1406     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
1407   };
1408 
1409   if (!IsMFMAFn(*MI))
1410     return WaitStatesNeeded;
1411 
1412   const int VALUWritesExecWaitStates = 4;
1413   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1414     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
1415                           VALUWritesExecWaitStates);
1416   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1417 
1418   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1419 
1420   // Loop for both DGEMM and S/HGEMM 2nd instruction.
1421   for (const MachineOperand &Use : MI->explicit_uses()) {
1422     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
1423     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
1424     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
1425     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
1426     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
1427     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
1428     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
1429     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
1430     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
1431     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
1432     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
1433     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
1434     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
1435     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
1436     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
1437     const int MaxWaitStates = 19;
1438 
1439     if (!Use.isReg())
1440       continue;
1441     unsigned Reg = Use.getReg();
1442     bool FullReg;
1443     const MachineInstr *MI1;
1444 
1445     auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
1446                                      this](const MachineInstr &MI) {
1447       if (!IsMFMAFn(MI))
1448         return false;
1449       if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
1450         return false;
1451       Register DstReg = MI.getOperand(0).getReg();
1452       FullReg = (DstReg == Reg);
1453       MI1 = &MI;
1454       return TRI.regsOverlap(DstReg, Reg);
1455     };
1456 
1457     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
1458       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
1459     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1460 
1461     int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
1462                                               MaxWaitStates);
1463     if (NumWaitStates == std::numeric_limits<int>::max())
1464       continue;
1465 
1466     int OpNo = MI->getOperandNo(&Use);
1467     unsigned Opc1 = MI1->getOpcode();
1468     int NeedWaitStates = 0;
1469     if (OpNo == SrcCIdx) {
1470       if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
1471         NeedWaitStates = 0;
1472       } else if (FullReg) {
1473         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1474              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
1475             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1476              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
1477           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
1478       } else {
1479         switch (Opc1) {
1480         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1481         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1482           if (!isXDL(ST, *MI))
1483             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
1484           break;
1485         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1486         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1487           if (!isXDL(ST, *MI))
1488             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
1489           break;
1490         default:
1491           switch (TSchedModel.computeInstrLatency(MI1)) {
1492           case 2:
1493             NeedWaitStates = isDGEMM(Opc)
1494               ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
1495               : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
1496             break;
1497           case 8:
1498             NeedWaitStates = isDGEMM(Opc)
1499               ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
1500               : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
1501             break;
1502           case 16: LLVM_FALLTHROUGH;
1503           default:
1504             NeedWaitStates = isDGEMM(Opc)
1505               ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
1506               : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
1507           }
1508         }
1509       }
1510     } else {
1511       switch (Opc1) {
1512       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1513       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1514         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
1515         break;
1516       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1517       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1518         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
1519         break;
1520       default:
1521         switch (TSchedModel.computeInstrLatency(MI1)) {
1522         case 2:
1523           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
1524           break;
1525         case 8:
1526           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
1527           break;
1528         case 16: LLVM_FALLTHROUGH;
1529         default:
1530           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
1531         }
1532       }
1533     }
1534     if (WaitStatesNeeded >= NeedWaitStates)
1535       continue;
1536 
1537     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
1538     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1539 
1540     if (WaitStatesNeeded == MaxWaitStates)
1541       break;
1542   }
1543 
1544   return WaitStatesNeeded;
1545 }
1546 
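// Check hazards on the VGPR operands of a load/store that were recently
// produced by v_accvgpr_read, or that are involved in an accvgpr read/write
// closely following a plain VALU write. Only relevant on MAI subtargets
// before gfx90a.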
1547 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1548   // On gfx90a+, the relevant hazards are checked in checkMAIVALUHazards().
1549   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
1550     return 0;
1551 
1552   int WaitStatesNeeded = 0;
1553 
1554   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1555     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1556   };
1557 
1558   for (const MachineOperand &Op : MI->explicit_uses()) {
1559     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1560       continue;
1561 
1562     Register Reg = Op.getReg();
1563 
1564     const int AccVgprReadLdStWaitStates = 2;
1565     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1566     const int MaxWaitStates = 2;
1567 
1568     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1569       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1570     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1571 
1572     if (WaitStatesNeeded == MaxWaitStates)
1573       return WaitStatesNeeded; // Early exit.
1574 
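    // Match a recent v_accvgpr_read/write when this VGPR was also written by
    // a plain (non-MAI) VALU within the last couple of wait states.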
1575     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1576       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1577           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1578         return false;
1579       auto IsVALUFn = [](const MachineInstr &MI) {
1580         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
1581       };
1582       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1583              std::numeric_limits<int>::max();
1584     };
1585 
1586     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1587       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1588     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1589   }
1590 
1591   return WaitStatesNeeded;
1592 }
1593 
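// On gfx90a+, check hazards between MFMA/DOT VGPR writes and the VALU, VMEM,
// FLAT, DS, or export instructions that read or write overlapping VGPRs.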
1594 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
1595   if (!ST.hasGFX90AInsts())
1596     return 0;
1597 
1598   auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
1599     return SIInstrInfo::isMAI(MI) &&
1600            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1601            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1602   };
1603 
1604   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
1605     return isDGEMM(MI.getOpcode());
1606   };
1607 
1608   // Hazards for MFMA instructions themselves are checked in checkMAIHazards90A().
1609   if (IsMFMAFn(*MI))
1610     return 0;
1611 
1612   int WaitStatesNeeded = 0;
1613 
1614   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
1615                        SIInstrInfo::isFLAT(*MI) ||
1616                        SIInstrInfo::isDS(*MI) ||
1617                        SIInstrInfo::isEXP(*MI);
1618   bool IsVALU = SIInstrInfo::isVALU(*MI);
1619 
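  // Reg is set per operand in the loops below; this predicate records in MFMA
  // the most recent DGEMM or XDL MFMA whose destination overlaps it.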
1620   const MachineInstr *MFMA = nullptr;
1621   unsigned Reg;
1622   auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
1623                               this](const MachineInstr &MI) {
1624     if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1625       return false;
1626     if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
1627       return false;
1628     MFMA = &MI;
1629     return true;
1630   };
1631 
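  // Likewise, record in DOT the most recent dot instruction
  // (SIInstrInfo::isDOT) whose destination overlaps Reg.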
1632   const MachineInstr *DOT = nullptr;
1633   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
1634     if (!SIInstrInfo::isDOT(MI) ||
1635         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1636       return false;
1637     DOT = &MI;
1638     return true;
1639   };
1640 
1641   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1642                                            AMDGPU::OpName::src2);
1643 
1644   if (IsMemOrExport || IsVALU) {
1645     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
1646     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
1647     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
1648     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
1649     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
1650     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
1651     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
1652     const int DotWriteSameDotReadSrcAB = 3;
1653     const int DotWriteDifferentVALURead = 3;
1654     const int MaxWaitStates = 19;
1655 
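    // RAW: each VGPR use must be far enough from a dot or DGEMM/XDL MFMA that
    // wrote an overlapping register.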
1656     for (const MachineOperand &Use : MI->explicit_uses()) {
1657       if (!Use.isReg())
1658         continue;
1659       Reg = Use.getReg();
1660 
1661       DOT = nullptr;
1662       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1663                                                      MaxWaitStates);
1664       if (DOT) {
1665         int NeedWaitStates = 0;
1666         if (DOT->getOpcode() == MI->getOpcode()) {
1667           if (&Use - &MI->getOperand(0) != SrcCIdx)
1668             NeedWaitStates = DotWriteSameDotReadSrcAB;
1669         } else {
1670           NeedWaitStates = DotWriteDifferentVALURead;
1671         }
1672 
1673         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1674         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1675       }
1676 
1677       MFMA = nullptr;
1678       WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1679                                                  MaxWaitStates);
1680       if (!MFMA)
1681         continue;
1682 
1683       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1684       int NeedWaitStates = MaxWaitStates;
1685       switch (HazardDefLatency) {
1686       case 2:
1687         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
1688         break;
1689       case 4:
1690         assert(isDGEMM(MFMA->getOpcode()));
1691         NeedWaitStates =
1692             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
1693                           : DMFMA4x4WriteVgprVALUReadWaitStates;
1694         break;
1695       case 8:
1696         NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
1697         break;
1698       case 16: LLVM_FALLTHROUGH;
1699       default:
1700         NeedWaitStates =
1701           isDGEMM(MFMA->getOpcode())
1702             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
1703                             : DMFMA16x16WriteVgprVALUReadWaitStates
1704             : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
1705         break;
1706       }
1707 
1708       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1709       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1710 
1711       if (WaitStatesNeeded == MaxWaitStates)
1712         break;
1713     }
1714   }
1715 
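  // A DGEMM followed closely by V_FMA_F64/V_FMAC_F64 needs a small fixed gap,
  // independent of register overlap.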
1716   unsigned Opc = MI->getOpcode();
1717   const int DMFMAToFMA64WaitStates = 2;
1718   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
1719        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
1720        Opc == AMDGPU::V_FMAC_F64_dpp) &&
1721       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
1722     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
1723       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
1724     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1725   }
1726 
1727   if (!IsVALU && !IsMemOrExport)
1728     return WaitStatesNeeded;
1729 
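  // WAW/WAR: each VGPR def must be far enough from a dot or DGEMM/XDL MFMA
  // that wrote an overlapping register, and from an SMFMA that read it as
  // src2.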
1730   for (const MachineOperand &Def : MI->defs()) {
1731     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
1732     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
1733     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
1734     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
1735     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
1736     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
1737     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
1738     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
1739     const int DotWriteDifferentVALUWrite = 3;
1740     const int MaxWaitStates = 19;
1741     const int MaxWarWaitStates = 15;
1742 
1743     Reg = Def.getReg();
1744 
1745     DOT = nullptr;
1746     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1747                                                    MaxWaitStates);
1748     if (DOT && DOT->getOpcode() != MI->getOpcode())
1749       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
1750                                                     WaitStatesSinceDef);
1751 
1752     MFMA = nullptr;
1753     WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1754                                                MaxWaitStates);
1755     if (MFMA) {
1756       int NeedWaitStates = MaxWaitStates;
1757       switch (TSchedModel.computeInstrLatency(MFMA)) {
1758       case 2:
1759         NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
1760         break;
1761       case 4:
1762         assert(isDGEMM(MFMA->getOpcode()));
1763         NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
1764         break;
1765       case 8:
1766         NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
1767         break;
1768       case 16: LLVM_FALLTHROUGH;
1769       default:
1770         NeedWaitStates = isDGEMM(MFMA->getOpcode())
1771                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
1772                    : SMFMA32x32WriteVgprVALUWawWaitStates;
1773         break;
1774       }
1775 
1776       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1777       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1778 
1779       if (WaitStatesNeeded == MaxWaitStates)
1780         break;
1781     }
1782 
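    // Match a preceding non-DGEMM MFMA that read this register as its src2
    // (accumulator) operand -- the WAR case handled just below.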
1783     auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
1784                              this](const MachineInstr &MI) {
1785       if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
1786           !MI.readsRegister(Reg, &TRI))
1787         return false;
1788 
1789       const MachineOperand *SrcC =
1790           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
1791       assert(SrcC);
1792       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
1793         return false;
1794 
1795       MFMA = &MI;
1796       return true;
1797     };
1798 
1799     MFMA = nullptr;
1800     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
1801                                                 MaxWarWaitStates);
1802     if (!MFMA)
1803       continue;
1804 
1805     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1806     int NeedWaitStates = MaxWaitStates;
1807     switch (HazardDefLatency) {
1808     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
1809              break;
1810     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
1811              break;
1812     case 16: LLVM_FALLTHROUGH;
1813     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
1814              break;
1815     }
1816 
1817     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
1818     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1819   }
1820 
1821   return WaitStatesNeeded;
1822 }
1823 
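// Tell the scheduler to prefer another candidate when this SUnit is an MFMA
// that would issue within the latency window of the most recent MFMA.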
1824 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1825   if (!SU->isInstr())
1826     return false;
1827 
1828   const MachineInstr *MAI = nullptr;
1829   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
1830     MAI = nullptr;
1831     if (SIInstrInfo::isMAI(MI) &&
1832         MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1833         MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1834       MAI = &MI;
1835     return MAI != nullptr;
1836   };
1837 
1838   MachineInstr *MI = SU->getInstr();
1839   if (IsMFMAFn(*MI)) {
1840     int W = getWaitStatesSince(IsMFMAFn, 16);
1841     if (MAI)
1842       return W < (int)TSchedModel.computeInstrLatency(MAI);
1843   }
1844 
1845   return false;
1846 }
1847