1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/CodeGen/MachineFunction.h"
17 #include "llvm/CodeGen/ScheduleDAG.h"
18 #include "llvm/Support/TargetParser.h"
19 
20 using namespace llvm;
21 
22 //===----------------------------------------------------------------------===//
23 // Hazard Recognizer Implementation
24 //===----------------------------------------------------------------------===//
25 
26 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
27   IsHazardRecognizerMode(false),
28   CurrCycleInstr(nullptr),
29   MF(MF),
30   ST(MF.getSubtarget<GCNSubtarget>()),
31   TII(*ST.getInstrInfo()),
32   TRI(TII.getRegisterInfo()),
33   ClauseUses(TRI.getNumRegUnits()),
34   ClauseDefs(TRI.getNumRegUnits()) {
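  // Functions that use AGPRs may contain MFMA instructions, whose hazards can
  // require looking back up to 19 wait states; otherwise 5 is sufficient.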
35   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
36   TSchedModel.init(&ST);
37 }
38 
39 void GCNHazardRecognizer::Reset() {
40   EmittedInstrs.clear();
41 }
42 
43 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
44   EmitInstruction(SU->getInstr());
45 }
46 
47 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
48   CurrCycleInstr = MI;
49 }
50 
51 static bool isDivFMas(unsigned Opcode) {
52   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
53 }
54 
55 static bool isSGetReg(unsigned Opcode) {
56   return Opcode == AMDGPU::S_GETREG_B32;
57 }
58 
59 static bool isSSetReg(unsigned Opcode) {
60   switch (Opcode) {
61   case AMDGPU::S_SETREG_B32:
62   case AMDGPU::S_SETREG_B32_mode:
63   case AMDGPU::S_SETREG_IMM32_B32:
64   case AMDGPU::S_SETREG_IMM32_B32_mode:
65     return true;
66   }
67   return false;
68 }
69 
70 static bool isRWLane(unsigned Opcode) {
71   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
72 }
73 
74 static bool isRFE(unsigned Opcode) {
75   return Opcode == AMDGPU::S_RFE_B64;
76 }
77 
78 static bool isSMovRel(unsigned Opcode) {
79   switch (Opcode) {
80   case AMDGPU::S_MOVRELS_B32:
81   case AMDGPU::S_MOVRELS_B64:
82   case AMDGPU::S_MOVRELD_B32:
83   case AMDGPU::S_MOVRELD_B64:
84     return true;
85   default:
86     return false;
87   }
88 }
89 
90 static bool isDGEMM(unsigned Opcode) {
91   return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
92          Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
93          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
94          Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
95 }
96 
97 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
98   unsigned Opcode = MI.getOpcode();
99 
100   if (!SIInstrInfo::isMAI(MI) ||
101       isDGEMM(Opcode) ||
102       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
103       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
104     return false;
105 
106   return true;
107 }
108 
109 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
110                                     const MachineInstr &MI) {
111   if (TII.isAlwaysGDS(MI.getOpcode()))
112     return true;
113 
114   switch (MI.getOpcode()) {
115   case AMDGPU::S_SENDMSG:
116   case AMDGPU::S_SENDMSGHALT:
117   case AMDGPU::S_TTRACEDATA:
118     return true;
119   // These DS opcodes don't support GDS.
120   case AMDGPU::DS_NOP:
121   case AMDGPU::DS_PERMUTE_B32:
122   case AMDGPU::DS_BPERMUTE_B32:
123     return false;
124   default:
125     if (TII.isDS(MI.getOpcode())) {
126       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
127                                            AMDGPU::OpName::gds);
128       if (MI.getOperand(GDS).getImm())
129         return true;
130     }
131     return false;
132   }
133 }
134 
135 static bool isPermlane(const MachineInstr &MI) {
136   unsigned Opcode = MI.getOpcode();
137   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
138          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
139 }
140 
141 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
142   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
143                                                      AMDGPU::OpName::simm16);
144   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
145 }
146 
147 ScheduleHazardRecognizer::HazardType
148 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
149   MachineInstr *MI = SU->getInstr();
150   // If we are not in "HazardRecognizerMode" we are being run from the
151   // scheduler: track possible stalls from hazards but don't request noops.
152   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
153 
154   if (MI->isBundle())
155    return NoHazard;
156 
157   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
158     return HazardType;
159 
160   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
161     return HazardType;
162 
163   if (checkFPAtomicToDenormModeHazard(MI) > 0)
164     return HazardType;
165 
166   if (ST.hasNoDataDepHazard())
167     return NoHazard;
168 
169   // FIXME: Should flat be considered vmem?
170   if ((SIInstrInfo::isVMEM(*MI) ||
171        SIInstrInfo::isFLAT(*MI))
172       && checkVMEMHazards(MI) > 0)
173     return HazardType;
174 
175   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
176     return HazardType;
177 
178   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
179     return HazardType;
180 
181   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
182     return HazardType;
183 
184   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
185     return HazardType;
186 
187   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
188        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
189        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
190     return HazardType;
191 
192   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
193     return HazardType;
194 
195   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
196     return HazardType;
197 
198   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
199     return HazardType;
200 
201   if (ST.hasReadM0MovRelInterpHazard() &&
202       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
203       checkReadM0Hazards(MI) > 0)
204     return HazardType;
205 
206   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
207       checkReadM0Hazards(MI) > 0)
208     return HazardType;
209 
210   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
211     return HazardType;
212 
213   if ((SIInstrInfo::isVMEM(*MI) ||
214        SIInstrInfo::isFLAT(*MI) ||
215        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
216     return HazardType;
217 
218   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
219     return HazardType;
220 
221   return NoHazard;
222 }
223 
224 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
225                                 unsigned Quantity) {
226   while (Quantity > 0) {
227     unsigned Arg = std::min(Quantity, 8u);
228     Quantity -= Arg;
229     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
230         .addImm(Arg - 1);
231   }
232 }
233 
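// Check each instruction inside a BUNDLE for hazards and, when running in
// hazard recognizer mode, insert the required s_nop instructions before it.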
234 void GCNHazardRecognizer::processBundle() {
235   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
236   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
237   // Check bundled MachineInstr's for hazards.
238   for (; MI != E && MI->isInsideBundle(); ++MI) {
239     CurrCycleInstr = &*MI;
240     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
241 
242     if (IsHazardRecognizerMode) {
243       fixHazards(CurrCycleInstr);
244 
245       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
246     }
247 
248     // It's unnecessary to track more than MaxLookAhead instructions. Since we
249     // include the bundled MI directly after, only add a maximum of
250     // (MaxLookAhead - 1) noops to EmittedInstrs.
251     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
252       EmittedInstrs.push_front(nullptr);
253 
254     EmittedInstrs.push_front(CurrCycleInstr);
255     EmittedInstrs.resize(MaxLookAhead);
256   }
257   CurrCycleInstr = nullptr;
258 }
259 
260 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
261   IsHazardRecognizerMode = true;
262   CurrCycleInstr = MI;
263   unsigned W = PreEmitNoopsCommon(MI);
264   fixHazards(MI);
265   CurrCycleInstr = nullptr;
266   return W;
267 }
268 
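// Compute how many wait states (noops) must precede \p MI, taking the maximum
// over all hazard checks that apply to this instruction.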
269 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
270   if (MI->isBundle())
271     return 0;
272 
273   int WaitStates = 0;
274 
275   if (SIInstrInfo::isSMRD(*MI))
276     return std::max(WaitStates, checkSMRDHazards(MI));
277 
278   if (ST.hasNSAtoVMEMBug())
279     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
280 
281   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
282 
283   if (ST.hasNoDataDepHazard())
284     return WaitStates;
285 
286   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
287     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
288 
289   if (SIInstrInfo::isVALU(*MI))
290     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
291 
292   if (SIInstrInfo::isDPP(*MI))
293     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
294 
295   if (isDivFMas(MI->getOpcode()))
296     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
297 
298   if (isRWLane(MI->getOpcode()))
299     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
300 
301   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
302        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
303        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
304     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
305 
306   if (MI->isInlineAsm())
307     return std::max(WaitStates, checkInlineAsmHazards(MI));
308 
309   if (isSGetReg(MI->getOpcode()))
310     return std::max(WaitStates, checkGetRegHazards(MI));
311 
312   if (isSSetReg(MI->getOpcode()))
313     return std::max(WaitStates, checkSetRegHazards(MI));
314 
315   if (isRFE(MI->getOpcode()))
316     return std::max(WaitStates, checkRFEHazards(MI));
317 
318   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
319                                            isSMovRel(MI->getOpcode())))
320     return std::max(WaitStates, checkReadM0Hazards(MI));
321 
322   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
323     return std::max(WaitStates, checkReadM0Hazards(MI));
324 
325   if (SIInstrInfo::isMAI(*MI))
326     return std::max(WaitStates, checkMAIHazards(MI));
327 
328   if (SIInstrInfo::isVMEM(*MI) ||
329       SIInstrInfo::isFLAT(*MI) ||
330       SIInstrInfo::isDS(*MI))
331     return std::max(WaitStates, checkMAILdStHazards(MI));
332 
333   return WaitStates;
334 }
335 
336 void GCNHazardRecognizer::EmitNoop() {
337   EmittedInstrs.push_front(nullptr);
338 }
339 
340 void GCNHazardRecognizer::AdvanceCycle() {
341   // When the scheduler detects a stall, it will call AdvanceCycle() without
342   // emitting any instructions.
343   if (!CurrCycleInstr) {
344     EmittedInstrs.push_front(nullptr);
345     return;
346   }
347 
348   // Do not track non-instructions which do not affect the wait states.
349   // If included, these instructions can overflow the tracking buffer so that
350   // detectable hazards are missed.
351   if (CurrCycleInstr->isMetaInstruction()) {
352     CurrCycleInstr = nullptr;
353     return;
354   }
355 
356   if (CurrCycleInstr->isBundle()) {
357     processBundle();
358     return;
359   }
360 
361   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
362 
363   // Keep track of emitted instructions
364   EmittedInstrs.push_front(CurrCycleInstr);
365 
366   // Add a nullptr for each additional wait state after the first.  Make sure
367   // not to add more than getMaxLookAhead() items to the list, since we
368   // truncate the list to that size right after this loop.
369   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
370        i < e; ++i) {
371     EmittedInstrs.push_front(nullptr);
372   }
373 
374   // getMaxLookAhead() is the largest number of wait states we will ever need
375   // to insert, so there is no point in keeping track of more than that many
376   // wait states.
377   EmittedInstrs.resize(getMaxLookAhead());
378 
379   CurrCycleInstr = nullptr;
380 }
381 
382 void GCNHazardRecognizer::RecedeCycle() {
383   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
384 }
385 
386 //===----------------------------------------------------------------------===//
387 // Helper Functions
388 //===----------------------------------------------------------------------===//
389 
390 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
391 
392 // Returns the minimum number of wait states since \p I, walking all
393 // predecessors. Only scans until \p IsExpired returns true.
394 // Can only be run in hazard recognizer mode.
395 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
396                               const MachineBasicBlock *MBB,
397                               MachineBasicBlock::const_reverse_instr_iterator I,
398                               int WaitStates, IsExpiredFn IsExpired,
399                               DenseSet<const MachineBasicBlock *> &Visited) {
400   for (auto E = MBB->instr_rend(); I != E; ++I) {
401     // Don't add WaitStates for parent BUNDLE instructions.
402     if (I->isBundle())
403       continue;
404 
405     if (IsHazard(*I))
406       return WaitStates;
407 
408     if (I->isInlineAsm() || I->isMetaInstruction())
409       continue;
410 
411     WaitStates += SIInstrInfo::getNumWaitStates(*I);
412 
413     if (IsExpired(*I, WaitStates))
414       return std::numeric_limits<int>::max();
415   }
416 
417   int MinWaitStates = WaitStates;
418   bool Found = false;
419   for (MachineBasicBlock *Pred : MBB->predecessors()) {
420     if (!Visited.insert(Pred).second)
421       continue;
422 
423     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
424                                WaitStates, IsExpired, Visited);
425 
426     if (W == std::numeric_limits<int>::max())
427       continue;
428 
429     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
430     Found = true;
431   }
432 
433   if (Found)
434     return MinWaitStates;
435 
436   return std::numeric_limits<int>::max();
437 }
438 
439 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
440                               const MachineInstr *MI, IsExpiredFn IsExpired) {
441   DenseSet<const MachineBasicBlock *> Visited;
442   return getWaitStatesSince(IsHazard, MI->getParent(),
443                             std::next(MI->getReverseIterator()),
444                             0, IsExpired, Visited);
445 }
446 
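// Returns the number of wait states since the last instruction for which
// \p IsHazard returns true, searching back at most \p Limit wait states;
// returns INT_MAX if no such instruction is found.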
447 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
448   if (IsHazardRecognizerMode) {
449     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
450       return WaitStates >= Limit;
451     };
452     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
453   }
454 
455   int WaitStates = 0;
456   for (MachineInstr *MI : EmittedInstrs) {
457     if (MI) {
458       if (IsHazard(*MI))
459         return WaitStates;
460 
461       if (MI->isInlineAsm())
462         continue;
463     }
464     ++WaitStates;
465 
466     if (WaitStates >= Limit)
467       break;
468   }
469   return std::numeric_limits<int>::max();
470 }
471 
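// Returns the number of wait states since the last instruction that both
// satisfies \p IsHazardDef and modifies \p Reg.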
472 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
473                                                IsHazardFn IsHazardDef,
474                                                int Limit) {
475   const SIRegisterInfo *TRI = ST.getRegisterInfo();
476 
477   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
478     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
479   };
480 
481   return getWaitStatesSince(IsHazardFn, Limit);
482 }
483 
484 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
485                                                   int Limit) {
486   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
487     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
488   };
489 
490   return getWaitStatesSince(IsHazardFn, Limit);
491 }
492 
493 //===----------------------------------------------------------------------===//
494 // No-op Hazard Detection
495 //===----------------------------------------------------------------------===//
496 
497 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
498                         MCRegister Reg) {
499   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
500     BV.set(*RUI);
501 }
502 
503 static void addRegsToSet(const SIRegisterInfo &TRI,
504                          iterator_range<MachineInstr::const_mop_iterator> Ops,
505                          BitVector &Set) {
506   for (const MachineOperand &Op : Ops) {
507     if (Op.isReg())
508       addRegUnits(TRI, Set, Op.getReg().asMCReg());
509   }
510 }
511 
512 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
513   // XXX: Do we need to worry about implicit operands?
514   addRegsToSet(TRI, MI.defs(), ClauseDefs);
515   addRegsToSet(TRI, MI.uses(), ClauseUses);
516 }
517 
518 static bool breaksSMEMSoftClause(MachineInstr *MI) {
519   return !SIInstrInfo::isSMRD(*MI);
520 }
521 
522 static bool breaksVMEMSoftClause(MachineInstr *MI) {
523   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
524 }
525 
526 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
527   // SMEM soft clauses are only present on VI+, and only matter if xnack is
528   // enabled.
529   if (!ST.isXNACKEnabled())
530     return 0;
531 
532   bool IsSMRD = TII.isSMRD(*MEM);
533 
534   resetClause();
535 
536   // A soft-clause is any group of consecutive SMEM instructions.  The
537   // instructions in this group may return out of order and/or may be
538   // replayed (i.e. the same instruction issued more than once).
539   //
540   // In order to handle these situations correctly we need to make sure that
541   // when a clause has more than one instruction, no instruction in the clause
542   // writes to a register that is read by another instruction in the clause
543   // (including itself). If we encounter this situation, we need to break the
544   // clause by inserting a non-SMEM instruction.
545 
546   for (MachineInstr *MI : EmittedInstrs) {
547     // When we hit a non-SMEM instruction then we have passed the start of the
548     // clause and we can stop.
549     if (!MI)
550       break;
551 
552     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
553       break;
554 
555     addClauseInst(*MI);
556   }
557 
558   if (ClauseDefs.none())
559     return 0;
560 
561   // We need to make sure not to put loads and stores in the same clause if they
562   // use the same address. For now, just start a new clause whenever we see a
563   // store.
564   if (MEM->mayStore())
565     return 1;
566 
567   addClauseInst(*MEM);
568 
569   // If the set of defs and uses intersect then we cannot add this instruction
570   // to the clause, so we have a hazard.
571   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
572 }
573 
574 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
575   int WaitStatesNeeded = 0;
576 
577   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
578 
579   // This SMRD hazard only affects SI.
580   if (!ST.hasSMRDReadVALUDefHazard())
581     return WaitStatesNeeded;
582 
583   // A read of an SGPR by an SMRD instruction requires 4 wait states when the
584   // SGPR was written by a VALU instruction.
585   int SmrdSgprWaitStates = 4;
586   auto IsHazardDefFn = [this](const MachineInstr &MI) {
587     return TII.isVALU(MI);
588   };
589   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
590     return TII.isSALU(MI);
591   };
592 
593   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
594 
595   for (const MachineOperand &Use : SMRD->uses()) {
596     if (!Use.isReg())
597       continue;
598     int WaitStatesNeededForUse =
599         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
600                                                    SmrdSgprWaitStates);
601     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
602 
603     // This fixes what appears to be undocumented hardware behavior in SI where
604     // an s_mov writing a descriptor and an s_buffer_load_dword reading that
605     // descriptor need some number of nops in between. We don't know how many we
606     // need, but let's use 4. This probably wasn't discovered before because the
607     // only case where it happens is when we expand a 64-bit pointer into a full
608     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
609     // probably never encountered in closed-source land.
610     if (IsBufferSMRD) {
611       int WaitStatesNeededForUse =
612         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
613                                                    IsBufferHazardDefFn,
614                                                    SmrdSgprWaitStates);
615       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
616     }
617   }
618 
619   return WaitStatesNeeded;
620 }
621 
622 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
623   if (!ST.hasVMEMReadSGPRVALUDefHazard())
624     return 0;
625 
626   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
627 
628   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
629   // SGPR was written by a VALU Instruction.
630   const int VmemSgprWaitStates = 5;
631   auto IsHazardDefFn = [this](const MachineInstr &MI) {
632     return TII.isVALU(MI);
633   };
634   for (const MachineOperand &Use : VMEM->uses()) {
635     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
636       continue;
637 
638     int WaitStatesNeededForUse =
639         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
640                                                    VmemSgprWaitStates);
641     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
642   }
643   return WaitStatesNeeded;
644 }
645 
646 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
647   const SIRegisterInfo *TRI = ST.getRegisterInfo();
648   const SIInstrInfo *TII = ST.getInstrInfo();
649 
650   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
651   int DppVgprWaitStates = 2;
652   int DppExecWaitStates = 5;
653   int WaitStatesNeeded = 0;
654   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
655     return TII->isVALU(MI);
656   };
657 
658   for (const MachineOperand &Use : DPP->uses()) {
659     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
660       continue;
661     int WaitStatesNeededForUse =
662         DppVgprWaitStates - getWaitStatesSinceDef(
663                                 Use.getReg(),
664                                 [](const MachineInstr &) { return true; },
665                                 DppVgprWaitStates);
666     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
667   }
668 
669   WaitStatesNeeded = std::max(
670       WaitStatesNeeded,
671       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
672                                                 DppExecWaitStates));
673 
674   return WaitStatesNeeded;
675 }
676 
677 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
678   const SIInstrInfo *TII = ST.getInstrInfo();
679 
680   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
681   // instruction.
682   const int DivFMasWaitStates = 4;
683   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
684     return TII->isVALU(MI);
685   };
686   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
687                                                DivFMasWaitStates);
688 
689   return DivFMasWaitStates - WaitStatesNeeded;
690 }
691 
692 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
693   const SIInstrInfo *TII = ST.getInstrInfo();
694   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
695 
696   const int GetRegWaitStates = 2;
697   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
698     return GetRegHWReg == getHWReg(TII, MI);
699   };
700   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
701 
702   return GetRegWaitStates - WaitStatesNeeded;
703 }
704 
705 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
706   const SIInstrInfo *TII = ST.getInstrInfo();
707   unsigned HWReg = getHWReg(TII, *SetRegInstr);
708 
709   const int SetRegWaitStates = ST.getSetRegWaitStates();
710   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
711     return HWReg == getHWReg(TII, MI);
712   };
713   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
714   return SetRegWaitStates - WaitStatesNeeded;
715 }
716 
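// If \p MI is a store that can trigger the VALU write hazard (store data wider
// than 64 bits, subject to the per-encoding conditions below), return the index
// of its store-data operand; otherwise return -1.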
717 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
718   if (!MI.mayStore())
719     return -1;
720 
721   const SIInstrInfo *TII = ST.getInstrInfo();
722   unsigned Opcode = MI.getOpcode();
723   const MCInstrDesc &Desc = MI.getDesc();
724 
725   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
726   int VDataRCID = -1;
727   if (VDataIdx != -1)
728     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
729 
730   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
731     // There is no hazard if the instruction does not use vector regs
732     // (like wbinvl1)
733     if (VDataIdx == -1)
734       return -1;
735     // For MUBUF/MTBUF instructions this hazard only exists if the
736     // instruction is not using a register in the soffset field.
737     const MachineOperand *SOffset =
738         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
739     // If we have no soffset operand, then assume this field has been
740     // hardcoded to zero.
741     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
742         (!SOffset || !SOffset->isReg()))
743       return VDataIdx;
744   }
745 
746   // MIMG instructions create a hazard if they don't use a 256-bit T# and
747   // the store size is greater than 8 bytes and they have more than two bits
748   // of their dmask set.
749   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
750   if (TII->isMIMG(MI)) {
751     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
752     assert(SRsrcIdx != -1 &&
753            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
754     (void)SRsrcIdx;
755   }
756 
757   if (TII->isFLAT(MI)) {
758     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
759     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
760       return DataIdx;
761   }
762 
763   return -1;
764 }
765 
766 int
767 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
768                                             const MachineRegisterInfo &MRI) {
769   // Helper to check for the hazard where VMEM instructions that store more than
770   // 8 bytes can have their store data overwritten by the next instruction.
771   const SIRegisterInfo *TRI = ST.getRegisterInfo();
772 
773   const int VALUWaitStates = 1;
774   int WaitStatesNeeded = 0;
775 
776   if (!TRI->isVectorRegister(MRI, Def.getReg()))
777     return WaitStatesNeeded;
778   Register Reg = Def.getReg();
779   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
780     int DataIdx = createsVALUHazard(MI);
781     return DataIdx >= 0 &&
782            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
783   };
784   int WaitStatesNeededForDef =
785     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
786   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
787 
788   return WaitStatesNeeded;
789 }
790 
791 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
792   // This checks for the hazard where VMEM instructions that store more than
793   // 8 bytes can have their store data overwritten by the next instruction.
794   if (!ST.has12DWordStoreHazard())
795     return 0;
796 
797   const MachineRegisterInfo &MRI = MF.getRegInfo();
798   int WaitStatesNeeded = 0;
799 
800   for (const MachineOperand &Def : VALU->defs()) {
801     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
802   }
803 
804   return WaitStatesNeeded;
805 }
806 
807 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
808   // This checks for hazards associated with inline asm statements.
809   // Since inline asms can contain just about anything, we use this
810   // to call/leverage other check*Hazard routines. Note that
811   // this function doesn't attempt to address all possible inline asm
812   // hazards (good luck), but is a collection of what has been
813   // problematic thus far.
814 
815   // see checkVALUHazards()
816   if (!ST.has12DWordStoreHazard())
817     return 0;
818 
819   const MachineRegisterInfo &MRI = MF.getRegInfo();
820   int WaitStatesNeeded = 0;
821 
822   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
823        I != E; ++I) {
824     const MachineOperand &Op = IA->getOperand(I);
825     if (Op.isReg() && Op.isDef()) {
826       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
827     }
828   }
829 
830   return WaitStatesNeeded;
831 }
832 
833 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
834   const SIInstrInfo *TII = ST.getInstrInfo();
835   const SIRegisterInfo *TRI = ST.getRegisterInfo();
836   const MachineRegisterInfo &MRI = MF.getRegInfo();
837 
838   const MachineOperand *LaneSelectOp =
839       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
840 
841   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
842     return 0;
843 
844   Register LaneSelectReg = LaneSelectOp->getReg();
845   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
846 
847   const int RWLaneWaitStates = 4;
848   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
849                                               RWLaneWaitStates);
850   return RWLaneWaitStates - WaitStatesSince;
851 }
852 
853 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
854   if (!ST.hasRFEHazards())
855     return 0;
856 
857   const SIInstrInfo *TII = ST.getInstrInfo();
858 
859   const int RFEWaitStates = 1;
860 
861   auto IsHazardFn = [TII](const MachineInstr &MI) {
862     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
863   };
864   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
865   return RFEWaitStates - WaitStatesNeeded;
866 }
867 
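// Instructions that implicitly read M0 need one wait state after the most
// recent SALU write of M0.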
868 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
869   const SIInstrInfo *TII = ST.getInstrInfo();
870   const int SMovRelWaitStates = 1;
871   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
872   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
873                                                    SMovRelWaitStates);
874 }
875 
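// Try to fix hazards for \p MI by inserting instructions (e.g. s_waitcnt_depctr
// or v_mov_b32) rather than by counting wait states; only called in hazard
// recognizer mode.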
876 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
877   fixVMEMtoScalarWriteHazards(MI);
878   fixVcmpxPermlaneHazards(MI);
879   fixSMEMtoVectorWriteHazards(MI);
880   fixVcmpxExecWARHazard(MI);
881   fixLdsBranchVmemWARHazard(MI);
882 }
883 
884 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
885   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
886     return false;
887 
888   const SIInstrInfo *TII = ST.getInstrInfo();
889   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
890 
891   auto IsExpiredFn = [](const MachineInstr &MI, int) {
892     unsigned Opc = MI.getOpcode();
893     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
894            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
895   };
896 
897   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
898       std::numeric_limits<int>::max())
899     return false;
900 
901   // V_NOP will be discarded by SQ.
902   // Use V_MOV_B32 v?, v?. The register must be alive, so use src0 of V_PERMLANE*,
903   // which is always a VGPR and available.
904   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
905   Register Reg = Src0->getReg();
906   bool IsUndef = Src0->isUndef();
907   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
908           TII->get(AMDGPU::V_MOV_B32_e32))
909     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
910     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
911 
912   return true;
913 }
914 
915 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
916   if (!ST.hasVMEMtoScalarWriteHazard())
917     return false;
918 
919   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
920     return false;
921 
922   if (MI->getNumDefs() == 0)
923     return false;
924 
925   const SIRegisterInfo *TRI = ST.getRegisterInfo();
926 
927   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
928     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
929         !SIInstrInfo::isFLAT(I))
930       return false;
931 
932     for (const MachineOperand &Def : MI->defs()) {
933       const MachineOperand *Op =
934           I.findRegisterUseOperand(Def.getReg(), false, TRI);
935       if (!Op)
936         continue;
937       return true;
938     }
939     return false;
940   };
941 
942   auto IsExpiredFn = [](const MachineInstr &MI, int) {
943     return SIInstrInfo::isVALU(MI) ||
944            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
945             !MI.getOperand(0).getImm()) ||
946            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
947             MI.getOperand(0).getImm() == 0xffe3);
948   };
949 
950   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
951       std::numeric_limits<int>::max())
952     return false;
953 
954   const SIInstrInfo *TII = ST.getInstrInfo();
955   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
956           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
957       .addImm(0xffe3);
958   return true;
959 }
960 
961 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
962   if (!ST.hasSMEMtoVectorWriteHazard())
963     return false;
964 
965   if (!SIInstrInfo::isVALU(*MI))
966     return false;
967 
968   unsigned SDSTName;
969   switch (MI->getOpcode()) {
970   case AMDGPU::V_READLANE_B32:
971   case AMDGPU::V_READFIRSTLANE_B32:
972     SDSTName = AMDGPU::OpName::vdst;
973     break;
974   default:
975     SDSTName = AMDGPU::OpName::sdst;
976     break;
977   }
978 
979   const SIInstrInfo *TII = ST.getInstrInfo();
980   const SIRegisterInfo *TRI = ST.getRegisterInfo();
981   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
982   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
983   if (!SDST) {
984     for (const auto &MO : MI->implicit_operands()) {
985       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
986         SDST = &MO;
987         break;
988       }
989     }
990   }
991 
992   if (!SDST)
993     return false;
994 
995   const Register SDSTReg = SDST->getReg();
996   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
997     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
998   };
999 
1000   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1001     if (TII->isSALU(MI)) {
1002       switch (MI.getOpcode()) {
1003       case AMDGPU::S_SETVSKIP:
1004       case AMDGPU::S_VERSION:
1005       case AMDGPU::S_WAITCNT_VSCNT:
1006       case AMDGPU::S_WAITCNT_VMCNT:
1007       case AMDGPU::S_WAITCNT_EXPCNT:
1008         // These instructions cannot mitigate the hazard.
1009         return false;
1010       case AMDGPU::S_WAITCNT_LGKMCNT:
1011         // Reducing lgkmcnt to 0 always mitigates the hazard.
1012         return (MI.getOperand(1).getImm() == 0) &&
1013                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1014       case AMDGPU::S_WAITCNT: {
1015         const int64_t Imm = MI.getOperand(0).getImm();
1016         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1017         return (Decoded.LgkmCnt == 0);
1018       }
1019       default:
1020         // SOPP instructions cannot mitigate the hazard.
1021         if (TII->isSOPP(MI))
1022           return false;
1023         // At this point the SALU can be assumed to mitigate the hazard
1024         // because either:
1025         // (a) it is independent of the at-risk SMEM (breaking the chain),
1026         // or
1027         // (b) it is dependent on the SMEM, in which case an appropriate
1028         //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
1029         //     SMEM instruction.
1030         return true;
1031       }
1032     }
1033     return false;
1034   };
1035 
1036   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1037       std::numeric_limits<int>::max())
1038     return false;
1039 
1040   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1041           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1042       .addImm(0);
1043   return true;
1044 }
1045 
1046 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1047   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1048     return false;
1049 
1050   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1051   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1052     return false;
1053 
1054   auto IsHazardFn = [TRI](const MachineInstr &I) {
1055     if (SIInstrInfo::isVALU(I))
1056       return false;
1057     return I.readsRegister(AMDGPU::EXEC, TRI);
1058   };
1059 
1060   const SIInstrInfo *TII = ST.getInstrInfo();
1061   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1062     if (SIInstrInfo::isVALU(MI)) {
1063       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1064         return true;
1065       for (auto MO : MI.implicit_operands())
1066         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1067           return true;
1068     }
1069     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1070         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
1071       return true;
1072     return false;
1073   };
1074 
1075   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1076       std::numeric_limits<int>::max())
1077     return false;
1078 
1079   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1080           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1081     .addImm(0xfffe);
1082   return true;
1083 }
1084 
1085 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1086   if (!ST.hasLdsBranchVmemWARHazard())
1087     return false;
1088 
1089   auto IsHazardInst = [](const MachineInstr &MI) {
1090     if (SIInstrInfo::isDS(MI))
1091       return 1;
1092     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1093       return 2;
1094     return 0;
1095   };
1096 
1097   auto InstType = IsHazardInst(*MI);
1098   if (!InstType)
1099     return false;
1100 
1101   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1102     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1103                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1104                                !I.getOperand(1).getImm());
1105   };
1106 
1107   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1108     if (!I.isBranch())
1109       return false;
1110 
1111     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1112       auto InstType2 = IsHazardInst(I);
1113       return InstType2 && InstType != InstType2;
1114     };
1115 
1116     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1117       auto InstType2 = IsHazardInst(I);
1118       if (InstType == InstType2)
1119         return true;
1120 
1121       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1122              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1123              !I.getOperand(1).getImm();
1124     };
1125 
1126     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1127            std::numeric_limits<int>::max();
1128   };
1129 
1130   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1131       std::numeric_limits<int>::max())
1132     return false;
1133 
1134   const SIInstrInfo *TII = ST.getInstrInfo();
1135   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1136           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1137     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1138     .addImm(0);
1139 
1140   return true;
1141 }
1142 
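// With the NSA-to-VMEM bug, a MUBUF/MTBUF instruction whose offset has bits 1-2
// set needs one wait state after a gfx10 NSA-encoded MIMG instruction that is at
// least 16 bytes long.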
1143 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1144   int NSAtoVMEMWaitStates = 1;
1145 
1146   if (!ST.hasNSAtoVMEMBug())
1147     return 0;
1148 
1149   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1150     return 0;
1151 
1152   const SIInstrInfo *TII = ST.getInstrInfo();
1153   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1154   if (!Offset || (Offset->getImm() & 6) == 0)
1155     return 0;
1156 
1157   auto IsHazardFn = [TII](const MachineInstr &I) {
1158     if (!SIInstrInfo::isMIMG(I))
1159       return false;
1160     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1161     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1162            TII->getInstSizeInBytes(I) >= 16;
1163   };
1164 
1165   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1166 }
1167 
1168 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1169   int FPAtomicToDenormModeWaitStates = 3;
1170 
1171   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1172     return 0;
1173 
1174   auto IsHazardFn = [](const MachineInstr &I) {
1175     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1176       return false;
1177     return SIInstrInfo::isFPAtomic(I);
1178   };
1179 
1180   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1181     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1182       return true;
1183 
1184     switch (MI.getOpcode()) {
1185     case AMDGPU::S_WAITCNT:
1186     case AMDGPU::S_WAITCNT_VSCNT:
1187     case AMDGPU::S_WAITCNT_VMCNT:
1188     case AMDGPU::S_WAITCNT_EXPCNT:
1189     case AMDGPU::S_WAITCNT_LGKMCNT:
1190     case AMDGPU::S_WAIT_IDLE:
1191       return true;
1192     default:
1193       break;
1194     }
1195 
1196     return false;
1197   };
1198 
1199   return FPAtomicToDenormModeWaitStates -
1200          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1201 }
1202 
1203 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1204   assert(SIInstrInfo::isMAI(*MI));
1205 
1206   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1207 }
1208 
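// Wait-state checks for gfx908 MAI instructions: VALU writes of EXEC or of VGPR
// sources, overlapping MFMA writes of AGPR operands, and v_accvgpr_write /
// v_accvgpr_read dependencies.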
1209 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1210   int WaitStatesNeeded = 0;
1211   unsigned Opc = MI->getOpcode();
1212 
1213   auto IsVALUFn = [](const MachineInstr &MI) {
1214     return SIInstrInfo::isVALU(MI);
1215   };
1216 
1217   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1218     const int LegacyVALUWritesVGPRWaitStates = 2;
1219     const int VALUWritesExecWaitStates = 4;
1220     const int MaxWaitStates = 4;
1221 
1222     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1223       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1224     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1225 
1226     if (WaitStatesNeeded < MaxWaitStates) {
1227       for (const MachineOperand &Use : MI->explicit_uses()) {
1228         const int MaxWaitStates = 2;
1229 
1230         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1231           continue;
1232 
1233         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1234           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1235         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1236 
1237         if (WaitStatesNeeded == MaxWaitStates)
1238           break;
1239       }
1240     }
1241   }
1242 
1243   auto IsMFMAFn = [](const MachineInstr &MI) {
1244     return SIInstrInfo::isMAI(MI) &&
1245            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1246            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1247   };
1248 
1249   for (const MachineOperand &Op : MI->explicit_operands()) {
1250     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1251       continue;
1252 
1253     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1254       continue;
1255 
1256     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1257     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1258     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1259     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1260     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1261     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1262     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1263     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1264     const int MaxWaitStates = 18;
1265     Register Reg = Op.getReg();
1266     unsigned HazardDefLatency = 0;
1267 
1268     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1269                                this](const MachineInstr &MI) {
1270       if (!IsMFMAFn(MI))
1271         return false;
1272       Register DstReg = MI.getOperand(0).getReg();
1273       if (DstReg == Reg)
1274         return false;
1275       HazardDefLatency =
1276           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1277       return TRI.regsOverlap(DstReg, Reg);
1278     };
1279 
1280     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1281                                                    MaxWaitStates);
1282     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1283     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1284     int OpNo = MI->getOperandNo(&Op);
1285     if (OpNo == SrcCIdx) {
1286       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1287     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1288       switch (HazardDefLatency) {
1289       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1290                break;
1291       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1292                break;
1293       case 16: LLVM_FALLTHROUGH;
1294       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1295                break;
1296       }
1297     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1298       switch (HazardDefLatency) {
1299       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1300                break;
1301       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1302                break;
1303       case 16: LLVM_FALLTHROUGH;
1304       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1305                break;
1306       }
1307     }
1308 
1309     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1310     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1311 
1312     if (WaitStatesNeeded == MaxWaitStates)
1313       return WaitStatesNeeded; // Early exit.
1314 
1315     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1316       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1317         return false;
1318       Register DstReg = MI.getOperand(0).getReg();
1319       return TRI.regsOverlap(Reg, DstReg);
1320     };
1321 
1322     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1323     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1324     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1325     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1326     if (OpNo == SrcCIdx)
1327       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1328     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1329       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1330 
1331     WaitStatesNeededForUse = NeedWaitStates -
1332       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1333     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1334 
1335     if (WaitStatesNeeded == MaxWaitStates)
1336       return WaitStatesNeeded; // Early exit.
1337   }
1338 
1339   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1340     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1341     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1342     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1343     const int MaxWaitStates = 13;
1344     Register DstReg = MI->getOperand(0).getReg();
1345     unsigned HazardDefLatency = 0;
1346 
1347     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
1348                          this](const MachineInstr &MI) {
1349       if (!IsMFMAFn(MI))
1350         return false;
1351       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1352       HazardDefLatency =
1353           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
1354       return TRI.regsOverlap(Reg, DstReg);
1355     };
1356 
1357     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1358     int NeedWaitStates;
1359     switch (HazardDefLatency) {
1360     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1361              break;
1362     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1363              break;
1364     case 16: LLVM_FALLTHROUGH;
1365     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1366              break;
1367     }
1368 
1369     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1370     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1371   }
1372 
1373   return WaitStatesNeeded;
1374 }
1375 
1376 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1377   int WaitStatesNeeded = 0;
1378   unsigned Opc = MI->getOpcode();
1379 
1380   auto IsMFMAFn = [](const MachineInstr &MI) {
1381     return SIInstrInfo::isMAI(MI) &&
1382            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1383            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1384   };
1385 
1386   auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
1387     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
1388   };
1389 
1390   auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
1391     return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
1392   };
1393 
1394   if (!IsMFMAFn(*MI))
1395     return WaitStatesNeeded;
1396 
1397   const int VALUWritesExecWaitStates = 4;
1398   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1399     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
1400                           VALUWritesExecWaitStates);
1401   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1402 
1403   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1404 
1405   // Loop over the uses, handling both DGEMM and S/HGEMM as the 2nd instruction.
1406   for (const MachineOperand &Use : MI->explicit_uses()) {
1407     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
1408     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
1409     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
1410     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
1411     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
1412     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
1413     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
1414     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
1415     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
1416     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
1417     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
1418     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
1419     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
1420     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
1421     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
1422     const int MaxWaitStates = 19;
1423 
1424     if (!Use.isReg())
1425       continue;
1426     unsigned Reg = Use.getReg();
1427     bool FullReg;
1428     const MachineInstr *MI1;
1429 
1430     auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
1431                                      this](const MachineInstr &MI) {
1432       if (!IsMFMAFn(MI))
1433         return false;
1434       if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
1435         return false;
1436       Register DstReg = MI.getOperand(0).getReg();
1437       FullReg = (DstReg == Reg);
1438       MI1 = &MI;
1439       return TRI.regsOverlap(DstReg, Reg);
1440     };
1441 
1442     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
1443       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
1444     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1445 
1446     int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
1447                                               MaxWaitStates);
1448     if (NumWaitStates == std::numeric_limits<int>::max())
1449       continue;
1450 
1451     int OpNo = MI->getOperandNo(&Use);
1452     unsigned Opc1 = MI1->getOpcode();
1453     int NeedWaitStates = 0;
1454     if (OpNo == SrcCIdx) {
1455       if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
1456         NeedWaitStates = 0;
1457       } else if (FullReg) {
1458         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1459              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
1460             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
1461              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
1462           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
1463       } else {
1464         switch (Opc1) {
1465         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1466         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1467           if (!isXDL(ST, *MI))
1468             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
1469           break;
1470         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1471         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1472           if (!isXDL(ST, *MI))
1473             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
1474           break;
1475         default:
1476           switch (TSchedModel.computeInstrLatency(MI1)) {
1477           case 2:
1478             NeedWaitStates = isDGEMM(Opc)
1479               ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
1480               : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
1481             break;
1482           case 8:
1483             NeedWaitStates = isDGEMM(Opc)
1484               ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
1485               : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
1486             break;
1487           case 16: LLVM_FALLTHROUGH;
1488           default:
1489             NeedWaitStates = isDGEMM(Opc)
1490               ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
1491               : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
1492           }
1493         }
1494       }
1495     } else {
1496       switch (Opc1) {
1497       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
1498       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
1499         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
1500         break;
1501       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
1502       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
1503         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
1504         break;
1505       default:
1506         switch (TSchedModel.computeInstrLatency(MI1)) {
1507         case 2:
1508           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
1509           break;
1510         case 8:
1511           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
1512           break;
1513         case 16: LLVM_FALLTHROUGH;
1514         default:
1515           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
1516         }
1517       }
1518     }
1519     if (WaitStatesNeeded >= NeedWaitStates)
1520       continue;
1521 
1522     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
1523     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1524 
1525     if (WaitStatesNeeded == MaxWaitStates)
1526       break;
1527   }
1528 
1529   return WaitStatesNeeded;
1530 }
1531 
1532 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1533   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
1534   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
1535     return 0;
1536 
1537   int WaitStatesNeeded = 0;
1538 
1539   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1540     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1541   };
1542 
1543   for (const MachineOperand &Op : MI->explicit_uses()) {
1544     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1545       continue;
1546 
1547     Register Reg = Op.getReg();
1548 
1549     const int AccVgprReadLdStWaitStates = 2;
1550     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1551     const int MaxWaitStates = 2;
1552 
1553     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1554       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1555     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1556 
1557     if (WaitStatesNeeded == MaxWaitStates)
1558       return WaitStatesNeeded; // Early exit.
1559 
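    // A VALU write of this register shortly before a v_accvgpr_read/write
    // requires an extra wait state after that access.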
1560     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1561       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1562           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1563         return false;
1564       auto IsVALUFn = [](const MachineInstr &MI) {
1565         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
1566       };
1567       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1568              std::numeric_limits<int>::max();
1569     };
1570 
1571     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1572       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1573     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1574   }
1575 
1576   return WaitStatesNeeded;
1577 }
1578 
1579 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
1580   if (!ST.hasGFX90AInsts())
1581     return 0;
1582 
1583   auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
1584     return SIInstrInfo::isMAI(MI) &&
1585            MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1586            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1587   };
1588 
1589   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
1590     return isDGEMM(MI.getOpcode());
1591   };
1592 
1593   // Hazards where the consumer is an MFMA are checked in checkMAIHazards90A()
1594   if (IsMFMAFn(*MI))
1595     return 0;
1596 
1597   int WaitStatesNeeded = 0;
1598 
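  // Classify the consumer: memory/export and VALU readers have different
  // wait-state requirements against preceding MFMA and DOT writes.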
1599   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
1600                        SIInstrInfo::isFLAT(*MI) ||
1601                        SIInstrInfo::isDS(*MI) ||
1602                        SIInstrInfo::isEXP(*MI);
1603   bool IsVALU = SIInstrInfo::isVALU(*MI);
1604 
1605   const MachineInstr *MFMA = nullptr;
1606   unsigned Reg;
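  // Record the most recent DGEMM or XDL MFMA that wrote a register
  // overlapping Reg.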
1607   auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
1608                               this](const MachineInstr &MI) {
1609     if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1610       return false;
1611     if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
1612       return false;
1613     MFMA = &MI;
1614     return true;
1615   };
1616 
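  // Record the most recent DOT instruction that wrote a register
  // overlapping Reg.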
1617   const MachineInstr *DOT = nullptr;
1618   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
1619     if (!SIInstrInfo::isDOT(MI) ||
1620         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
1621       return false;
1622     DOT = &MI;
1623     return true;
1624   };
1625 
1626   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1627                                            AMDGPU::OpName::src2);
1628 
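  // Read-after-write: an MFMA or DOT result read by a memory, export or
  // VALU instruction.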
1629   if (IsMemOrExport || IsVALU) {
1630     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
1631     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
1632     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
1633     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
1634     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
1635     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
1636     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
1637     const int DotWriteSameDotReadSrcAB = 3;
1638     const int DotWriteDifferentVALURead = 3;
1639     const int MaxWaitStates = 19;
1640 
1641     for (const MachineOperand &Use : MI->explicit_uses()) {
1642       if (!Use.isReg())
1643         continue;
1644       Reg = Use.getReg();
1645 
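      // DOT writes: the same DOT opcode only hazards when the value is read
      // as SrcA/SrcB; a different consumer needs the wait for any operand.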
1646       DOT = nullptr;
1647       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1648                                                      MaxWaitStates);
1649       if (DOT) {
1650         int NeedWaitStates = 0;
1651         if (DOT->getOpcode() == MI->getOpcode()) {
1652           if (&Use - &MI->getOperand(0) != SrcCIdx)
1653             NeedWaitStates = DotWriteSameDotReadSrcAB;
1654         } else {
1655           NeedWaitStates = DotWriteDifferentVALURead;
1656         }
1657 
1658         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1659         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1660       }
1661 
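      // DGEMM/XDL writes: wait states depend on the producer's latency class
      // and on whether the consumer is a memory/export or a VALU instruction.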
1662       MFMA = nullptr;
1663       WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1664                                                  MaxWaitStates);
1665       if (!MFMA)
1666         continue;
1667 
1668       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1669       int NeedWaitStates = MaxWaitStates;
1670       switch (HazardDefLatency) {
1671       case 2:
1672         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
1673         break;
1674       case 4:
1675         assert(isDGEMM(MFMA->getOpcode()));
1676         NeedWaitStates =
1677             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
1678                           : DMFMA4x4WriteVgprVALUReadWaitStates;
1679         break;
1680       case 8:
1681         NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
1682         break;
1683       case 16: LLVM_FALLTHROUGH;
1684       default:
1685         NeedWaitStates =
1686           isDGEMM(MFMA->getOpcode())
1687             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
1688                             : DMFMA16x16WriteVgprVALUReadWaitStates
1689             : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
1690         break;
1691       }
1692 
1693       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1694       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1695 
1696       if (WaitStatesNeeded == MaxWaitStates)
1697         break;
1698     }
1699   }
1700 
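  // A DGEMM followed closely by a double-precision FMA needs two wait states.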
1701   unsigned Opc = MI->getOpcode();
1702   const int DMFMAToFMA64WaitStates = 2;
1703   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
1704        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
1705        Opc == AMDGPU::V_FMAC_F64_dpp) &&
1706       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
1707     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
1708       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
1709     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1710   }
1711 
1712   if (!IsVALU && !IsMemOrExport)
1713     return WaitStatesNeeded;
1714 
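  // Write-after-write against MFMA/DOT results, and write-after-read against
  // an SMFMA still reading this register as SrcC.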
1715   for (const MachineOperand &Def : MI->defs()) {
1716     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
1717     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
1718     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
1719     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
1720     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
1721     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
1722     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
1723     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
1724     const int DotWriteDifferentVALUWrite = 3;
1725     const int MaxWaitStates = 19;
1726     const int MaxWarWaitStates = 15;
1727 
1728     Reg = Def.getReg();
1729 
1730     DOT = nullptr;
1731     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
1732                                                    MaxWaitStates);
1733     if (DOT && DOT->getOpcode() != MI->getOpcode())
1734       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
1735                                                     WaitStatesSinceDef);
1736 
1737     MFMA = nullptr;
1738     WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
1739                                                MaxWaitStates);
1740     if (MFMA) {
1741       int NeedWaitStates = MaxWaitStates;
1742       switch (TSchedModel.computeInstrLatency(MFMA)) {
1743       case 2:
1744         NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
1745         break;
1746       case 4:
1747         assert(isDGEMM(MFMA->getOpcode()));
1748         NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
1749         break;
1750       case 8:
1751         NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
1752         break;
1753       case 16: LLVM_FALLTHROUGH;
1754       default:
1755         NeedWaitStates = isDGEMM(MFMA->getOpcode())
1756                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
1757                    : SMFMA32x32WriteVgprVALUWawWaitStates;
1758         break;
1759       }
1760 
1761       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1762       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1763 
1764       if (WaitStatesNeeded == MaxWaitStates)
1765         break;
1766     }
1767 
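    // WAR: an SMFMA (non-DGEMM MFMA) that reads Reg as its SrcC accumulator.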
1768     auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
1769                              this](const MachineInstr &MI) {
1770       if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
1771           !MI.readsRegister(Reg, &TRI))
1772         return false;
1773 
1774       const MachineOperand *SrcC =
1775           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
1776       assert(SrcC);
1777       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
1778         return false;
1779 
1780       MFMA = &MI;
1781       return true;
1782     };
1783 
1784     MFMA = nullptr;
1785     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
1786                                                 MaxWarWaitStates);
1787     if (!MFMA)
1788       continue;
1789 
1790     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
1791     int NeedWaitStates = MaxWaitStates;
1792     switch (HazardDefLatency) {
1793     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
1794              break;
1795     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
1796              break;
1797     case 16: LLVM_FALLTHROUGH;
1798     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
1799              break;
1800     }
1801 
1802     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
1803     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1804   }
1805 
1806   return WaitStatesNeeded;
1807 }
1808 
1809 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1810   if (!SU->isInstr())
1811     return false;
1812 
1813   const MachineInstr *MAI = nullptr;
1814   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
1815     MAI = nullptr;
1816     if (SIInstrInfo::isMAI(MI) &&
1817         MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1818         MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1819       MAI = &MI;
1820     return MAI != nullptr;
1821   };
1822 
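  // Prefer a different candidate when this MFMA would issue while a previous
  // MFMA is still within its latency.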
1823   MachineInstr *MI = SU->getInstr();
1824   if (IsMFMAFn(*MI)) {
1825     int W = getWaitStatesSince(IsMFMAFn, 16);
1826     if (MAI)
1827       return W < (int)TSchedModel.computeInstrLatency(MAI);
1828   }
1829 
1830   return false;
1831 }
1832