xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 314e29ed2b78c69111635ecab94541b94c9e4c67)
1 //===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "llvm/CodeGen/MachineFunction.h"
16 #include "llvm/CodeGen/ScheduleDAG.h"
17 #include "llvm/Support/TargetParser.h"
18 
19 using namespace llvm;
20 
21 //===----------------------------------------------------------------------===//
22 // Hazard Recognizer Implementation
23 //===----------------------------------------------------------------------===//
24 
25 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
26   IsHazardRecognizerMode(false),
27   CurrCycleInstr(nullptr),
28   MF(MF),
29   ST(MF.getSubtarget<GCNSubtarget>()),
30   TII(*ST.getInstrInfo()),
31   TRI(TII.getRegisterInfo()),
32   ClauseUses(TRI.getNumRegUnits()),
33   ClauseDefs(TRI.getNumRegUnits()) {
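  // MAI hazards can require up to 18 wait states (see checkMAIHazards), so
  // widen the lookahead when AGPRs, and therefore MAI instructions, are in
  // use; 5 wait states cover the remaining hazard checks.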
34   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
35   TSchedModel.init(&ST);
36 }
37 
38 void GCNHazardRecognizer::Reset() {
39   EmittedInstrs.clear();
40 }
41 
42 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
43   EmitInstruction(SU->getInstr());
44 }
45 
46 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
47   CurrCycleInstr = MI;
48 }
49 
50 static bool isDivFMas(unsigned Opcode) {
51   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
52 }
53 
54 static bool isSGetReg(unsigned Opcode) {
55   return Opcode == AMDGPU::S_GETREG_B32;
56 }
57 
58 static bool isSSetReg(unsigned Opcode) {
59   switch (Opcode) {
60   case AMDGPU::S_SETREG_B32:
61   case AMDGPU::S_SETREG_B32_mode:
62   case AMDGPU::S_SETREG_IMM32_B32:
63   case AMDGPU::S_SETREG_IMM32_B32_mode:
64     return true;
65   }
66   return false;
67 }
68 
69 static bool isRWLane(unsigned Opcode) {
70   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
71 }
72 
73 static bool isRFE(unsigned Opcode) {
74   return Opcode == AMDGPU::S_RFE_B64;
75 }
76 
77 static bool isSMovRel(unsigned Opcode) {
78   switch (Opcode) {
79   case AMDGPU::S_MOVRELS_B32:
80   case AMDGPU::S_MOVRELS_B64:
81   case AMDGPU::S_MOVRELD_B32:
82   case AMDGPU::S_MOVRELD_B64:
83     return true;
84   default:
85     return false;
86   }
87 }
88 
89 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
90                                     const MachineInstr &MI) {
91   if (TII.isAlwaysGDS(MI.getOpcode()))
92     return true;
93 
94   switch (MI.getOpcode()) {
95   case AMDGPU::S_SENDMSG:
96   case AMDGPU::S_SENDMSGHALT:
97   case AMDGPU::S_TTRACEDATA:
98     return true;
99   // These DS opcodes don't support GDS.
100   case AMDGPU::DS_NOP:
101   case AMDGPU::DS_PERMUTE_B32:
102   case AMDGPU::DS_BPERMUTE_B32:
103     return false;
104   default:
105     if (TII.isDS(MI.getOpcode())) {
106       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
107                                            AMDGPU::OpName::gds);
108       if (MI.getOperand(GDS).getImm())
109         return true;
110     }
111     return false;
112   }
113 }
114 
115 static bool isPermlane(const MachineInstr &MI) {
116   unsigned Opcode = MI.getOpcode();
117   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
118          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
119 }
120 
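// Extract the hardware register ID from the simm16 operand of an
// s_getreg/s_setreg instruction; the offset and width fields are masked off.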
121 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
122   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
123                                                      AMDGPU::OpName::simm16);
124   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
125 }
126 
127 ScheduleHazardRecognizer::HazardType
128 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
129   MachineInstr *MI = SU->getInstr();
130   // If we are not in "HazardRecognizerMode" and are therefore being run from
131   // the scheduler, track possible stalls from hazards but don't insert noops.
132   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
133 
134   if (MI->isBundle())
135    return NoHazard;
136 
137   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
138     return HazardType;
139 
140   // FIXME: Should flat be considered vmem?
141   if ((SIInstrInfo::isVMEM(*MI) ||
142        SIInstrInfo::isFLAT(*MI))
143       && checkVMEMHazards(MI) > 0)
144     return HazardType;
145 
146   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
147     return HazardType;
148 
149   if (checkFPAtomicToDenormModeHazard(MI) > 0)
150     return HazardType;
151 
152   if (ST.hasNoDataDepHazard())
153     return NoHazard;
154 
155   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
156     return HazardType;
157 
158   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
159     return HazardType;
160 
161   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
162     return HazardType;
163 
164   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
165     return HazardType;
166 
167   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
168     return HazardType;
169 
170   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
171     return HazardType;
172 
173   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
174     return HazardType;
175 
176   if (ST.hasReadM0MovRelInterpHazard() &&
177       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
178       checkReadM0Hazards(MI) > 0)
179     return HazardType;
180 
181   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
182       checkReadM0Hazards(MI) > 0)
183     return HazardType;
184 
185   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
186     return HazardType;
187 
188   if ((SIInstrInfo::isVMEM(*MI) ||
189        SIInstrInfo::isFLAT(*MI) ||
190        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
191     return HazardType;
192 
193   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
194     return HazardType;
195 
196   return NoHazard;
197 }
198 
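// Emit S_NOPs covering \p Quantity wait states immediately before \p MI.
// A single S_NOP encodes at most 8 wait states (its immediate is the count
// minus one), so larger quantities are split across several S_NOPs.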
199 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
200                                 unsigned Quantity) {
201   while (Quantity > 0) {
202     unsigned Arg = std::min(Quantity, 8u);
203     Quantity -= Arg;
204     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
205         .addImm(Arg - 1);
206   }
207 }
208 
209 void GCNHazardRecognizer::processBundle() {
210   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
211   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
212   // Check bundled MachineInstrs for hazards.
213   for (; MI != E && MI->isInsideBundle(); ++MI) {
214     CurrCycleInstr = &*MI;
215     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
216 
217     if (IsHazardRecognizerMode) {
218       fixHazards(CurrCycleInstr);
219 
220       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
221     }
222 
223     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
224     // include the bundled MI directly after, only add a maximum of
225     // (MaxLookAhead - 1) noops to EmittedInstrs.
226     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
227       EmittedInstrs.push_front(nullptr);
228 
229     EmittedInstrs.push_front(CurrCycleInstr);
230     EmittedInstrs.resize(MaxLookAhead);
231   }
232   CurrCycleInstr = nullptr;
233 }
234 
235 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
236   IsHazardRecognizerMode = true;
237   CurrCycleInstr = MI;
238   unsigned W = PreEmitNoopsCommon(MI);
239   fixHazards(MI);
240   CurrCycleInstr = nullptr;
241   return W;
242 }
243 
244 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
245   if (MI->isBundle())
246     return 0;
247 
248   int WaitStates = 0;
249 
250   if (SIInstrInfo::isSMRD(*MI))
251     return std::max(WaitStates, checkSMRDHazards(MI));
252 
253   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
254     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
255 
256   if (ST.hasNSAtoVMEMBug())
257     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
258 
259   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
260 
261   if (ST.hasNoDataDepHazard())
262     return WaitStates;
263 
264   if (SIInstrInfo::isVALU(*MI))
265     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
266 
267   if (SIInstrInfo::isDPP(*MI))
268     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
269 
270   if (isDivFMas(MI->getOpcode()))
271     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
272 
273   if (isRWLane(MI->getOpcode()))
274     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
275 
276   if (MI->isInlineAsm())
277     return std::max(WaitStates, checkInlineAsmHazards(MI));
278 
279   if (isSGetReg(MI->getOpcode()))
280     return std::max(WaitStates, checkGetRegHazards(MI));
281 
282   if (isSSetReg(MI->getOpcode()))
283     return std::max(WaitStates, checkSetRegHazards(MI));
284 
285   if (isRFE(MI->getOpcode()))
286     return std::max(WaitStates, checkRFEHazards(MI));
287 
288   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
289                                            isSMovRel(MI->getOpcode())))
290     return std::max(WaitStates, checkReadM0Hazards(MI));
291 
292   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
293     return std::max(WaitStates, checkReadM0Hazards(MI));
294 
295   if (SIInstrInfo::isMAI(*MI))
296     return std::max(WaitStates, checkMAIHazards(MI));
297 
298   if (SIInstrInfo::isVMEM(*MI) ||
299       SIInstrInfo::isFLAT(*MI) ||
300       SIInstrInfo::isDS(*MI))
301     return std::max(WaitStates, checkMAILdStHazards(MI));
302 
303   return WaitStates;
304 }
305 
306 void GCNHazardRecognizer::EmitNoop() {
307   EmittedInstrs.push_front(nullptr);
308 }
309 
310 void GCNHazardRecognizer::AdvanceCycle() {
311   // When the scheduler detects a stall, it will call AdvanceCycle() without
312   // emitting any instructions.
313   if (!CurrCycleInstr) {
314     EmittedInstrs.push_front(nullptr);
315     return;
316   }
317 
318   // Do not track non-instructions which do not affect the wait states.
319   // If included, these instructions can lead to buffer overflow such that
320   // detectable hazards are missed.
321   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
322       CurrCycleInstr->isKill()) {
323     CurrCycleInstr = nullptr;
324     return;
325   }
326 
327   if (CurrCycleInstr->isBundle()) {
328     processBundle();
329     return;
330   }
331 
332   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
333 
334   // Keep track of emitted instructions
335   EmittedInstrs.push_front(CurrCycleInstr);
336 
337   // Add a nullptr for each additional wait state after the first.  Make sure
338   // not to add more than getMaxLookAhead() items to the list, since we
339   // truncate the list to that size right after this loop.
340   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
341        i < e; ++i) {
342     EmittedInstrs.push_front(nullptr);
343   }
344 
345   // getMaxLookAhead() is the largest number of wait states we will ever need
346   // to insert, so there is no point in keeping track of more than that many
347   // wait states.
348   EmittedInstrs.resize(getMaxLookAhead());
349 
350   CurrCycleInstr = nullptr;
351 }
352 
353 void GCNHazardRecognizer::RecedeCycle() {
354   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
355 }
356 
357 //===----------------------------------------------------------------------===//
358 // Helper Functions
359 //===----------------------------------------------------------------------===//
360 
361 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
362 
363 // Returns the minimum number of wait states since \p I, walking all
364 // predecessors. Only scans until \p IsExpired returns true.
365 // Can only be run in hazard recognizer mode.
366 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
367                               MachineBasicBlock *MBB,
368                               MachineBasicBlock::reverse_instr_iterator I,
369                               int WaitStates,
370                               IsExpiredFn IsExpired,
371                               DenseSet<const MachineBasicBlock *> &Visited) {
372   for (auto E = MBB->instr_rend(); I != E; ++I) {
373     // Don't add WaitStates for parent BUNDLE instructions.
374     if (I->isBundle())
375       continue;
376 
377     if (IsHazard(&*I))
378       return WaitStates;
379 
380     if (I->isInlineAsm() || I->isMetaInstruction())
381       continue;
382 
383     WaitStates += SIInstrInfo::getNumWaitStates(*I);
384 
385     if (IsExpired(&*I, WaitStates))
386       return std::numeric_limits<int>::max();
387   }
388 
389   int MinWaitStates = WaitStates;
390   bool Found = false;
391   for (MachineBasicBlock *Pred : MBB->predecessors()) {
392     if (!Visited.insert(Pred).second)
393       continue;
394 
395     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
396                                WaitStates, IsExpired, Visited);
397 
398     if (W == std::numeric_limits<int>::max())
399       continue;
400 
401     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
402     if (IsExpired(nullptr, MinWaitStates))
403       return MinWaitStates;
404 
405     Found = true;
406   }
407 
408   if (Found)
409     return MinWaitStates;
410 
411   return std::numeric_limits<int>::max();
412 }
413 
414 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
415                               MachineInstr *MI,
416                               IsExpiredFn IsExpired) {
417   DenseSet<const MachineBasicBlock *> Visited;
418   return getWaitStatesSince(IsHazard, MI->getParent(),
419                             std::next(MI->getReverseIterator()),
420                             0, IsExpired, Visited);
421 }
422 
423 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
424   if (IsHazardRecognizerMode) {
425     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
426       return WaitStates >= Limit;
427     };
428     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
429   }
430 
431   int WaitStates = 0;
432   for (MachineInstr *MI : EmittedInstrs) {
433     if (MI) {
434       if (IsHazard(MI))
435         return WaitStates;
436 
437       if (MI->isInlineAsm())
438         continue;
439     }
440     ++WaitStates;
441 
442     if (WaitStates >= Limit)
443       break;
444   }
445   return std::numeric_limits<int>::max();
446 }
447 
448 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
449                                                IsHazardFn IsHazardDef,
450                                                int Limit) {
451   const SIRegisterInfo *TRI = ST.getRegisterInfo();
452 
453   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
454     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
455   };
456 
457   return getWaitStatesSince(IsHazardFn, Limit);
458 }
459 
460 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
461                                                   int Limit) {
462   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
463     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
464   };
465 
466   return getWaitStatesSince(IsHazardFn, Limit);
467 }
468 
469 //===----------------------------------------------------------------------===//
470 // No-op Hazard Detection
471 //===----------------------------------------------------------------------===//
472 
473 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
474                         MCRegister Reg) {
475   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
476     BV.set(*RUI);
477 }
478 
479 static void addRegsToSet(const SIRegisterInfo &TRI,
480                          iterator_range<MachineInstr::const_mop_iterator> Ops,
481                          BitVector &Set) {
482   for (const MachineOperand &Op : Ops) {
483     if (Op.isReg())
484       addRegUnits(TRI, Set, Op.getReg().asMCReg());
485   }
486 }
487 
488 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
489   // XXX: Do we need to worry about implicit operands?
490   addRegsToSet(TRI, MI.defs(), ClauseDefs);
491   addRegsToSet(TRI, MI.uses(), ClauseUses);
492 }
493 
494 static bool breaksSMEMSoftClause(MachineInstr *MI) {
495   return !SIInstrInfo::isSMRD(*MI);
496 }
497 
498 static bool breaksVMEMSoftClause(MachineInstr *MI) {
499   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
500 }
501 
502 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
503   // SMEM soft clauses are only present on VI+, and only matter if XNACK is
504   // enabled.
505   if (!ST.isXNACKEnabled())
506     return 0;
507 
508   bool IsSMRD = TII.isSMRD(*MEM);
509 
510   resetClause();
511 
512   // A soft-clause is any group of consecutive SMEM instructions.  The
513   // instructions in this group may return out of order and/or may be
514   // replayed (i.e. the same instruction issued more than once).
515   //
516   // In order to handle these situations correctly we need to make sure that
517   // when a clause has more than one instruction, no instruction in the clause
518   // writes to a register that is read by another instruction in the clause
519   // (including itself). If we encounter this situation, we need to break the
520   // clause by inserting a non-SMEM instruction.
521 
522   for (MachineInstr *MI : EmittedInstrs) {
523     // When we hit a non-SMEM instruction then we have passed the start of the
524     // clause and we can stop.
525     if (!MI)
526       break;
527 
528     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
529       break;
530 
531     addClauseInst(*MI);
532   }
533 
534   if (ClauseDefs.none())
535     return 0;
536 
537   // We need to make sure not to put loads and stores in the same clause if they
538   // use the same address. For now, just start a new clause whenever we see a
539   // store.
540   if (MEM->mayStore())
541     return 1;
542 
543   addClauseInst(*MEM);
544 
545   // If the set of defs and uses intersect then we cannot add this instruction
546   // to the clause, so we have a hazard.
547   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
548 }
549 
550 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
551   int WaitStatesNeeded = 0;
552 
553   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
554 
555   // This SMRD hazard only affects SI.
556   if (!ST.hasSMRDReadVALUDefHazard())
557     return WaitStatesNeeded;
558 
559   // A read of an SGPR by SMRD instruction requires 4 wait states when the
560   // SGPR was written by a VALU instruction.
561   int SmrdSgprWaitStates = 4;
562   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
563   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
564 
565   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
566 
567   for (const MachineOperand &Use : SMRD->uses()) {
568     if (!Use.isReg())
569       continue;
570     int WaitStatesNeededForUse =
571         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
572                                                    SmrdSgprWaitStates);
573     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
574 
575     // This fixes what appears to be undocumented hardware behavior on SI where
576     // an s_mov writing a descriptor and an s_buffer_load_dword reading that
577     // descriptor need some number of nops in between. We don't know how many we
578     // need, but let's use 4. This probably wasn't discovered before because the
579     // only case where this happens is when we expand a 64-bit pointer into a full
580     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
581     // probably never encountered in closed-source land.
582     if (IsBufferSMRD) {
583       int WaitStatesNeededForUse =
584         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
585                                                    IsBufferHazardDefFn,
586                                                    SmrdSgprWaitStates);
587       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
588     }
589   }
590 
591   return WaitStatesNeeded;
592 }
593 
594 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
595   if (!ST.hasVMEMReadSGPRVALUDefHazard())
596     return 0;
597 
598   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
599 
600   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
601   // SGPR was written by a VALU instruction.
602   const int VmemSgprWaitStates = 5;
603   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
604   for (const MachineOperand &Use : VMEM->uses()) {
605     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
606       continue;
607 
608     int WaitStatesNeededForUse =
609         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
610                                                    VmemSgprWaitStates);
611     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
612   }
613   return WaitStatesNeeded;
614 }
615 
616 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
617   const SIRegisterInfo *TRI = ST.getRegisterInfo();
618   const SIInstrInfo *TII = ST.getInstrInfo();
619 
620   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
621   int DppVgprWaitStates = 2;
622   int DppExecWaitStates = 5;
623   int WaitStatesNeeded = 0;
624   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
625 
626   for (const MachineOperand &Use : DPP->uses()) {
627     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
628       continue;
629     int WaitStatesNeededForUse =
630         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
631                               [](MachineInstr *) { return true; },
632                               DppVgprWaitStates);
633     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
634   }
635 
636   WaitStatesNeeded = std::max(
637       WaitStatesNeeded,
638       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
639                                                 DppExecWaitStates));
640 
641   return WaitStatesNeeded;
642 }
643 
644 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
645   const SIInstrInfo *TII = ST.getInstrInfo();
646 
647   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
648   // instruction.
649   const int DivFMasWaitStates = 4;
650   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
651   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
652                                                DivFMasWaitStates);
653 
654   return DivFMasWaitStates - WaitStatesNeeded;
655 }
656 
657 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
658   const SIInstrInfo *TII = ST.getInstrInfo();
659   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
660 
661   const int GetRegWaitStates = 2;
662   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
663     return GetRegHWReg == getHWReg(TII, *MI);
664   };
665   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
666 
667   return GetRegWaitStates - WaitStatesNeeded;
668 }
669 
670 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
671   const SIInstrInfo *TII = ST.getInstrInfo();
672   unsigned HWReg = getHWReg(TII, *SetRegInstr);
673 
674   const int SetRegWaitStates = ST.getSetRegWaitStates();
675   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
676     return HWReg == getHWReg(TII, *MI);
677   };
678   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
679   return SetRegWaitStates - WaitStatesNeeded;
680 }
681 
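// If \p MI is a store subject to the 12-dword-store hazard (its store data can
// be overwritten by the next VALU write): a MUBUF/MTBUF with vdata wider than
// 64 bits and no soffset register, or a FLAT with data wider than 64 bits,
// return the index of the store-data operand; otherwise return -1.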
682 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
683   if (!MI.mayStore())
684     return -1;
685 
686   const SIInstrInfo *TII = ST.getInstrInfo();
687   unsigned Opcode = MI.getOpcode();
688   const MCInstrDesc &Desc = MI.getDesc();
689 
690   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
691   int VDataRCID = -1;
692   if (VDataIdx != -1)
693     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
694 
695   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
696     // There is no hazard if the instruction does not use vector regs
697     // (like wbinvl1)
698     if (VDataIdx == -1)
699       return -1;
700     // For MUBUF/MTBUF instructions this hazard only exists if the
701     // instruction is not using a register in the soffset field.
702     const MachineOperand *SOffset =
703         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
704     // If we have no soffset operand, then assume this field has been
705     // hardcoded to zero.
706     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
707         (!SOffset || !SOffset->isReg()))
708       return VDataIdx;
709   }
710 
711   // MIMG instructions create a hazard if they don't use a 256-bit T# and
712   // the store size is greater than 8 bytes and they have more than two bits
713   // of their dmask set.
714   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
715   if (TII->isMIMG(MI)) {
716     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
717     assert(SRsrcIdx != -1 &&
718            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
719     (void)SRsrcIdx;
720   }
721 
722   if (TII->isFLAT(MI)) {
723     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
724     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
725       return DataIdx;
726   }
727 
728   return -1;
729 }
730 
731 int
732 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
733                                             const MachineRegisterInfo &MRI) {
734   // Helper to check for the hazard where VMEM instructions that store more than
735   // 8 bytes can have their store data overwritten by the next instruction.
736   const SIRegisterInfo *TRI = ST.getRegisterInfo();
737 
738   const int VALUWaitStates = 1;
739   int WaitStatesNeeded = 0;
740 
741   if (!TRI->isVGPR(MRI, Def.getReg()))
742     return WaitStatesNeeded;
743   Register Reg = Def.getReg();
744   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
745     int DataIdx = createsVALUHazard(*MI);
746     return DataIdx >= 0 &&
747     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
748   };
749   int WaitStatesNeededForDef =
750     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
751   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
752 
753   return WaitStatesNeeded;
754 }
755 
756 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
757   // This checks for the hazard where VMEM instructions that store more than
758   // 8 bytes can have their store data overwritten by the next instruction.
759   if (!ST.has12DWordStoreHazard())
760     return 0;
761 
762   const MachineRegisterInfo &MRI = MF.getRegInfo();
763   int WaitStatesNeeded = 0;
764 
765   for (const MachineOperand &Def : VALU->defs()) {
766     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
767   }
768 
769   return WaitStatesNeeded;
770 }
771 
772 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
773   // This checks for hazards associated with inline asm statements.
774   // Since inline asms can contain just about anything, we use this
775   // to call/leverage other check*Hazard routines. Note that
776   // this function doesn't attempt to address all possible inline asm
777   // hazards (good luck), but is a collection of what has been
778   // problematic thus far.
779 
780   // see checkVALUHazards()
781   if (!ST.has12DWordStoreHazard())
782     return 0;
783 
784   const MachineRegisterInfo &MRI = MF.getRegInfo();
785   int WaitStatesNeeded = 0;
786 
787   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
788        I != E; ++I) {
789     const MachineOperand &Op = IA->getOperand(I);
790     if (Op.isReg() && Op.isDef()) {
791       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
792     }
793   }
794 
795   return WaitStatesNeeded;
796 }
797 
798 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
799   const SIInstrInfo *TII = ST.getInstrInfo();
800   const SIRegisterInfo *TRI = ST.getRegisterInfo();
801   const MachineRegisterInfo &MRI = MF.getRegInfo();
802 
803   const MachineOperand *LaneSelectOp =
804       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
805 
806   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
807     return 0;
808 
809   Register LaneSelectReg = LaneSelectOp->getReg();
810   auto IsHazardFn = [TII] (MachineInstr *MI) {
811     return TII->isVALU(*MI);
812   };
813 
814   const int RWLaneWaitStates = 4;
815   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
816                                               RWLaneWaitStates);
817   return RWLaneWaitStates - WaitStatesSince;
818 }
819 
820 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
821   if (!ST.hasRFEHazards())
822     return 0;
823 
824   const SIInstrInfo *TII = ST.getInstrInfo();
825 
826   const int RFEWaitStates = 1;
827 
828   auto IsHazardFn = [TII] (MachineInstr *MI) {
829     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
830   };
831   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
832   return RFEWaitStates - WaitStatesNeeded;
833 }
834 
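// Instructions that read M0 implicitly (v_interp*, s_movrel*, s_sendmsg,
// s_ttracedata, GDS accesses) need one wait state after an SALU writes M0.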
835 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
836   const SIInstrInfo *TII = ST.getInstrInfo();
837   const int SMovRelWaitStates = 1;
838   auto IsHazardFn = [TII] (MachineInstr *MI) {
839     return TII->isSALU(*MI);
840   };
841   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
842                                                    SMovRelWaitStates);
843 }
844 
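// Pre-emit fixups that break hazards by inserting new instructions (waitcnts,
// moves) rather than by counting wait states.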
845 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
846   fixVMEMtoScalarWriteHazards(MI);
847   fixVcmpxPermlaneHazards(MI);
848   fixSMEMtoVectorWriteHazards(MI);
849   fixVcmpxExecWARHazard(MI);
850   fixLdsBranchVmemWARHazard(MI);
851 }
852 
853 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
854   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
855     return false;
856 
857   const SIInstrInfo *TII = ST.getInstrInfo();
858   auto IsHazardFn = [TII] (MachineInstr *MI) {
859     return TII->isVOPC(*MI);
860   };
861 
862   auto IsExpiredFn = [] (MachineInstr *MI, int) {
863     if (!MI)
864       return false;
865     unsigned Opc = MI->getOpcode();
866     return SIInstrInfo::isVALU(*MI) &&
867            Opc != AMDGPU::V_NOP_e32 &&
868            Opc != AMDGPU::V_NOP_e64 &&
869            Opc != AMDGPU::V_NOP_sdwa;
870   };
871 
872   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
873       std::numeric_limits<int>::max())
874     return false;
875 
876   // V_NOP will be discarded by SQ.
877   // Use V_MOV_B32 v?, v?. The register must be alive, so use src0 of
878   // V_PERMLANE*, which is always a VGPR and therefore available.
879   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
880   Register Reg = Src0->getReg();
881   bool IsUndef = Src0->isUndef();
882   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
883           TII->get(AMDGPU::V_MOV_B32_e32))
884     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
885     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
886 
887   return true;
888 }
889 
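// A SALU or SMEM instruction overwriting a register that an in-flight
// VMEM/DS/FLAT access still reads is a hazard; a VALU, a zero s_waitcnt, or
// s_waitcnt_depctr 0xffe3 in between breaks it, and the latter is inserted
// here if nothing does.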
890 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
891   if (!ST.hasVMEMtoScalarWriteHazard())
892     return false;
893 
894   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
895     return false;
896 
897   if (MI->getNumDefs() == 0)
898     return false;
899 
900   const SIRegisterInfo *TRI = ST.getRegisterInfo();
901 
902   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
903     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
904         !SIInstrInfo::isFLAT(*I))
905       return false;
906 
907     for (const MachineOperand &Def : MI->defs()) {
908       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
909       if (!Op)
910         continue;
911       return true;
912     }
913     return false;
914   };
915 
916   auto IsExpiredFn = [](MachineInstr *MI, int) {
917     return MI && (SIInstrInfo::isVALU(*MI) ||
918                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
919                    !MI->getOperand(0).getImm()) ||
920                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
921                    MI->getOperand(0).getImm() == 0xffe3));
922   };
923 
924   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
925       std::numeric_limits<int>::max())
926     return false;
927 
928   const SIInstrInfo *TII = ST.getInstrInfo();
929   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
930           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
931       .addImm(0xffe3);
932   return true;
933 }
934 
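// A VALU writing an SGPR that a pending SMEM load still reads is a hazard.
// An s_waitcnt with lgkmcnt(0) or an intervening (non-SOPP, non-waitcnt) SALU
// mitigates it; otherwise insert "s_mov_b32 null, 0".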
935 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
936   if (!ST.hasSMEMtoVectorWriteHazard())
937     return false;
938 
939   if (!SIInstrInfo::isVALU(*MI))
940     return false;
941 
942   unsigned SDSTName;
943   switch (MI->getOpcode()) {
944   case AMDGPU::V_READLANE_B32:
945   case AMDGPU::V_READFIRSTLANE_B32:
946     SDSTName = AMDGPU::OpName::vdst;
947     break;
948   default:
949     SDSTName = AMDGPU::OpName::sdst;
950     break;
951   }
952 
953   const SIInstrInfo *TII = ST.getInstrInfo();
954   const SIRegisterInfo *TRI = ST.getRegisterInfo();
955   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
956   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
957   if (!SDST) {
958     for (const auto &MO : MI->implicit_operands()) {
959       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
960         SDST = &MO;
961         break;
962       }
963     }
964   }
965 
966   if (!SDST)
967     return false;
968 
969   const Register SDSTReg = SDST->getReg();
970   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
971     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
972   };
973 
974   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
975     if (MI) {
976       if (TII->isSALU(*MI)) {
977         switch (MI->getOpcode()) {
978         case AMDGPU::S_SETVSKIP:
979         case AMDGPU::S_VERSION:
980         case AMDGPU::S_WAITCNT_VSCNT:
981         case AMDGPU::S_WAITCNT_VMCNT:
982         case AMDGPU::S_WAITCNT_EXPCNT:
983           // These instructions cannot mitigate the hazard.
984           return false;
985         case AMDGPU::S_WAITCNT_LGKMCNT:
986           // Reducing lgkmcnt count to 0 always mitigates the hazard.
987           return (MI->getOperand(1).getImm() == 0) &&
988                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
989         case AMDGPU::S_WAITCNT: {
990           const int64_t Imm = MI->getOperand(0).getImm();
991           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
992           return (Decoded.LgkmCnt == 0);
993         }
994         default:
995           // SOPP instructions cannot mitigate the hazard.
996           if (TII->isSOPP(*MI))
997             return false;
998           // At this point the SALU can be assumed to mitigate the hazard
999           // because either:
1000           // (a) it is independent of the at-risk SMEM (breaking the chain),
1001           // or
1002           // (b) it is dependent on the SMEM, in which case an appropriate
1003           //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
1004           //     SMEM instruction.
1005           return true;
1006         }
1007       }
1008     }
1009     return false;
1010   };
1011 
1012   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1013       std::numeric_limits<int>::max())
1014     return false;
1015 
1016   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1017           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1018       .addImm(0);
1019   return true;
1020 }
1021 
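// A VALU write to EXEC following a non-VALU read of EXEC is a WAR hazard.
// A VALU that writes an SGPR, or s_waitcnt_depctr 0xfffe, breaks it; insert
// the s_waitcnt_depctr if neither is found.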
1022 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1023   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1024     return false;
1025 
1026   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1027   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1028     return false;
1029 
1030   auto IsHazardFn = [TRI] (MachineInstr *I) {
1031     if (SIInstrInfo::isVALU(*I))
1032       return false;
1033     return I->readsRegister(AMDGPU::EXEC, TRI);
1034   };
1035 
1036   const SIInstrInfo *TII = ST.getInstrInfo();
1037   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1038     if (!MI)
1039       return false;
1040     if (SIInstrInfo::isVALU(*MI)) {
1041       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1042         return true;
1043       for (auto MO : MI->implicit_operands())
1044         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1045           return true;
1046     }
1047     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1048         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1049       return true;
1050     return false;
1051   };
1052 
1053   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1054       std::numeric_limits<int>::max())
1055     return false;
1056 
1057   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1058           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1059     .addImm(0xfffe);
1060   return true;
1061 }
1062 
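// On targets with this bug, a DS access and a VMEM access on opposite sides
// of a branch form a WAR hazard unless "s_waitcnt_vscnt null, 0" separates
// them; insert one here when the pattern is detected.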
1063 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1064   if (!ST.hasLdsBranchVmemWARHazard())
1065     return false;
1066 
1067   auto IsHazardInst = [] (const MachineInstr *MI) {
1068     if (SIInstrInfo::isDS(*MI))
1069       return 1;
1070     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1071       return 2;
1072     return 0;
1073   };
1074 
1075   auto InstType = IsHazardInst(MI);
1076   if (!InstType)
1077     return false;
1078 
1079   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1080     return I && (IsHazardInst(I) ||
1081                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1082                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1083                   !I->getOperand(1).getImm()));
1084   };
1085 
1086   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1087     if (!I->isBranch())
1088       return false;
1089 
1090     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1091       auto InstType2 = IsHazardInst(I);
1092       return InstType2 && InstType != InstType2;
1093     };
1094 
1095     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1096       if (!I)
1097         return false;
1098 
1099       auto InstType2 = IsHazardInst(I);
1100       if (InstType == InstType2)
1101         return true;
1102 
1103       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1104              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1105              !I->getOperand(1).getImm();
1106     };
1107 
1108     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1109            std::numeric_limits<int>::max();
1110   };
1111 
1112   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1113       std::numeric_limits<int>::max())
1114     return false;
1115 
1116   const SIInstrInfo *TII = ST.getInstrInfo();
1117   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1118           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1119     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1120     .addImm(0);
1121 
1122   return true;
1123 }
1124 
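// Workaround for the NSA-to-VMEM bug: a MUBUF/MTBUF whose offset immediate has
// bits 1-2 set, issued right after an NSA-encoded MIMG instruction of 16 or
// more bytes, needs one extra wait state.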
1125 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1126   int NSAtoVMEMWaitStates = 1;
1127 
1128   if (!ST.hasNSAtoVMEMBug())
1129     return 0;
1130 
1131   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1132     return 0;
1133 
1134   const SIInstrInfo *TII = ST.getInstrInfo();
1135   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1136   if (!Offset || (Offset->getImm() & 6) == 0)
1137     return 0;
1138 
1139   auto IsHazardFn = [TII] (MachineInstr *I) {
1140     if (!SIInstrInfo::isMIMG(*I))
1141       return false;
1142     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1143     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1144            TII->getInstSizeInBytes(*I) >= 16;
1145   };
1146 
1147   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1148 }
1149 
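// s_denorm_mode issued within 3 wait states of a VMEM/FLAT floating-point
// atomic needs the gap filled, unless a VALU or a wait-count instruction
// already separates them.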
1150 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1151   int FPAtomicToDenormModeWaitStates = 3;
1152 
1153   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1154     return 0;
1155 
1156   auto IsHazardFn = [] (MachineInstr *I) {
1157     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1158       return false;
1159     return SIInstrInfo::isFPAtomic(*I);
1160   };
1161 
1162   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1163     if (WaitStates >= 3 || (MI && SIInstrInfo::isVALU(*MI)))
1164       return true;
1165     // MI may be null (predecessor-walk probe); treat that as not expired.
1166     switch (MI ? MI->getOpcode() : AMDGPU::INSTRUCTION_LIST_END) {
1167     case AMDGPU::S_WAITCNT:
1168     case AMDGPU::S_WAITCNT_VSCNT:
1169     case AMDGPU::S_WAITCNT_VMCNT:
1170     case AMDGPU::S_WAITCNT_EXPCNT:
1171     case AMDGPU::S_WAITCNT_LGKMCNT:
1172     case AMDGPU::S_WAIT_IDLE:
1173       return true;
1174     default:
1175       break;
1176     }
1177 
1178     return false;
1179   };
1180 
1181 
1182   return FPAtomicToDenormModeWaitStates -
1183          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1184 }
1185 
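// Wait-state checks around MAI instructions (MFMA and v_accvgpr_read/write):
// VALU writes to EXEC or to VGPR sources, MFMA writes overlapping AGPR
// operands, and v_accvgpr_write producers each impose delays that depend on
// the producing MFMA's latency, up to 18 wait states.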
1186 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1187   assert(SIInstrInfo::isMAI(*MI));
1188 
1189   int WaitStatesNeeded = 0;
1190   unsigned Opc = MI->getOpcode();
1191 
1192   auto IsVALUFn = [] (MachineInstr *MI) {
1193     return SIInstrInfo::isVALU(*MI);
1194   };
1195 
1196   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1197     const int LegacyVALUWritesVGPRWaitStates = 2;
1198     const int VALUWritesExecWaitStates = 4;
1199     const int MaxWaitStates = 4;
1200 
1201     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1202       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1203     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1204 
1205     if (WaitStatesNeeded < MaxWaitStates) {
1206       for (const MachineOperand &Use : MI->explicit_uses()) {
1207         const int MaxWaitStates = 2;
1208 
1209         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1210           continue;
1211 
1212         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1213           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1214         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1215 
1216         if (WaitStatesNeeded == MaxWaitStates)
1217           break;
1218       }
1219     }
1220   }
1221 
1222   auto IsMFMAFn = [] (MachineInstr *MI) {
1223     return SIInstrInfo::isMAI(*MI) &&
1224            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1225            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1226   };
1227 
1228   for (const MachineOperand &Op : MI->explicit_operands()) {
1229     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1230       continue;
1231 
1232     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1233       continue;
1234 
1235     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1236     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1237     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1238     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1239     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1240     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1241     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1242     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1243     const int MaxWaitStates = 18;
1244     Register Reg = Op.getReg();
1245     unsigned HazardDefLatency = 0;
1246 
1247     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1248                               (MachineInstr *MI) {
1249       if (!IsMFMAFn(MI))
1250         return false;
1251       Register DstReg = MI->getOperand(0).getReg();
1252       if (DstReg == Reg)
1253         return false;
1254       HazardDefLatency = std::max(HazardDefLatency,
1255                                   TSchedModel.computeInstrLatency(MI));
1256       return TRI.regsOverlap(DstReg, Reg);
1257     };
1258 
1259     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1260                                                    MaxWaitStates);
1261     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1262     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1263     int OpNo = MI->getOperandNo(&Op);
1264     if (OpNo == SrcCIdx) {
1265       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1266     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1267       switch (HazardDefLatency) {
1268       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1269                break;
1270       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1271                break;
1272       case 16: LLVM_FALLTHROUGH;
1273       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1274                break;
1275       }
1276     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1277       switch (HazardDefLatency) {
1278       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1279                break;
1280       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1281                break;
1282       case 16: LLVM_FALLTHROUGH;
1283       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1284                break;
1285       }
1286     }
1287 
1288     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1289     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1290 
1291     if (WaitStatesNeeded == MaxWaitStates)
1292       return WaitStatesNeeded; // Early exit.
1293 
1294     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1295       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1296         return false;
1297       Register DstReg = MI->getOperand(0).getReg();
1298       return TRI.regsOverlap(Reg, DstReg);
1299     };
1300 
1301     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1302     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1303     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1304     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1305     if (OpNo == SrcCIdx)
1306       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1307     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1308       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1309 
1310     WaitStatesNeededForUse = NeedWaitStates -
1311       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1312     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1313 
1314     if (WaitStatesNeeded == MaxWaitStates)
1315       return WaitStatesNeeded; // Early exit.
1316   }
1317 
1318   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1319     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1320     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1321     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1322     const int MaxWaitStates = 13;
1323     Register DstReg = MI->getOperand(0).getReg();
1324     unsigned HazardDefLatency = 0;
1325 
1326     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1327                          (MachineInstr *MI) {
1328       if (!IsMFMAFn(MI))
1329         return false;
1330       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1331       HazardDefLatency = std::max(HazardDefLatency,
1332                                   TSchedModel.computeInstrLatency(MI));
1333       return TRI.regsOverlap(Reg, DstReg);
1334     };
1335 
1336     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1337     int NeedWaitStates;
1338     switch (HazardDefLatency) {
1339     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1340              break;
1341     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1342              break;
1343     case 16: LLVM_FALLTHROUGH;
1344     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1345              break;
1346     }
1347 
1348     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1349     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1350   }
1351 
1352   return WaitStatesNeeded;
1353 }
1354 
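// VMEM/FLAT/DS instructions reading a VGPR written by v_accvgpr_read need two
// wait states, and roughly one wait state when a v_accvgpr_read/write follows
// a plain VALU write of that VGPR.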
1355 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1356   if (!ST.hasMAIInsts())
1357     return 0;
1358 
1359   int WaitStatesNeeded = 0;
1360 
1361   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1362     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1363   };
1364 
1365   for (const MachineOperand &Op : MI->explicit_uses()) {
1366     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1367       continue;
1368 
1369     Register Reg = Op.getReg();
1370 
1371     const int AccVgprReadLdStWaitStates = 2;
1372     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1373     const int MaxWaitStates = 2;
1374 
1375     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1376       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1377     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1378 
1379     if (WaitStatesNeeded == MaxWaitStates)
1380       return WaitStatesNeeded; // Early exit.
1381 
1382     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
1383       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1384           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1385         return false;
1386       auto IsVALUFn = [] (MachineInstr *MI) {
1387         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1388       };
1389       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1390              std::numeric_limits<int>::max();
1391     };
1392 
1393     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1394       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1395     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1396   }
1397 
1398   return WaitStatesNeeded;
1399 }
1400 
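// Ask the scheduler to prefer a different candidate when this MFMA would
// issue while an earlier MFMA is still within its latency window.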
1401 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1402   if (!SU->isInstr())
1403     return false;
1404 
1405   MachineInstr *MAI = nullptr;
1406   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1407     MAI = nullptr;
1408     if (SIInstrInfo::isMAI(*MI) &&
1409         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1410         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1411       MAI = MI;
1412     return MAI != nullptr;
1413   };
1414 
1415   MachineInstr *MI = SU->getInstr();
1416   if (IsMFMAFn(MI)) {
1417     int W = getWaitStatesSince(IsMFMAFn, 16);
1418     if (MAI)
1419       return W < (int)TSchedModel.computeInstrLatency(MAI);
1420   }
1421 
1422   return false;
1423 }
1424