1 //===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33 
34 using namespace llvm;
35 
36 //===----------------------------------------------------------------------===//
37 // Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39 
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41   IsHazardRecognizerMode(false),
42   CurrCycleInstr(nullptr),
43   MF(MF),
44   ST(MF.getSubtarget<GCNSubtarget>()),
45   TII(*ST.getInstrInfo()),
46   TRI(TII.getRegisterInfo()),
47   ClauseUses(TRI.getNumRegUnits()),
48   ClauseDefs(TRI.getNumRegUnits()) {
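  // MFMA hazards can require up to 18 wait states of lookbehind (see
  // checkMAIHazards), so use a deeper window when AGPRs are in use;
  // otherwise a smaller window of 5 is used.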
49   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50   TSchedModel.init(&ST);
51 }
52 
53 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54   EmitInstruction(SU->getInstr());
55 }
56 
57 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58   CurrCycleInstr = MI;
59 }
60 
61 static bool isDivFMas(unsigned Opcode) {
62   return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63 }
64 
65 static bool isSGetReg(unsigned Opcode) {
66   return Opcode == AMDGPU::S_GETREG_B32;
67 }
68 
69 static bool isSSetReg(unsigned Opcode) {
70   return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
71 }
72 
73 static bool isRWLane(unsigned Opcode) {
74   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
75 }
76 
77 static bool isRFE(unsigned Opcode) {
78   return Opcode == AMDGPU::S_RFE_B64;
79 }
80 
81 static bool isSMovRel(unsigned Opcode) {
82   switch (Opcode) {
83   case AMDGPU::S_MOVRELS_B32:
84   case AMDGPU::S_MOVRELS_B64:
85   case AMDGPU::S_MOVRELD_B32:
86   case AMDGPU::S_MOVRELD_B64:
87     return true;
88   default:
89     return false;
90   }
91 }
92 
93 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
94                                     const MachineInstr &MI) {
95   if (TII.isAlwaysGDS(MI.getOpcode()))
96     return true;
97 
98   switch (MI.getOpcode()) {
99   case AMDGPU::S_SENDMSG:
100   case AMDGPU::S_SENDMSGHALT:
101   case AMDGPU::S_TTRACEDATA:
102     return true;
103   // These DS opcodes don't support GDS.
104   case AMDGPU::DS_NOP:
105   case AMDGPU::DS_PERMUTE_B32:
106   case AMDGPU::DS_BPERMUTE_B32:
107     return false;
108   default:
109     if (TII.isDS(MI.getOpcode())) {
110       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
111                                            AMDGPU::OpName::gds);
112       if (MI.getOperand(GDS).getImm())
113         return true;
114     }
115     return false;
116   }
117 }
118 
119 static bool isPermlane(const MachineInstr &MI) {
120   unsigned Opcode = MI.getOpcode();
121   return Opcode == AMDGPU::V_PERMLANE16_B32 ||
122          Opcode == AMDGPU::V_PERMLANEX16_B32;
123 }
124 
125 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
126   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
127                                                      AMDGPU::OpName::simm16);
128   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
129 }
130 
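// Top-level scheduler query: report NoopHazard when any of the checks below
// indicates that wait states would be required before MI could issue.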
131 ScheduleHazardRecognizer::HazardType
132 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
133   MachineInstr *MI = SU->getInstr();
134   if (MI->isBundle())
135    return NoHazard;
136 
137   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
138     return NoopHazard;
139 
140   // FIXME: Should flat be considered vmem?
141   if ((SIInstrInfo::isVMEM(*MI) ||
142        SIInstrInfo::isFLAT(*MI))
143       && checkVMEMHazards(MI) > 0)
144     return NoopHazard;
145 
146   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
147     return NoopHazard;
148 
149   if (checkFPAtomicToDenormModeHazard(MI) > 0)
150     return NoopHazard;
151 
152   if (ST.hasNoDataDepHazard())
153     return NoHazard;
154 
155   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
156     return NoopHazard;
157 
158   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
159     return NoopHazard;
160 
161   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
162     return NoopHazard;
163 
164   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
165     return NoopHazard;
166 
167   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
168     return NoopHazard;
169 
170   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
171     return NoopHazard;
172 
173   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
174     return NoopHazard;
175 
176   if (ST.hasReadM0MovRelInterpHazard() &&
177       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
178       checkReadM0Hazards(MI) > 0)
179     return NoopHazard;
180 
181   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
182       checkReadM0Hazards(MI) > 0)
183     return NoopHazard;
184 
185   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
186     return NoopHazard;
187 
188   if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0)
189     return NoopHazard;
190 
191   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
192     return NoopHazard;
193 
194   return NoHazard;
195 }
196 
197 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
198   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
199       .addImm(0);
200 }
201 
202 void GCNHazardRecognizer::processBundle() {
203   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
204   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
205   // Check bundled MachineInstrs for hazards.
206   for (; MI != E && MI->isInsideBundle(); ++MI) {
207     CurrCycleInstr = &*MI;
208     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
209 
210     if (IsHazardRecognizerMode)
211       fixHazards(CurrCycleInstr);
212 
213     for (unsigned i = 0; i < WaitStates; ++i)
214       insertNoopInBundle(CurrCycleInstr, TII);
215 
216     // It's unnecessary to track more than MaxLookAhead instructions. Since we
217     // include the bundled MI directly after, only add a maximum of
218     // (MaxLookAhead - 1) noops to EmittedInstrs.
219     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
220       EmittedInstrs.push_front(nullptr);
221 
222     EmittedInstrs.push_front(CurrCycleInstr);
223     EmittedInstrs.resize(MaxLookAhead);
224   }
225   CurrCycleInstr = nullptr;
226 }
227 
228 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
229   IsHazardRecognizerMode = true;
230   CurrCycleInstr = MI;
231   unsigned W = PreEmitNoopsCommon(MI);
232   fixHazards(MI);
233   CurrCycleInstr = nullptr;
234   return W;
235 }
236 
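// Compute how many wait states must precede MI, taking the maximum over all
// hazard checks that apply to this kind of instruction.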
237 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
238   if (MI->isBundle())
239     return 0;
240 
241   int WaitStates = 0;
242 
243   if (SIInstrInfo::isSMRD(*MI))
244     return std::max(WaitStates, checkSMRDHazards(MI));
245 
246   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
247     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
248 
249   if (ST.hasNSAtoVMEMBug())
250     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
251 
252   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
253 
254   if (ST.hasNoDataDepHazard())
255     return WaitStates;
256 
257   if (SIInstrInfo::isVALU(*MI))
258     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
259 
260   if (SIInstrInfo::isDPP(*MI))
261     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
262 
263   if (isDivFMas(MI->getOpcode()))
264     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
265 
266   if (isRWLane(MI->getOpcode()))
267     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
268 
269   if (MI->isInlineAsm())
270     return std::max(WaitStates, checkInlineAsmHazards(MI));
271 
272   if (isSGetReg(MI->getOpcode()))
273     return std::max(WaitStates, checkGetRegHazards(MI));
274 
275   if (isSSetReg(MI->getOpcode()))
276     return std::max(WaitStates, checkSetRegHazards(MI));
277 
278   if (isRFE(MI->getOpcode()))
279     return std::max(WaitStates, checkRFEHazards(MI));
280 
281   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
282                                            isSMovRel(MI->getOpcode())))
283     return std::max(WaitStates, checkReadM0Hazards(MI));
284 
285   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
286     return std::max(WaitStates, checkReadM0Hazards(MI));
287 
288   if (SIInstrInfo::isMAI(*MI))
289     return std::max(WaitStates, checkMAIHazards(MI));
290 
291   if (MI->mayLoadOrStore())
292     return std::max(WaitStates, checkMAILdStHazards(MI));
293 
294   return WaitStates;
295 }
296 
297 void GCNHazardRecognizer::EmitNoop() {
298   EmittedInstrs.push_front(nullptr);
299 }
300 
301 void GCNHazardRecognizer::AdvanceCycle() {
302   // When the scheduler detects a stall, it will call AdvanceCycle() without
303   // emitting any instructions.
304   if (!CurrCycleInstr)
305     return;
306 
307   // Do not track non-instructions which do not affect the wait states.
308   // If included, these instructions can lead to buffer overflow such that
309   // detectable hazards are missed.
310   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
311       CurrCycleInstr->isKill())
312     return;
313 
314   if (CurrCycleInstr->isBundle()) {
315     processBundle();
316     return;
317   }
318 
319   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
320 
321   // Keep track of emitted instructions
322   EmittedInstrs.push_front(CurrCycleInstr);
323 
324   // Add a nullptr for each additional wait state after the first.  Make sure
325   // not to add more than getMaxLookAhead() items to the list, since we
326   // truncate the list to that size right after this loop.
327   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
328        i < e; ++i) {
329     EmittedInstrs.push_front(nullptr);
330   }
331 
332   // getMaxLookAhead() is the largest number of wait states we will ever need
333   // to insert, so there is no point in keeping track of more than that many
334   // wait states.
335   EmittedInstrs.resize(getMaxLookAhead());
336 
337   CurrCycleInstr = nullptr;
338 }
339 
340 void GCNHazardRecognizer::RecedeCycle() {
341   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
342 }
343 
344 //===----------------------------------------------------------------------===//
345 // Helper Functions
346 //===----------------------------------------------------------------------===//
347 
348 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
349 
350 // Returns the minimum number of wait states since \p I, walking all predecessors.
351 // Only scans until \p IsExpired returns true.
352 // Can only be run in hazard recognizer mode.
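// Returns std::numeric_limits<int>::max() if the search expires before a
// hazard is found.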
353 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
354                               MachineBasicBlock *MBB,
355                               MachineBasicBlock::reverse_instr_iterator I,
356                               int WaitStates,
357                               IsExpiredFn IsExpired,
358                               DenseSet<const MachineBasicBlock *> &Visited) {
359   for (auto E = MBB->instr_rend(); I != E; ++I) {
360     // Don't add WaitStates for parent BUNDLE instructions.
361     if (I->isBundle())
362       continue;
363 
364     if (IsHazard(&*I))
365       return WaitStates;
366 
367     if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
368       continue;
369 
370     WaitStates += SIInstrInfo::getNumWaitStates(*I);
371 
372     if (IsExpired(&*I, WaitStates))
373       return std::numeric_limits<int>::max();
374   }
375 
376   int MinWaitStates = WaitStates;
377   bool Found = false;
378   for (MachineBasicBlock *Pred : MBB->predecessors()) {
379     if (!Visited.insert(Pred).second)
380       continue;
381 
382     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
383                                WaitStates, IsExpired, Visited);
384 
385     if (W == std::numeric_limits<int>::max())
386       continue;
387 
388     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
389     if (IsExpired(nullptr, MinWaitStates))
390       return MinWaitStates;
391 
392     Found = true;
393   }
394 
395   if (Found)
396     return MinWaitStates;
397 
398   return std::numeric_limits<int>::max();
399 }
400 
401 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
402                               MachineInstr *MI,
403                               IsExpiredFn IsExpired) {
404   DenseSet<const MachineBasicBlock *> Visited;
405   return getWaitStatesSince(IsHazard, MI->getParent(),
406                             std::next(MI->getReverseIterator()),
407                             0, IsExpired, Visited);
408 }
409 
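// Returns the number of wait states that have elapsed since the most recent
// instruction satisfying IsHazard, or INT_MAX if none is found within Limit.
// Callers typically compute "Required - getWaitStatesSince(...)"; a result
// that is zero or negative means no extra wait states are needed.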
410 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
411   if (IsHazardRecognizerMode) {
412     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
413       return WaitStates >= Limit;
414     };
415     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
416   }
417 
418   int WaitStates = 0;
419   for (MachineInstr *MI : EmittedInstrs) {
420     if (MI) {
421       if (IsHazard(MI))
422         return WaitStates;
423 
424       if (MI->isInlineAsm())
425         continue;
426     }
427     ++WaitStates;
428 
429     if (WaitStates >= Limit)
430       break;
431   }
432   return std::numeric_limits<int>::max();
433 }
434 
435 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
436                                                IsHazardFn IsHazardDef,
437                                                int Limit) {
438   const SIRegisterInfo *TRI = ST.getRegisterInfo();
439 
440   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
441     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
442   };
443 
444   return getWaitStatesSince(IsHazardFn, Limit);
445 }
446 
447 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
448                                                   int Limit) {
449   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
450     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
451   };
452 
453   return getWaitStatesSince(IsHazardFn, Limit);
454 }
455 
456 //===----------------------------------------------------------------------===//
457 // No-op Hazard Detection
458 //===----------------------------------------------------------------------===//
459 
460 static void addRegUnits(const SIRegisterInfo &TRI,
461                         BitVector &BV, unsigned Reg) {
462   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
463     BV.set(*RUI);
464 }
465 
466 static void addRegsToSet(const SIRegisterInfo &TRI,
467                          iterator_range<MachineInstr::const_mop_iterator> Ops,
468                          BitVector &Set) {
469   for (const MachineOperand &Op : Ops) {
470     if (Op.isReg())
471       addRegUnits(TRI, Set, Op.getReg());
472   }
473 }
474 
475 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
476   // XXX: Do we need to worry about implicit operands?
477   addRegsToSet(TRI, MI.defs(), ClauseDefs);
478   addRegsToSet(TRI, MI.uses(), ClauseUses);
479 }
480 
481 static bool breaksSMEMSoftClause(MachineInstr *MI) {
482   return !SIInstrInfo::isSMRD(*MI);
483 }
484 
485 static bool breaksVMEMSoftClause(MachineInstr *MI) {
486   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
487 }
488 
489 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
489   // SMEM soft clauses are only present on VI+, and only matter if xnack is
491   // enabled.
492   if (!ST.isXNACKEnabled())
493     return 0;
494 
495   bool IsSMRD = TII.isSMRD(*MEM);
496 
497   resetClause();
498 
499   // A soft-clause is any group of consecutive SMEM instructions.  The
500   // instructions in this group may return out of order and/or may be
501   // replayed (i.e. the same instruction issued more than once).
502   //
503   // In order to handle these situations correctly we need to make sure that
504   // when a clause has more than one instruction, no instruction in the clause
505   // writes to a register that is read by another instruction in the clause
506   // (including itself). If we encounter this situation, we need to break the
507   // clause by inserting a non-SMEM instruction.
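  // For example, an s_load_dword writing s4 followed by an s_load_dword whose
  // base pointer is s[4:5] would read a register written earlier in the same
  // clause, so the clause must be broken between the two loads.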
508 
509   for (MachineInstr *MI : EmittedInstrs) {
510     // When we hit a non-SMEM instruction then we have passed the start of the
511     // clause and we can stop.
512     if (!MI)
513       break;
514 
515     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
516       break;
517 
518     addClauseInst(*MI);
519   }
520 
521   if (ClauseDefs.none())
522     return 0;
523 
524   // We need to make sure not to put loads and stores in the same clause if they
525   // use the same address. For now, just start a new clause whenever we see a
526   // store.
527   if (MEM->mayStore())
528     return 1;
529 
530   addClauseInst(*MEM);
531 
532   // If the set of defs and uses intersect then we cannot add this instruction
533   // to the clause, so we have a hazard.
534   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
535 }
536 
537 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
538   int WaitStatesNeeded = 0;
539 
540   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
541 
542   // This SMRD hazard only affects SI.
543   if (!ST.hasSMRDReadVALUDefHazard())
544     return WaitStatesNeeded;
545 
546   // A read of an SGPR by SMRD instruction requires 4 wait states when the
547   // SGPR was written by a VALU instruction.
548   int SmrdSgprWaitStates = 4;
549   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
550   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
551 
552   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
553 
554   for (const MachineOperand &Use : SMRD->uses()) {
555     if (!Use.isReg())
556       continue;
557     int WaitStatesNeededForUse =
558         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
559                                                    SmrdSgprWaitStates);
560     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
561 
562     // This fixes what appears to be undocumented hardware behavior in SI where
563     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
564     // need some number of nops in between. We don't know how many we need, but
565     // let's use 4. This wasn't discovered before probably because the only
566     // case when this happens is when we expand a 64-bit pointer into a full
567     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
568     // probably never encountered in closed-source land.
569     if (IsBufferSMRD) {
570       int WaitStatesNeededForUse =
571         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
572                                                    IsBufferHazardDefFn,
573                                                    SmrdSgprWaitStates);
574       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
575     }
576   }
577 
578   return WaitStatesNeeded;
579 }
580 
581 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
582   if (!ST.hasVMEMReadSGPRVALUDefHazard())
583     return 0;
584 
585   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
586 
587   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
588   // SGPR was written by a VALU Instruction.
589   const int VmemSgprWaitStates = 5;
590   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
591   for (const MachineOperand &Use : VMEM->uses()) {
592     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
593       continue;
594 
595     int WaitStatesNeededForUse =
596         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
597                                                    VmemSgprWaitStates);
598     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
599   }
600   return WaitStatesNeeded;
601 }
602 
603 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
604   const SIRegisterInfo *TRI = ST.getRegisterInfo();
605   const SIInstrInfo *TII = ST.getInstrInfo();
606 
607   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
608   int DppVgprWaitStates = 2;
609   int DppExecWaitStates = 5;
610   int WaitStatesNeeded = 0;
611   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
612 
613   for (const MachineOperand &Use : DPP->uses()) {
614     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
615       continue;
616     int WaitStatesNeededForUse =
617         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
618                               [](MachineInstr *) { return true; },
619                               DppVgprWaitStates);
620     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
621   }
622 
623   WaitStatesNeeded = std::max(
624       WaitStatesNeeded,
625       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
626                                                 DppExecWaitStates));
627 
628   return WaitStatesNeeded;
629 }
630 
631 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
632   const SIInstrInfo *TII = ST.getInstrInfo();
633 
634   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
635   // instruction.
636   const int DivFMasWaitStates = 4;
637   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
638   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
639                                                DivFMasWaitStates);
640 
641   return DivFMasWaitStates - WaitStatesNeeded;
642 }
643 
644 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
645   const SIInstrInfo *TII = ST.getInstrInfo();
646   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
647 
648   const int GetRegWaitStates = 2;
649   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
650     return GetRegHWReg == getHWReg(TII, *MI);
651   };
652   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
653 
654   return GetRegWaitStates - WaitStatesNeeded;
655 }
656 
657 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
658   const SIInstrInfo *TII = ST.getInstrInfo();
659   unsigned HWReg = getHWReg(TII, *SetRegInstr);
660 
661   const int SetRegWaitStates = ST.getSetRegWaitStates();
662   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
663     return HWReg == getHWReg(TII, *MI);
664   };
665   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
666   return SetRegWaitStates - WaitStatesNeeded;
667 }
668 
669 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
670   if (!MI.mayStore())
671     return -1;
672 
673   const SIInstrInfo *TII = ST.getInstrInfo();
674   unsigned Opcode = MI.getOpcode();
675   const MCInstrDesc &Desc = MI.getDesc();
676 
677   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
678   int VDataRCID = -1;
679   if (VDataIdx != -1)
680     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
681 
682   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
683     // There is no hazard if the instruction does not use vector regs
684     // (like wbinvl1)
685     if (VDataIdx == -1)
686       return -1;
687     // For MUBUF/MTBUF instructions this hazard only exists if the
688     // instruction is not using a register in the soffset field.
689     const MachineOperand *SOffset =
690         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
691     // If we have no soffset operand, then assume this field has been
692     // hardcoded to zero.
693     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
694         (!SOffset || !SOffset->isReg()))
695       return VDataIdx;
696   }
697 
698   // MIMG instructions create a hazard if they don't use a 256-bit T# and
699   // the store size is greater than 8 bytes and they have more than two bits
700   // of their dmask set.
701   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
702   if (TII->isMIMG(MI)) {
703     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
704     assert(SRsrcIdx != -1 &&
705            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
706     (void)SRsrcIdx;
707   }
708 
709   if (TII->isFLAT(MI)) {
710     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
711     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
712       return DataIdx;
713   }
714 
715   return -1;
716 }
717 
718 int
719 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
720                                             const MachineRegisterInfo &MRI) {
721   // Helper to check for the hazard where VMEM instructions that store more than
722   // 8 bytes can have their store data overwritten by the next instruction.
723   const SIRegisterInfo *TRI = ST.getRegisterInfo();
724 
725   const int VALUWaitStates = 1;
726   int WaitStatesNeeded = 0;
727 
728   if (!TRI->isVGPR(MRI, Def.getReg()))
729     return WaitStatesNeeded;
730   Register Reg = Def.getReg();
731   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
732     int DataIdx = createsVALUHazard(*MI);
733     return DataIdx >= 0 &&
734     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
735   };
736   int WaitStatesNeededForDef =
737     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
738   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
739 
740   return WaitStatesNeeded;
741 }
742 
743 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
744   // This checks for the hazard where VMEM instructions that store more than
745   // 8 bytes can have their store data overwritten by the next instruction.
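  // For example, a buffer_store_dwordx4 that does not use an SGPR soffset,
  // immediately followed by a VALU writing one of the four data VGPRs,
  // requires a wait state in between.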
746   if (!ST.has12DWordStoreHazard())
747     return 0;
748 
749   const MachineRegisterInfo &MRI = MF.getRegInfo();
750   int WaitStatesNeeded = 0;
751 
752   for (const MachineOperand &Def : VALU->defs()) {
753     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
754   }
755 
756   return WaitStatesNeeded;
757 }
758 
759 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
760   // This checks for hazards associated with inline asm statements.
761   // Since inline asms can contain just about anything, we use this
762   // to call/leverage other check*Hazard routines. Note that
763   // this function doesn't attempt to address all possible inline asm
764   // hazards (good luck), but is a collection of what has been
765   // problematic thus far.
766 
767   // see checkVALUHazards()
768   if (!ST.has12DWordStoreHazard())
769     return 0;
770 
771   const MachineRegisterInfo &MRI = MF.getRegInfo();
772   int WaitStatesNeeded = 0;
773 
774   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
775        I != E; ++I) {
776     const MachineOperand &Op = IA->getOperand(I);
777     if (Op.isReg() && Op.isDef()) {
778       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
779     }
780   }
781 
782   return WaitStatesNeeded;
783 }
784 
785 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
786   const SIInstrInfo *TII = ST.getInstrInfo();
787   const SIRegisterInfo *TRI = ST.getRegisterInfo();
788   const MachineRegisterInfo &MRI = MF.getRegInfo();
789 
790   const MachineOperand *LaneSelectOp =
791       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
792 
793   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
794     return 0;
795 
796   Register LaneSelectReg = LaneSelectOp->getReg();
797   auto IsHazardFn = [TII] (MachineInstr *MI) {
798     return TII->isVALU(*MI);
799   };
800 
801   const int RWLaneWaitStates = 4;
802   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
803                                               RWLaneWaitStates);
804   return RWLaneWaitStates - WaitStatesSince;
805 }
806 
807 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
808   if (!ST.hasRFEHazards())
809     return 0;
810 
811   const SIInstrInfo *TII = ST.getInstrInfo();
812 
813   const int RFEWaitStates = 1;
814 
815   auto IsHazardFn = [TII] (MachineInstr *MI) {
816     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
817   };
818   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
819   return RFEWaitStates - WaitStatesNeeded;
820 }
821 
822 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
823   const SIInstrInfo *TII = ST.getInstrInfo();
824   const int SMovRelWaitStates = 1;
825   auto IsHazardFn = [TII] (MachineInstr *MI) {
826     return TII->isSALU(*MI);
827   };
828   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
829                                                    SMovRelWaitStates);
830 }
831 
832 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
833   fixVMEMtoScalarWriteHazards(MI);
834   fixVcmpxPermlaneHazards(MI);
835   fixSMEMtoVectorWriteHazards(MI);
836   fixVcmpxExecWARHazard(MI);
837   fixLdsBranchVmemWARHazard(MI);
838 }
839 
840 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
841   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
842     return false;
843 
844   const SIInstrInfo *TII = ST.getInstrInfo();
845   auto IsHazardFn = [TII] (MachineInstr *MI) {
846     return TII->isVOPC(*MI);
847   };
848 
849   auto IsExpiredFn = [] (MachineInstr *MI, int) {
850     if (!MI)
851       return false;
852     unsigned Opc = MI->getOpcode();
853     return SIInstrInfo::isVALU(*MI) &&
854            Opc != AMDGPU::V_NOP_e32 &&
855            Opc != AMDGPU::V_NOP_e64 &&
856            Opc != AMDGPU::V_NOP_sdwa;
857   };
858 
859   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
860       std::numeric_limits<int>::max())
861     return false;
862 
863   // V_NOP will be discarded by SQ.
864   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
865   // which is always a VGPR and available.
866   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
867   Register Reg = Src0->getReg();
868   bool IsUndef = Src0->isUndef();
869   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
870           TII->get(AMDGPU::V_MOV_B32_e32))
871     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
872     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
873 
874   return true;
875 }
876 
877 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
878   if (!ST.hasVMEMtoScalarWriteHazard())
879     return false;
880 
881   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
882     return false;
883 
884   if (MI->getNumDefs() == 0)
885     return false;
886 
887   const SIRegisterInfo *TRI = ST.getRegisterInfo();
888 
889   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
890     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
891         !SIInstrInfo::isFLAT(*I))
892       return false;
893 
894     for (const MachineOperand &Def : MI->defs()) {
895       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
896       if (!Op)
897         continue;
898       return true;
899     }
900     return false;
901   };
902 
903   auto IsExpiredFn = [](MachineInstr *MI, int) {
904     return MI && (SIInstrInfo::isVALU(*MI) ||
905                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
906                    !MI->getOperand(0).getImm()) ||
907                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
908                    MI->getOperand(0).getImm() == 0xffe3));
909   };
910 
911   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
912       std::numeric_limits<int>::max())
913     return false;
914 
915   const SIInstrInfo *TII = ST.getInstrInfo();
916   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
917           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
918       .addImm(0xffe3);
919   return true;
920 }
921 
922 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
923   if (!ST.hasSMEMtoVectorWriteHazard())
924     return false;
925 
926   if (!SIInstrInfo::isVALU(*MI))
927     return false;
928 
929   unsigned SDSTName;
930   switch (MI->getOpcode()) {
931   case AMDGPU::V_READLANE_B32:
932   case AMDGPU::V_READLANE_B32_gfx10:
933   case AMDGPU::V_READFIRSTLANE_B32:
934     SDSTName = AMDGPU::OpName::vdst;
935     break;
936   default:
937     SDSTName = AMDGPU::OpName::sdst;
938     break;
939   }
940 
941   const SIInstrInfo *TII = ST.getInstrInfo();
942   const SIRegisterInfo *TRI = ST.getRegisterInfo();
943   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
944   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
945   if (!SDST) {
946     for (const auto &MO : MI->implicit_operands()) {
947       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
948         SDST = &MO;
949         break;
950       }
951     }
952   }
953 
954   if (!SDST)
955     return false;
956 
957   const Register SDSTReg = SDST->getReg();
958   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
959     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
960   };
961 
962   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
963     if (MI) {
964       if (TII->isSALU(*MI)) {
965         switch (MI->getOpcode()) {
966         case AMDGPU::S_SETVSKIP:
967         case AMDGPU::S_VERSION:
968         case AMDGPU::S_WAITCNT_VSCNT:
969         case AMDGPU::S_WAITCNT_VMCNT:
970         case AMDGPU::S_WAITCNT_EXPCNT:
971           // These instructions cannot mitigate the hazard.
972           return false;
973         case AMDGPU::S_WAITCNT_LGKMCNT:
974           // Reducing lgkmcnt count to 0 always mitigates the hazard.
975           return (MI->getOperand(1).getImm() == 0) &&
976                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
977         case AMDGPU::S_WAITCNT: {
978           const int64_t Imm = MI->getOperand(0).getImm();
979           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
980           return (Decoded.LgkmCnt == 0);
981         }
982         default:
983           // SOPP instructions cannot mitigate the hazard.
984           if (TII->isSOPP(*MI))
985             return false;
986           // At this point the SALU can be assumed to mitigate the hazard
987           // because either:
988           // (a) it is independent of the at risk SMEM (breaking chain),
989           // or
990           // (b) it is dependent on the SMEM, in which case an appropriate
991           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
992           //     SMEM instruction.
993           return true;
994         }
995       }
996     }
997     return false;
998   };
999 
1000   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1001       std::numeric_limits<int>::max())
1002     return false;
1003 
1004   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1005           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1006       .addImm(0);
1007   return true;
1008 }
1009 
1010 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1011   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1012     return false;
1013 
1014   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1015   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1016     return false;
1017 
1018   auto IsHazardFn = [TRI] (MachineInstr *I) {
1019     if (SIInstrInfo::isVALU(*I))
1020       return false;
1021     return I->readsRegister(AMDGPU::EXEC, TRI);
1022   };
1023 
1024   const SIInstrInfo *TII = ST.getInstrInfo();
1025   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1026     if (!MI)
1027       return false;
1028     if (SIInstrInfo::isVALU(*MI)) {
1029       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1030         return true;
1031       for (auto MO : MI->implicit_operands())
1032         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1033           return true;
1034     }
1035     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1036         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1037       return true;
1038     return false;
1039   };
1040 
1041   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1042       std::numeric_limits<int>::max())
1043     return false;
1044 
1045   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1046           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1047     .addImm(0xfffe);
1048   return true;
1049 }
1050 
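// This hazard involves a DS (LDS) access and a VMEM access on opposite sides
// of a branch with no intervening "s_waitcnt_vscnt null, 0"; the fix below
// inserts that waitcnt before MI when such a pattern is found.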
1051 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1052   if (!ST.hasLdsBranchVmemWARHazard())
1053     return false;
1054 
1055   auto IsHazardInst = [] (const MachineInstr *MI) {
1056     if (SIInstrInfo::isDS(*MI))
1057       return 1;
1058     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1059       return 2;
1060     return 0;
1061   };
1062 
1063   auto InstType = IsHazardInst(MI);
1064   if (!InstType)
1065     return false;
1066 
1067   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1068     return I && (IsHazardInst(I) ||
1069                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1070                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1071                   !I->getOperand(1).getImm()));
1072   };
1073 
1074   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1075     if (!I->isBranch())
1076       return false;
1077 
1078     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1079       auto InstType2 = IsHazardInst(I);
1080       return InstType2 && InstType != InstType2;
1081     };
1082 
1083     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1084       if (!I)
1085         return false;
1086 
1087       auto InstType2 = IsHazardInst(I);
1088       if (InstType == InstType2)
1089         return true;
1090 
1091       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1092              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1093              !I->getOperand(1).getImm();
1094     };
1095 
1096     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1097            std::numeric_limits<int>::max();
1098   };
1099 
1100   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1101       std::numeric_limits<int>::max())
1102     return false;
1103 
1104   const SIInstrInfo *TII = ST.getInstrInfo();
1105   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1106           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1107     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1108     .addImm(0);
1109 
1110   return true;
1111 }
1112 
1113 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1114   int NSAtoVMEMWaitStates = 1;
1115 
1116   if (!ST.hasNSAtoVMEMBug())
1117     return 0;
1118 
1119   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1120     return 0;
1121 
1122   const SIInstrInfo *TII = ST.getInstrInfo();
1123   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1124   if (!Offset || (Offset->getImm() & 6) == 0)
1125     return 0;
1126 
1127   auto IsHazardFn = [TII] (MachineInstr *I) {
1128     if (!SIInstrInfo::isMIMG(*I))
1129       return false;
1130     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1131     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1132            TII->getInstSizeInBytes(*I) >= 16;
1133   };
1134 
1135   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1136 }
1137 
1138 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1139   int FPAtomicToDenormModeWaitStates = 3;
1140 
1141   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1142     return 0;
1143 
1144   auto IsHazardFn = [] (MachineInstr *I) {
1145     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1146       return false;
1147     return SIInstrInfo::isFPAtomic(*I);
1148   };
1149 
1150   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1151     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1152       return true;
1153 
1154     switch (MI->getOpcode()) {
1155     case AMDGPU::S_WAITCNT:
1156     case AMDGPU::S_WAITCNT_VSCNT:
1157     case AMDGPU::S_WAITCNT_VMCNT:
1158     case AMDGPU::S_WAITCNT_EXPCNT:
1159     case AMDGPU::S_WAITCNT_LGKMCNT:
1160     case AMDGPU::S_WAITCNT_IDLE:
1161       return true;
1162     default:
1163       break;
1164     }
1165 
1166     return false;
1167   };
1168 
1169 
1170   return FPAtomicToDenormModeWaitStates -
1171          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1172 }
1173 
1174 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1175   assert(SIInstrInfo::isMAI(*MI));
1176 
1177   int WaitStatesNeeded = 0;
1178   unsigned Opc = MI->getOpcode();
1179 
1180   auto IsVALUFn = [] (MachineInstr *MI) {
1181     return SIInstrInfo::isVALU(*MI);
1182   };
1183 
1184   if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1185     const int LegacyVALUWritesVGPRWaitStates = 2;
1186     const int VALUWritesExecWaitStates = 4;
1187     const int MaxWaitStates = 4;
1188 
1189     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1190       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1191     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1192 
1193     if (WaitStatesNeeded < MaxWaitStates) {
1194       for (const MachineOperand &Use : MI->explicit_uses()) {
1195         const int MaxWaitStates = 2;
1196 
1197         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1198           continue;
1199 
1200         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1201           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1202         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1203 
1204         if (WaitStatesNeeded == MaxWaitStates)
1205           break;
1206       }
1207     }
1208   }
1209 
1210   auto IsMFMAFn = [] (MachineInstr *MI) {
1211     return SIInstrInfo::isMAI(*MI) &&
1212            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1213            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1214   };
1215 
1216   for (const MachineOperand &Op : MI->explicit_operands()) {
1217     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1218       continue;
1219 
1220     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1221       continue;
1222 
1223     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1224     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1225     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1226     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1227     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1228     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1229     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1230     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1231     const int MaxWaitStates = 18;
1232     Register Reg = Op.getReg();
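    // Latency of the defining MFMA, used below to infer its size class
    // (2 cycles -> 4x4, 8 -> 16x16, anything else -> 32x32).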
1233     unsigned HazardDefLatency = 0;
1234 
1235     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1236                               (MachineInstr *MI) {
1237       if (!IsMFMAFn(MI))
1238         return false;
1239       Register DstReg = MI->getOperand(0).getReg();
1240       if (DstReg == Reg)
1241         return false;
1242       HazardDefLatency = std::max(HazardDefLatency,
1243                                   TSchedModel.computeInstrLatency(MI));
1244       return TRI.regsOverlap(DstReg, Reg);
1245     };
1246 
1247     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1248                                                    MaxWaitStates);
1249     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1250     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1251     int OpNo = MI->getOperandNo(&Op);
1252     if (OpNo == SrcCIdx) {
1253       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1254     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1255       switch (HazardDefLatency) {
1256       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1257                break;
1258       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1259                break;
1260       case 16: LLVM_FALLTHROUGH;
1261       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1262                break;
1263       }
1264     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1265       switch (HazardDefLatency) {
1266       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1267                break;
1268       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1269                break;
1270       case 16: LLVM_FALLTHROUGH;
1271       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1272                break;
1273       }
1274     }
1275 
1276     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1277     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1278 
1279     if (WaitStatesNeeded == MaxWaitStates)
1280       return WaitStatesNeeded; // Early exit.
1281 
1282     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1283       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1284         return false;
1285       Register DstReg = MI->getOperand(0).getReg();
1286       return TRI.regsOverlap(Reg, DstReg);
1287     };
1288 
1289     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1290     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1291     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1292     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1293     if (OpNo == SrcCIdx)
1294       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1295     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1296       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1297 
1298     WaitStatesNeededForUse = NeedWaitStates -
1299       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1300     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1301 
1302     if (WaitStatesNeeded == MaxWaitStates)
1303       return WaitStatesNeeded; // Early exit.
1304   }
1305 
1306   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1307     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1308     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1309     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1310     const int MaxWaitStates = 13;
1311     Register DstReg = MI->getOperand(0).getReg();
1312     unsigned HazardDefLatency = 0;
1313 
1314     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1315                          (MachineInstr *MI) {
1316       if (!IsMFMAFn(MI))
1317         return false;
1318       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1319       HazardDefLatency = std::max(HazardDefLatency,
1320                                   TSchedModel.computeInstrLatency(MI));
1321       return TRI.regsOverlap(Reg, DstReg);
1322     };
1323 
1324     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1325     int NeedWaitStates;
1326     switch (HazardDefLatency) {
1327     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1328              break;
1329     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1330              break;
1331     case 16: LLVM_FALLTHROUGH;
1332     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1333              break;
1334     }
1335 
1336     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1337     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1338   }
1339 
1340   return WaitStatesNeeded;
1341 }
1342 
1343 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1344   if (!ST.hasMAIInsts())
1345     return 0;
1346 
1347   int WaitStatesNeeded = 0;
1348 
1349   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1350     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1351   };
1352 
1353   for (const MachineOperand &Op : MI->explicit_uses()) {
1354     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1355       continue;
1356 
1357     Register Reg = Op.getReg();
1358 
1359     const int AccVgprReadLdStWaitStates = 2;
1360     const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1361     const int MaxWaitStates = 2;
1362 
1363     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1364       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1365     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1366 
1367     if (WaitStatesNeeded == MaxWaitStates)
1368       return WaitStatesNeeded; // Early exit.
1369 
1370     auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1371       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1372         return false;
1373       auto IsVALUFn = [] (MachineInstr *MI) {
1374         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1375       };
1376       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1377              std::numeric_limits<int>::max();
1378     };
1379 
1380     WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1381       getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1382     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1383   }
1384 
1385   return WaitStatesNeeded;
1386 }
1387 
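// Ask the scheduler to pick a different candidate when issuing this MFMA now
// would fall inside the latency window of a previously issued MFMA.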
1388 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1389   if (!SU->isInstr())
1390     return false;
1391 
1392   MachineInstr *MAI = nullptr;
1393   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1394     MAI = nullptr;
1395     if (SIInstrInfo::isMAI(*MI) &&
1396         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1397         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1398       MAI = MI;
1399     return MAI != nullptr;
1400   };
1401 
1402   MachineInstr *MI = SU->getInstr();
1403   if (IsMFMAFn(MI)) {
1404     int W = getWaitStatesSince(IsMFMAFn, 16);
1405     if (MAI)
1406       return W < (int)TSchedModel.computeInstrLatency(MAI);
1407   }
1408 
1409   return false;
1410 }
1411