xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 5dc47541f9ead81d218d6d0b91ac38be0a8f9f8e)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33 
34 using namespace llvm;
35 
36 //===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39 
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41   IsHazardRecognizerMode(false),
42   CurrCycleInstr(nullptr),
43   MF(MF),
44   ST(MF.getSubtarget<GCNSubtarget>()),
45   TII(*ST.getInstrInfo()),
46   TRI(TII.getRegisterInfo()),
47   ClauseUses(TRI.getNumRegUnits()),
48   ClauseDefs(TRI.getNumRegUnits()) {
49   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50   TSchedModel.init(&ST);
51 }
52 
53 void GCNHazardRecognizer::Reset() {
54   EmittedInstrs.clear();
55 }
56 
57 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
58   EmitInstruction(SU->getInstr());
59 }
60 
61 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
62   CurrCycleInstr = MI;
63 }
64 
65 static bool isDivFMas(unsigned Opcode) {
66   return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
67 }
68 
69 static bool isSGetReg(unsigned Opcode) {
70   return Opcode == AMDGPU::S_GETREG_B32;
71 }
72 
73 static bool isSSetReg(unsigned Opcode) {
74   switch (Opcode) {
75   case AMDGPU::S_SETREG_B32:
76   case AMDGPU::S_SETREG_B32_mode:
77   case AMDGPU::S_SETREG_IMM32_B32:
78   case AMDGPU::S_SETREG_IMM32_B32_mode:
79     return true;
80   }
81   return false;
82 }
83 
84 static bool isRWLane(unsigned Opcode) {
85   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
86 }
87 
88 static bool isRFE(unsigned Opcode) {
89   return Opcode == AMDGPU::S_RFE_B64;
90 }
91 
92 static bool isSMovRel(unsigned Opcode) {
93   switch (Opcode) {
94   case AMDGPU::S_MOVRELS_B32:
95   case AMDGPU::S_MOVRELS_B64:
96   case AMDGPU::S_MOVRELD_B32:
97   case AMDGPU::S_MOVRELD_B64:
98     return true;
99   default:
100     return false;
101   }
102 }
103 
104 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
105                                     const MachineInstr &MI) {
106   if (TII.isAlwaysGDS(MI.getOpcode()))
107     return true;
108 
109   switch (MI.getOpcode()) {
110   case AMDGPU::S_SENDMSG:
111   case AMDGPU::S_SENDMSGHALT:
112   case AMDGPU::S_TTRACEDATA:
113     return true;
114   // These DS opcodes don't support GDS.
115   case AMDGPU::DS_NOP:
116   case AMDGPU::DS_PERMUTE_B32:
117   case AMDGPU::DS_BPERMUTE_B32:
118     return false;
119   default:
120     if (TII.isDS(MI.getOpcode())) {
121       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
122                                            AMDGPU::OpName::gds);
123       if (MI.getOperand(GDS).getImm())
124         return true;
125     }
126     return false;
127   }
128 }
129 
130 static bool isPermlane(const MachineInstr &MI) {
131   unsigned Opcode = MI.getOpcode();
132   return Opcode == AMDGPU::V_PERMLANE16_B32 ||
133          Opcode == AMDGPU::V_PERMLANEX16_B32;
134 }
135 
136 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
137   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
138                                                      AMDGPU::OpName::simm16);
139   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
140 }
141 
142 ScheduleHazardRecognizer::HazardType
143 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
144   MachineInstr *MI = SU->getInstr();
145   // If we are not in "HazardRecognizerMode" and therefore not being run from
146   // the scheduler, track possible stalls from hazards but don't insert noops.
147   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
148 
149   if (MI->isBundle())
150    return NoHazard;
151 
152   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
153     return HazardType;
154 
155   // FIXME: Should flat be considered vmem?
156   if ((SIInstrInfo::isVMEM(*MI) ||
157        SIInstrInfo::isFLAT(*MI))
158       && checkVMEMHazards(MI) > 0)
159     return HazardType;
160 
161   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
162     return HazardType;
163 
164   if (checkFPAtomicToDenormModeHazard(MI) > 0)
165     return HazardType;
166 
167   if (ST.hasNoDataDepHazard())
168     return NoHazard;
169 
170   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
171     return HazardType;
172 
173   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
174     return HazardType;
175 
176   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
177     return HazardType;
178 
179   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
180     return HazardType;
181 
182   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
183     return HazardType;
184 
185   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
186     return HazardType;
187 
188   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
189     return HazardType;
190 
191   if (ST.hasReadM0MovRelInterpHazard() &&
192       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
193       checkReadM0Hazards(MI) > 0)
194     return HazardType;
195 
196   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
197       checkReadM0Hazards(MI) > 0)
198     return HazardType;
199 
200   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
201     return HazardType;
202 
203   if ((SIInstrInfo::isVMEM(*MI) ||
204        SIInstrInfo::isFLAT(*MI) ||
205        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
206     return HazardType;
207 
208   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
209     return HazardType;
210 
211   return NoHazard;
212 }
213 
214 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
215                                 unsigned Quantity) {
216   while (Quantity > 0) {
217     unsigned Arg = std::min(Quantity, 8u);
218     Quantity -= Arg;
219     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
220         .addImm(Arg - 1);
221   }
222 }
223 
224 void GCNHazardRecognizer::processBundle() {
225   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
226   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
227   // Check bundled MachineInstr's for hazards.
228   for (; MI != E && MI->isInsideBundle(); ++MI) {
229     CurrCycleInstr = &*MI;
230     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
231 
232     if (IsHazardRecognizerMode) {
233       fixHazards(CurrCycleInstr);
234 
235       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
236     }
237 
238     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
239     // include the bundled MI directly after, only add a maximum of
240     // (MaxLookAhead - 1) noops to EmittedInstrs.
241     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
242       EmittedInstrs.push_front(nullptr);
243 
244     EmittedInstrs.push_front(CurrCycleInstr);
245     EmittedInstrs.resize(MaxLookAhead);
246   }
247   CurrCycleInstr = nullptr;
248 }
249 
250 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
251   IsHazardRecognizerMode = true;
252   CurrCycleInstr = MI;
253   unsigned W = PreEmitNoopsCommon(MI);
254   fixHazards(MI);
255   CurrCycleInstr = nullptr;
256   return W;
257 }
258 
259 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
260   if (MI->isBundle())
261     return 0;
262 
263   int WaitStates = 0;
264 
265   if (SIInstrInfo::isSMRD(*MI))
266     return std::max(WaitStates, checkSMRDHazards(MI));
267 
268   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
269     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
270 
271   if (ST.hasNSAtoVMEMBug())
272     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
273 
274   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
275 
276   if (ST.hasNoDataDepHazard())
277     return WaitStates;
278 
279   if (SIInstrInfo::isVALU(*MI))
280     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
281 
282   if (SIInstrInfo::isDPP(*MI))
283     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
284 
285   if (isDivFMas(MI->getOpcode()))
286     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
287 
288   if (isRWLane(MI->getOpcode()))
289     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
290 
291   if (MI->isInlineAsm())
292     return std::max(WaitStates, checkInlineAsmHazards(MI));
293 
294   if (isSGetReg(MI->getOpcode()))
295     return std::max(WaitStates, checkGetRegHazards(MI));
296 
297   if (isSSetReg(MI->getOpcode()))
298     return std::max(WaitStates, checkSetRegHazards(MI));
299 
300   if (isRFE(MI->getOpcode()))
301     return std::max(WaitStates, checkRFEHazards(MI));
302 
303   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
304                                            isSMovRel(MI->getOpcode())))
305     return std::max(WaitStates, checkReadM0Hazards(MI));
306 
307   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
308     return std::max(WaitStates, checkReadM0Hazards(MI));
309 
310   if (SIInstrInfo::isMAI(*MI))
311     return std::max(WaitStates, checkMAIHazards(MI));
312 
313   if (SIInstrInfo::isVMEM(*MI) ||
314       SIInstrInfo::isFLAT(*MI) ||
315       SIInstrInfo::isDS(*MI))
316     return std::max(WaitStates, checkMAILdStHazards(MI));
317 
318   return WaitStates;
319 }
320 
321 void GCNHazardRecognizer::EmitNoop() {
322   EmittedInstrs.push_front(nullptr);
323 }
324 
325 void GCNHazardRecognizer::AdvanceCycle() {
326   // When the scheduler detects a stall, it will call AdvanceCycle() without
327   // emitting any instructions.
328   if (!CurrCycleInstr) {
329     EmittedInstrs.push_front(nullptr);
330     return;
331   }
332 
333   // Do not track non-instructions which do not affect the wait states.
334   // If included, these instructions can lead to buffer overflow such that
335   // detectable hazards are missed.
336   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
337       CurrCycleInstr->isKill()) {
338     CurrCycleInstr = nullptr;
339     return;
340   }
341 
342   if (CurrCycleInstr->isBundle()) {
343     processBundle();
344     return;
345   }
346 
347   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
348 
349   // Keep track of emitted instructions
350   EmittedInstrs.push_front(CurrCycleInstr);
351 
352   // Add a nullptr for each additional wait state after the first.  Make sure
353   // not to add more than getMaxLookAhead() items to the list, since we
354   // truncate the list to that size right after this loop.
355   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
356        i < e; ++i) {
357     EmittedInstrs.push_front(nullptr);
358   }
359 
360   // getMaxLookahead() is the largest number of wait states we will ever need
361   // to insert, so there is no point in keeping track of more than that many
362   // wait states.
363   EmittedInstrs.resize(getMaxLookAhead());
364 
365   CurrCycleInstr = nullptr;
366 }
367 
368 void GCNHazardRecognizer::RecedeCycle() {
369   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
370 }
371 
372 //===----------------------------------------------------------------------===//
373 // Helper Functions
374 //===----------------------------------------------------------------------===//
375 
376 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
377 
378 // Returns a minimum wait states since \p I walking all predecessors.
379 // Only scans until \p IsExpired does not return true.
380 // Can only be run in a hazard recognizer mode.
381 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
382                               MachineBasicBlock *MBB,
383                               MachineBasicBlock::reverse_instr_iterator I,
384                               int WaitStates,
385                               IsExpiredFn IsExpired,
386                               DenseSet<const MachineBasicBlock *> &Visited) {
387   for (auto E = MBB->instr_rend(); I != E; ++I) {
388     // Don't add WaitStates for parent BUNDLE instructions.
389     if (I->isBundle())
390       continue;
391 
392     if (IsHazard(&*I))
393       return WaitStates;
394 
395     if (I->isInlineAsm() || I->isMetaInstruction())
396       continue;
397 
398     WaitStates += SIInstrInfo::getNumWaitStates(*I);
399 
400     if (IsExpired(&*I, WaitStates))
401       return std::numeric_limits<int>::max();
402   }
403 
404   int MinWaitStates = WaitStates;
405   bool Found = false;
406   for (MachineBasicBlock *Pred : MBB->predecessors()) {
407     if (!Visited.insert(Pred).second)
408       continue;
409 
410     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
411                                WaitStates, IsExpired, Visited);
412 
413     if (W == std::numeric_limits<int>::max())
414       continue;
415 
416     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
417     if (IsExpired(nullptr, MinWaitStates))
418       return MinWaitStates;
419 
420     Found = true;
421   }
422 
423   if (Found)
424     return MinWaitStates;
425 
426   return std::numeric_limits<int>::max();
427 }
428 
429 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
430                               MachineInstr *MI,
431                               IsExpiredFn IsExpired) {
432   DenseSet<const MachineBasicBlock *> Visited;
433   return getWaitStatesSince(IsHazard, MI->getParent(),
434                             std::next(MI->getReverseIterator()),
435                             0, IsExpired, Visited);
436 }
437 
438 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
439   if (IsHazardRecognizerMode) {
440     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
441       return WaitStates >= Limit;
442     };
443     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
444   }
445 
446   int WaitStates = 0;
447   for (MachineInstr *MI : EmittedInstrs) {
448     if (MI) {
449       if (IsHazard(MI))
450         return WaitStates;
451 
452       if (MI->isInlineAsm())
453         continue;
454     }
455     ++WaitStates;
456 
457     if (WaitStates >= Limit)
458       break;
459   }
460   return std::numeric_limits<int>::max();
461 }
462 
463 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
464                                                IsHazardFn IsHazardDef,
465                                                int Limit) {
466   const SIRegisterInfo *TRI = ST.getRegisterInfo();
467 
468   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
469     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
470   };
471 
472   return getWaitStatesSince(IsHazardFn, Limit);
473 }
474 
475 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
476                                                   int Limit) {
477   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
478     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
479   };
480 
481   return getWaitStatesSince(IsHazardFn, Limit);
482 }
483 
484 //===----------------------------------------------------------------------===//
485 // No-op Hazard Detection
486 //===----------------------------------------------------------------------===//
487 
488 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
489                         MCRegister Reg) {
490   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
491     BV.set(*RUI);
492 }
493 
494 static void addRegsToSet(const SIRegisterInfo &TRI,
495                          iterator_range<MachineInstr::const_mop_iterator> Ops,
496                          BitVector &Set) {
497   for (const MachineOperand &Op : Ops) {
498     if (Op.isReg())
499       addRegUnits(TRI, Set, Op.getReg().asMCReg());
500   }
501 }
502 
503 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
504   // XXX: Do we need to worry about implicit operands
505   addRegsToSet(TRI, MI.defs(), ClauseDefs);
506   addRegsToSet(TRI, MI.uses(), ClauseUses);
507 }
508 
509 static bool breaksSMEMSoftClause(MachineInstr *MI) {
510   return !SIInstrInfo::isSMRD(*MI);
511 }
512 
513 static bool breaksVMEMSoftClause(MachineInstr *MI) {
514   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
515 }
516 
517 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
518   // SMEM soft clause are only present on VI+, and only matter if xnack is
519   // enabled.
520   if (!ST.isXNACKEnabled())
521     return 0;
522 
523   bool IsSMRD = TII.isSMRD(*MEM);
524 
525   resetClause();
526 
527   // A soft-clause is any group of consecutive SMEM instructions.  The
528   // instructions in this group may return out of order and/or may be
529   // replayed (i.e. the same instruction issued more than once).
530   //
531   // In order to handle these situations correctly we need to make sure that
532   // when a clause has more than one instruction, no instruction in the clause
533   // writes to a register that is read by another instruction in the clause
534   // (including itself). If we encounter this situaion, we need to break the
535   // clause by inserting a non SMEM instruction.
536 
537   for (MachineInstr *MI : EmittedInstrs) {
538     // When we hit a non-SMEM instruction then we have passed the start of the
539     // clause and we can stop.
540     if (!MI)
541       break;
542 
543     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
544       break;
545 
546     addClauseInst(*MI);
547   }
548 
549   if (ClauseDefs.none())
550     return 0;
551 
552   // We need to make sure not to put loads and stores in the same clause if they
553   // use the same address. For now, just start a new clause whenever we see a
554   // store.
555   if (MEM->mayStore())
556     return 1;
557 
558   addClauseInst(*MEM);
559 
560   // If the set of defs and uses intersect then we cannot add this instruction
561   // to the clause, so we have a hazard.
562   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
563 }
564 
565 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
566   int WaitStatesNeeded = 0;
567 
568   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
569 
570   // This SMRD hazard only affects SI.
571   if (!ST.hasSMRDReadVALUDefHazard())
572     return WaitStatesNeeded;
573 
574   // A read of an SGPR by SMRD instruction requires 4 wait states when the
575   // SGPR was written by a VALU instruction.
576   int SmrdSgprWaitStates = 4;
577   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
578   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
579 
580   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
581 
582   for (const MachineOperand &Use : SMRD->uses()) {
583     if (!Use.isReg())
584       continue;
585     int WaitStatesNeededForUse =
586         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
587                                                    SmrdSgprWaitStates);
588     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
589 
590     // This fixes what appears to be undocumented hardware behavior in SI where
591     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
592     // needs some number of nops in between. We don't know how many we need, but
593     // let's use 4. This wasn't discovered before probably because the only
594     // case when this happens is when we expand a 64-bit pointer into a full
595     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
596     // probably never encountered in the closed-source land.
597     if (IsBufferSMRD) {
598       int WaitStatesNeededForUse =
599         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
600                                                    IsBufferHazardDefFn,
601                                                    SmrdSgprWaitStates);
602       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
603     }
604   }
605 
606   return WaitStatesNeeded;
607 }
608 
609 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
610   if (!ST.hasVMEMReadSGPRVALUDefHazard())
611     return 0;
612 
613   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
614 
615   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
616   // SGPR was written by a VALU Instruction.
617   const int VmemSgprWaitStates = 5;
618   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
619   for (const MachineOperand &Use : VMEM->uses()) {
620     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
621       continue;
622 
623     int WaitStatesNeededForUse =
624         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
625                                                    VmemSgprWaitStates);
626     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
627   }
628   return WaitStatesNeeded;
629 }
630 
631 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
632   const SIRegisterInfo *TRI = ST.getRegisterInfo();
633   const SIInstrInfo *TII = ST.getInstrInfo();
634 
635   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
636   int DppVgprWaitStates = 2;
637   int DppExecWaitStates = 5;
638   int WaitStatesNeeded = 0;
639   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
640 
641   for (const MachineOperand &Use : DPP->uses()) {
642     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
643       continue;
644     int WaitStatesNeededForUse =
645         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
646                               [](MachineInstr *) { return true; },
647                               DppVgprWaitStates);
648     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
649   }
650 
651   WaitStatesNeeded = std::max(
652       WaitStatesNeeded,
653       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
654                                                 DppExecWaitStates));
655 
656   return WaitStatesNeeded;
657 }
658 
659 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
660   const SIInstrInfo *TII = ST.getInstrInfo();
661 
662   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
663   // instruction.
664   const int DivFMasWaitStates = 4;
665   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
666   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
667                                                DivFMasWaitStates);
668 
669   return DivFMasWaitStates - WaitStatesNeeded;
670 }
671 
672 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
673   const SIInstrInfo *TII = ST.getInstrInfo();
674   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
675 
676   const int GetRegWaitStates = 2;
677   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
678     return GetRegHWReg == getHWReg(TII, *MI);
679   };
680   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
681 
682   return GetRegWaitStates - WaitStatesNeeded;
683 }
684 
685 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
686   const SIInstrInfo *TII = ST.getInstrInfo();
687   unsigned HWReg = getHWReg(TII, *SetRegInstr);
688 
689   const int SetRegWaitStates = ST.getSetRegWaitStates();
690   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
691     return HWReg == getHWReg(TII, *MI);
692   };
693   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
694   return SetRegWaitStates - WaitStatesNeeded;
695 }
696 
697 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
698   if (!MI.mayStore())
699     return -1;
700 
701   const SIInstrInfo *TII = ST.getInstrInfo();
702   unsigned Opcode = MI.getOpcode();
703   const MCInstrDesc &Desc = MI.getDesc();
704 
705   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
706   int VDataRCID = -1;
707   if (VDataIdx != -1)
708     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
709 
710   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
711     // There is no hazard if the instruction does not use vector regs
712     // (like wbinvl1)
713     if (VDataIdx == -1)
714       return -1;
715     // For MUBUF/MTBUF instructions this hazard only exists if the
716     // instruction is not using a register in the soffset field.
717     const MachineOperand *SOffset =
718         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
719     // If we have no soffset operand, then assume this field has been
720     // hardcoded to zero.
721     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
722         (!SOffset || !SOffset->isReg()))
723       return VDataIdx;
724   }
725 
726   // MIMG instructions create a hazard if they don't use a 256-bit T# and
727   // the store size is greater than 8 bytes and they have more than two bits
728   // of their dmask set.
729   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
730   if (TII->isMIMG(MI)) {
731     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
732     assert(SRsrcIdx != -1 &&
733            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
734     (void)SRsrcIdx;
735   }
736 
737   if (TII->isFLAT(MI)) {
738     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
739     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
740       return DataIdx;
741   }
742 
743   return -1;
744 }
745 
746 int
747 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
748                                             const MachineRegisterInfo &MRI) {
749   // Helper to check for the hazard where VMEM instructions that store more than
750   // 8 bytes can have there store data over written by the next instruction.
751   const SIRegisterInfo *TRI = ST.getRegisterInfo();
752 
753   const int VALUWaitStates = 1;
754   int WaitStatesNeeded = 0;
755 
756   if (!TRI->isVGPR(MRI, Def.getReg()))
757     return WaitStatesNeeded;
758   Register Reg = Def.getReg();
759   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
760     int DataIdx = createsVALUHazard(*MI);
761     return DataIdx >= 0 &&
762     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
763   };
764   int WaitStatesNeededForDef =
765     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
766   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
767 
768   return WaitStatesNeeded;
769 }
770 
771 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
772   // This checks for the hazard where VMEM instructions that store more than
773   // 8 bytes can have there store data over written by the next instruction.
774   if (!ST.has12DWordStoreHazard())
775     return 0;
776 
777   const MachineRegisterInfo &MRI = MF.getRegInfo();
778   int WaitStatesNeeded = 0;
779 
780   for (const MachineOperand &Def : VALU->defs()) {
781     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
782   }
783 
784   return WaitStatesNeeded;
785 }
786 
787 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
788   // This checks for hazards associated with inline asm statements.
789   // Since inline asms can contain just about anything, we use this
790   // to call/leverage other check*Hazard routines. Note that
791   // this function doesn't attempt to address all possible inline asm
792   // hazards (good luck), but is a collection of what has been
793   // problematic thus far.
794 
795   // see checkVALUHazards()
796   if (!ST.has12DWordStoreHazard())
797     return 0;
798 
799   const MachineRegisterInfo &MRI = MF.getRegInfo();
800   int WaitStatesNeeded = 0;
801 
802   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
803        I != E; ++I) {
804     const MachineOperand &Op = IA->getOperand(I);
805     if (Op.isReg() && Op.isDef()) {
806       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
807     }
808   }
809 
810   return WaitStatesNeeded;
811 }
812 
813 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
814   const SIInstrInfo *TII = ST.getInstrInfo();
815   const SIRegisterInfo *TRI = ST.getRegisterInfo();
816   const MachineRegisterInfo &MRI = MF.getRegInfo();
817 
818   const MachineOperand *LaneSelectOp =
819       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
820 
821   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
822     return 0;
823 
824   Register LaneSelectReg = LaneSelectOp->getReg();
825   auto IsHazardFn = [TII] (MachineInstr *MI) {
826     return TII->isVALU(*MI);
827   };
828 
829   const int RWLaneWaitStates = 4;
830   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
831                                               RWLaneWaitStates);
832   return RWLaneWaitStates - WaitStatesSince;
833 }
834 
835 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
836   if (!ST.hasRFEHazards())
837     return 0;
838 
839   const SIInstrInfo *TII = ST.getInstrInfo();
840 
841   const int RFEWaitStates = 1;
842 
843   auto IsHazardFn = [TII] (MachineInstr *MI) {
844     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
845   };
846   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
847   return RFEWaitStates - WaitStatesNeeded;
848 }
849 
850 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
851   const SIInstrInfo *TII = ST.getInstrInfo();
852   const int SMovRelWaitStates = 1;
853   auto IsHazardFn = [TII] (MachineInstr *MI) {
854     return TII->isSALU(*MI);
855   };
856   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
857                                                    SMovRelWaitStates);
858 }
859 
860 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
861   fixVMEMtoScalarWriteHazards(MI);
862   fixVcmpxPermlaneHazards(MI);
863   fixSMEMtoVectorWriteHazards(MI);
864   fixVcmpxExecWARHazard(MI);
865   fixLdsBranchVmemWARHazard(MI);
866 }
867 
868 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
869   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
870     return false;
871 
872   const SIInstrInfo *TII = ST.getInstrInfo();
873   auto IsHazardFn = [TII] (MachineInstr *MI) {
874     return TII->isVOPC(*MI);
875   };
876 
877   auto IsExpiredFn = [] (MachineInstr *MI, int) {
878     if (!MI)
879       return false;
880     unsigned Opc = MI->getOpcode();
881     return SIInstrInfo::isVALU(*MI) &&
882            Opc != AMDGPU::V_NOP_e32 &&
883            Opc != AMDGPU::V_NOP_e64 &&
884            Opc != AMDGPU::V_NOP_sdwa;
885   };
886 
887   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
888       std::numeric_limits<int>::max())
889     return false;
890 
891   // V_NOP will be discarded by SQ.
892   // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
893   // which is always a VGPR and available.
894   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
895   Register Reg = Src0->getReg();
896   bool IsUndef = Src0->isUndef();
897   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
898           TII->get(AMDGPU::V_MOV_B32_e32))
899     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
900     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
901 
902   return true;
903 }
904 
905 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
906   if (!ST.hasVMEMtoScalarWriteHazard())
907     return false;
908 
909   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
910     return false;
911 
912   if (MI->getNumDefs() == 0)
913     return false;
914 
915   const SIRegisterInfo *TRI = ST.getRegisterInfo();
916 
917   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
918     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
919         !SIInstrInfo::isFLAT(*I))
920       return false;
921 
922     for (const MachineOperand &Def : MI->defs()) {
923       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
924       if (!Op)
925         continue;
926       return true;
927     }
928     return false;
929   };
930 
931   auto IsExpiredFn = [](MachineInstr *MI, int) {
932     return MI && (SIInstrInfo::isVALU(*MI) ||
933                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
934                    !MI->getOperand(0).getImm()) ||
935                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
936                    MI->getOperand(0).getImm() == 0xffe3));
937   };
938 
939   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
940       std::numeric_limits<int>::max())
941     return false;
942 
943   const SIInstrInfo *TII = ST.getInstrInfo();
944   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
945           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
946       .addImm(0xffe3);
947   return true;
948 }
949 
950 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
951   if (!ST.hasSMEMtoVectorWriteHazard())
952     return false;
953 
954   if (!SIInstrInfo::isVALU(*MI))
955     return false;
956 
957   unsigned SDSTName;
958   switch (MI->getOpcode()) {
959   case AMDGPU::V_READLANE_B32:
960   case AMDGPU::V_READFIRSTLANE_B32:
961     SDSTName = AMDGPU::OpName::vdst;
962     break;
963   default:
964     SDSTName = AMDGPU::OpName::sdst;
965     break;
966   }
967 
968   const SIInstrInfo *TII = ST.getInstrInfo();
969   const SIRegisterInfo *TRI = ST.getRegisterInfo();
970   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
971   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
972   if (!SDST) {
973     for (const auto &MO : MI->implicit_operands()) {
974       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
975         SDST = &MO;
976         break;
977       }
978     }
979   }
980 
981   if (!SDST)
982     return false;
983 
984   const Register SDSTReg = SDST->getReg();
985   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
986     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
987   };
988 
989   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
990     if (MI) {
991       if (TII->isSALU(*MI)) {
992         switch (MI->getOpcode()) {
993         case AMDGPU::S_SETVSKIP:
994         case AMDGPU::S_VERSION:
995         case AMDGPU::S_WAITCNT_VSCNT:
996         case AMDGPU::S_WAITCNT_VMCNT:
997         case AMDGPU::S_WAITCNT_EXPCNT:
998           // These instructions cannot not mitigate the hazard.
999           return false;
1000         case AMDGPU::S_WAITCNT_LGKMCNT:
1001           // Reducing lgkmcnt count to 0 always mitigates the hazard.
1002           return (MI->getOperand(1).getImm() == 0) &&
1003                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1004         case AMDGPU::S_WAITCNT: {
1005           const int64_t Imm = MI->getOperand(0).getImm();
1006           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1007           return (Decoded.LgkmCnt == 0);
1008         }
1009         default:
1010           // SOPP instructions cannot mitigate the hazard.
1011           if (TII->isSOPP(*MI))
1012             return false;
1013           // At this point the SALU can be assumed to mitigate the hazard
1014           // because either:
1015           // (a) it is independent of the at risk SMEM (breaking chain),
1016           // or
1017           // (b) it is dependent on the SMEM, in which case an appropriate
1018           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1019           //     SMEM instruction.
1020           return true;
1021         }
1022       }
1023     }
1024     return false;
1025   };
1026 
1027   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1028       std::numeric_limits<int>::max())
1029     return false;
1030 
1031   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1032           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1033       .addImm(0);
1034   return true;
1035 }
1036 
1037 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1038   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1039     return false;
1040 
1041   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1042   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1043     return false;
1044 
1045   auto IsHazardFn = [TRI] (MachineInstr *I) {
1046     if (SIInstrInfo::isVALU(*I))
1047       return false;
1048     return I->readsRegister(AMDGPU::EXEC, TRI);
1049   };
1050 
1051   const SIInstrInfo *TII = ST.getInstrInfo();
1052   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1053     if (!MI)
1054       return false;
1055     if (SIInstrInfo::isVALU(*MI)) {
1056       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1057         return true;
1058       for (auto MO : MI->implicit_operands())
1059         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1060           return true;
1061     }
1062     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1063         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1064       return true;
1065     return false;
1066   };
1067 
1068   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1069       std::numeric_limits<int>::max())
1070     return false;
1071 
1072   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1073           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1074     .addImm(0xfffe);
1075   return true;
1076 }
1077 
1078 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1079   if (!ST.hasLdsBranchVmemWARHazard())
1080     return false;
1081 
1082   auto IsHazardInst = [] (const MachineInstr *MI) {
1083     if (SIInstrInfo::isDS(*MI))
1084       return 1;
1085     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1086       return 2;
1087     return 0;
1088   };
1089 
1090   auto InstType = IsHazardInst(MI);
1091   if (!InstType)
1092     return false;
1093 
1094   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1095     return I && (IsHazardInst(I) ||
1096                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1097                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1098                   !I->getOperand(1).getImm()));
1099   };
1100 
1101   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1102     if (!I->isBranch())
1103       return false;
1104 
1105     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1106       auto InstType2 = IsHazardInst(I);
1107       return InstType2 && InstType != InstType2;
1108     };
1109 
1110     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1111       if (!I)
1112         return false;
1113 
1114       auto InstType2 = IsHazardInst(I);
1115       if (InstType == InstType2)
1116         return true;
1117 
1118       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1119              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1120              !I->getOperand(1).getImm();
1121     };
1122 
1123     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1124            std::numeric_limits<int>::max();
1125   };
1126 
1127   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1128       std::numeric_limits<int>::max())
1129     return false;
1130 
1131   const SIInstrInfo *TII = ST.getInstrInfo();
1132   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1133           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1134     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1135     .addImm(0);
1136 
1137   return true;
1138 }
1139 
1140 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1141   int NSAtoVMEMWaitStates = 1;
1142 
1143   if (!ST.hasNSAtoVMEMBug())
1144     return 0;
1145 
1146   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1147     return 0;
1148 
1149   const SIInstrInfo *TII = ST.getInstrInfo();
1150   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1151   if (!Offset || (Offset->getImm() & 6) == 0)
1152     return 0;
1153 
1154   auto IsHazardFn = [TII] (MachineInstr *I) {
1155     if (!SIInstrInfo::isMIMG(*I))
1156       return false;
1157     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1158     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1159            TII->getInstSizeInBytes(*I) >= 16;
1160   };
1161 
1162   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1163 }
1164 
1165 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1166   int FPAtomicToDenormModeWaitStates = 3;
1167 
1168   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1169     return 0;
1170 
1171   auto IsHazardFn = [] (MachineInstr *I) {
1172     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1173       return false;
1174     return SIInstrInfo::isFPAtomic(*I);
1175   };
1176 
1177   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1178     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1179       return true;
1180 
1181     switch (MI->getOpcode()) {
1182     case AMDGPU::S_WAITCNT:
1183     case AMDGPU::S_WAITCNT_VSCNT:
1184     case AMDGPU::S_WAITCNT_VMCNT:
1185     case AMDGPU::S_WAITCNT_EXPCNT:
1186     case AMDGPU::S_WAITCNT_LGKMCNT:
1187     case AMDGPU::S_WAITCNT_IDLE:
1188       return true;
1189     default:
1190       break;
1191     }
1192 
1193     return false;
1194   };
1195 
1196 
1197   return FPAtomicToDenormModeWaitStates -
1198          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1199 }
1200 
1201 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1202   assert(SIInstrInfo::isMAI(*MI));
1203 
1204   int WaitStatesNeeded = 0;
1205   unsigned Opc = MI->getOpcode();
1206 
1207   auto IsVALUFn = [] (MachineInstr *MI) {
1208     return SIInstrInfo::isVALU(*MI);
1209   };
1210 
1211   if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1212     const int LegacyVALUWritesVGPRWaitStates = 2;
1213     const int VALUWritesExecWaitStates = 4;
1214     const int MaxWaitStates = 4;
1215 
1216     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1217       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1218     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1219 
1220     if (WaitStatesNeeded < MaxWaitStates) {
1221       for (const MachineOperand &Use : MI->explicit_uses()) {
1222         const int MaxWaitStates = 2;
1223 
1224         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1225           continue;
1226 
1227         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1228           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1229         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1230 
1231         if (WaitStatesNeeded == MaxWaitStates)
1232           break;
1233       }
1234     }
1235   }
1236 
1237   auto IsMFMAFn = [] (MachineInstr *MI) {
1238     return SIInstrInfo::isMAI(*MI) &&
1239            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1240            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1241   };
1242 
1243   for (const MachineOperand &Op : MI->explicit_operands()) {
1244     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1245       continue;
1246 
1247     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1248       continue;
1249 
1250     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1251     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1252     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1253     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1254     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1255     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1256     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1257     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1258     const int MaxWaitStates = 18;
1259     Register Reg = Op.getReg();
1260     unsigned HazardDefLatency = 0;
1261 
1262     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1263                               (MachineInstr *MI) {
1264       if (!IsMFMAFn(MI))
1265         return false;
1266       Register DstReg = MI->getOperand(0).getReg();
1267       if (DstReg == Reg)
1268         return false;
1269       HazardDefLatency = std::max(HazardDefLatency,
1270                                   TSchedModel.computeInstrLatency(MI));
1271       return TRI.regsOverlap(DstReg, Reg);
1272     };
1273 
1274     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1275                                                    MaxWaitStates);
1276     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1277     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1278     int OpNo = MI->getOperandNo(&Op);
1279     if (OpNo == SrcCIdx) {
1280       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1281     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1282       switch (HazardDefLatency) {
1283       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1284                break;
1285       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1286                break;
1287       case 16: LLVM_FALLTHROUGH;
1288       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1289                break;
1290       }
1291     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1292       switch (HazardDefLatency) {
1293       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1294                break;
1295       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1296                break;
1297       case 16: LLVM_FALLTHROUGH;
1298       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1299                break;
1300       }
1301     }
1302 
1303     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1304     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1305 
1306     if (WaitStatesNeeded == MaxWaitStates)
1307       return WaitStatesNeeded; // Early exit.
1308 
1309     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1310       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1311         return false;
1312       Register DstReg = MI->getOperand(0).getReg();
1313       return TRI.regsOverlap(Reg, DstReg);
1314     };
1315 
1316     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1317     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1318     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1319     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1320     if (OpNo == SrcCIdx)
1321       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1322     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1323       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1324 
1325     WaitStatesNeededForUse = NeedWaitStates -
1326       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1327     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1328 
1329     if (WaitStatesNeeded == MaxWaitStates)
1330       return WaitStatesNeeded; // Early exit.
1331   }
1332 
1333   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1334     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1335     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1336     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1337     const int MaxWaitStates = 13;
1338     Register DstReg = MI->getOperand(0).getReg();
1339     unsigned HazardDefLatency = 0;
1340 
1341     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1342                          (MachineInstr *MI) {
1343       if (!IsMFMAFn(MI))
1344         return false;
1345       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1346       HazardDefLatency = std::max(HazardDefLatency,
1347                                   TSchedModel.computeInstrLatency(MI));
1348       return TRI.regsOverlap(Reg, DstReg);
1349     };
1350 
1351     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1352     int NeedWaitStates;
1353     switch (HazardDefLatency) {
1354     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1355              break;
1356     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1357              break;
1358     case 16: LLVM_FALLTHROUGH;
1359     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1360              break;
1361     }
1362 
1363     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1364     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1365   }
1366 
1367   return WaitStatesNeeded;
1368 }
1369 
1370 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1371   if (!ST.hasMAIInsts())
1372     return 0;
1373 
1374   int WaitStatesNeeded = 0;
1375 
1376   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1377     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1378   };
1379 
1380   for (const MachineOperand &Op : MI->explicit_uses()) {
1381     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1382       continue;
1383 
1384     Register Reg = Op.getReg();
1385 
1386     const int AccVgprReadLdStWaitStates = 2;
1387     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1388     const int MaxWaitStates = 2;
1389 
1390     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1391       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1392     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1393 
1394     if (WaitStatesNeeded == MaxWaitStates)
1395       return WaitStatesNeeded; // Early exit.
1396 
1397     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
1398       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32 &&
1399           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1400         return false;
1401       auto IsVALUFn = [] (MachineInstr *MI) {
1402         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1403       };
1404       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1405              std::numeric_limits<int>::max();
1406     };
1407 
1408     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1409       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1410     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1411   }
1412 
1413   return WaitStatesNeeded;
1414 }
1415 
1416 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1417   if (!SU->isInstr())
1418     return false;
1419 
1420   MachineInstr *MAI = nullptr;
1421   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1422     MAI = nullptr;
1423     if (SIInstrInfo::isMAI(*MI) &&
1424         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1425         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1426       MAI = MI;
1427     return MAI != nullptr;
1428   };
1429 
1430   MachineInstr *MI = SU->getInstr();
1431   if (IsMFMAFn(MI)) {
1432     int W = getWaitStatesSince(IsMFMAFn, 16);
1433     if (MAI)
1434       return W < (int)TSchedModel.computeInstrLatency(MAI);
1435   }
1436 
1437   return false;
1438 }
1439