xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 69f5105f5c009e1ca34d2c1f60ee4c78b8dfa543)
//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33 
34 using namespace llvm;
35 
36 //===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39 
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41   IsHazardRecognizerMode(false),
42   CurrCycleInstr(nullptr),
43   MF(MF),
44   ST(MF.getSubtarget<GCNSubtarget>()),
45   TII(*ST.getInstrInfo()),
46   TRI(TII.getRegisterInfo()),
47   ClauseUses(TRI.getNumRegUnits()),
48   ClauseDefs(TRI.getNumRegUnits()) {
49   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50   TSchedModel.init(&ST);
51 }
52 
53 void GCNHazardRecognizer::Reset() {
54   EmittedInstrs.clear();
55 }
56 
57 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
58   EmitInstruction(SU->getInstr());
59 }
60 
61 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
62   CurrCycleInstr = MI;
63 }
64 
65 static bool isDivFMas(unsigned Opcode) {
66   return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
67 }
68 
69 static bool isSGetReg(unsigned Opcode) {
70   return Opcode == AMDGPU::S_GETREG_B32;
71 }
72 
73 static bool isSSetReg(unsigned Opcode) {
74   switch (Opcode) {
75   case AMDGPU::S_SETREG_B32:
76   case AMDGPU::S_SETREG_B32_mode:
77   case AMDGPU::S_SETREG_IMM32_B32:
78   case AMDGPU::S_SETREG_IMM32_B32_mode:
79     return true;
80   }
81   return false;
82 }
83 
84 static bool isRWLane(unsigned Opcode) {
85   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
86 }
87 
88 static bool isRFE(unsigned Opcode) {
89   return Opcode == AMDGPU::S_RFE_B64;
90 }
91 
92 static bool isSMovRel(unsigned Opcode) {
93   switch (Opcode) {
94   case AMDGPU::S_MOVRELS_B32:
95   case AMDGPU::S_MOVRELS_B64:
96   case AMDGPU::S_MOVRELD_B32:
97   case AMDGPU::S_MOVRELD_B64:
98     return true;
99   default:
100     return false;
101   }
102 }
103 
104 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
105                                     const MachineInstr &MI) {
106   if (TII.isAlwaysGDS(MI.getOpcode()))
107     return true;
108 
109   switch (MI.getOpcode()) {
110   case AMDGPU::S_SENDMSG:
111   case AMDGPU::S_SENDMSGHALT:
112   case AMDGPU::S_TTRACEDATA:
113     return true;
114   // These DS opcodes don't support GDS.
115   case AMDGPU::DS_NOP:
116   case AMDGPU::DS_PERMUTE_B32:
117   case AMDGPU::DS_BPERMUTE_B32:
118     return false;
119   default:
120     if (TII.isDS(MI.getOpcode())) {
121       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
122                                            AMDGPU::OpName::gds);
123       if (MI.getOperand(GDS).getImm())
124         return true;
125     }
126     return false;
127   }
128 }
129 
130 static bool isPermlane(const MachineInstr &MI) {
131   unsigned Opcode = MI.getOpcode();
132   return Opcode == AMDGPU::V_PERMLANE16_B32 ||
133          Opcode == AMDGPU::V_PERMLANEX16_B32;
134 }
135 
136 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
137   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
138                                                      AMDGPU::OpName::simm16);
139   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
140 }
141 
142 ScheduleHazardRecognizer::HazardType
143 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
144   MachineInstr *MI = SU->getInstr();
145   // If we are not in "HazardRecognizerMode" and therefore not being run from
146   // the scheduler, track possible stalls from hazards but don't insert noops.
147   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
148 
149   if (MI->isBundle())
150    return NoHazard;
151 
152   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
153     return HazardType;
154 
155   // FIXME: Should flat be considered vmem?
156   if ((SIInstrInfo::isVMEM(*MI) ||
157        SIInstrInfo::isFLAT(*MI))
158       && checkVMEMHazards(MI) > 0)
159     return HazardType;
160 
161   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
162     return HazardType;
163 
164   if (checkFPAtomicToDenormModeHazard(MI) > 0)
165     return HazardType;
166 
167   if (ST.hasNoDataDepHazard())
168     return NoHazard;
169 
170   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
171     return HazardType;
172 
173   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
174     return HazardType;
175 
176   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
177     return HazardType;
178 
179   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
180     return HazardType;
181 
182   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
183     return HazardType;
184 
185   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
186     return HazardType;
187 
188   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
189     return HazardType;
190 
191   if (ST.hasReadM0MovRelInterpHazard() &&
192       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
193       checkReadM0Hazards(MI) > 0)
194     return HazardType;
195 
196   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
197       checkReadM0Hazards(MI) > 0)
198     return HazardType;
199 
200   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
201     return HazardType;
202 
203   if ((SIInstrInfo::isVMEM(*MI) ||
204        SIInstrInfo::isFLAT(*MI) ||
205        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
206     return HazardType;
207 
208   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
209     return HazardType;
210 
211   return NoHazard;
212 }
213 
214 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
215                                 unsigned Quantity) {
216   while (Quantity > 0) {
217     unsigned Arg = std::min(Quantity, 8u);
218     Quantity -= Arg;
219     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
220         .addImm(Arg - 1);
221   }
222 }
223 
224 void GCNHazardRecognizer::processBundle() {
225   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
226   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
227   // Check bundled MachineInstr's for hazards.
228   for (; MI != E && MI->isInsideBundle(); ++MI) {
229     CurrCycleInstr = &*MI;
230     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
231 
232     if (IsHazardRecognizerMode) {
233       fixHazards(CurrCycleInstr);
234 
235       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
236     }
237 
238     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
239     // include the bundled MI directly after, only add a maximum of
240     // (MaxLookAhead - 1) noops to EmittedInstrs.
241     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
242       EmittedInstrs.push_front(nullptr);
243 
244     EmittedInstrs.push_front(CurrCycleInstr);
245     EmittedInstrs.resize(MaxLookAhead);
246   }
247   CurrCycleInstr = nullptr;
248 }
249 
250 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
251   IsHazardRecognizerMode = true;
252   CurrCycleInstr = MI;
253   unsigned W = PreEmitNoopsCommon(MI);
254   fixHazards(MI);
255   CurrCycleInstr = nullptr;
256   return W;
257 }
258 
259 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
260   if (MI->isBundle())
261     return 0;
262 
263   int WaitStates = 0;
264 
265   if (SIInstrInfo::isSMRD(*MI))
266     return std::max(WaitStates, checkSMRDHazards(MI));
267 
268   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
269     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
270 
271   if (ST.hasNSAtoVMEMBug())
272     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
273 
274   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
275 
276   if (ST.hasNoDataDepHazard())
277     return WaitStates;
278 
279   if (SIInstrInfo::isVALU(*MI))
280     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
281 
282   if (SIInstrInfo::isDPP(*MI))
283     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
284 
285   if (isDivFMas(MI->getOpcode()))
286     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
287 
288   if (isRWLane(MI->getOpcode()))
289     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
290 
291   if (MI->isInlineAsm())
292     return std::max(WaitStates, checkInlineAsmHazards(MI));
293 
294   if (isSGetReg(MI->getOpcode()))
295     return std::max(WaitStates, checkGetRegHazards(MI));
296 
297   if (isSSetReg(MI->getOpcode()))
298     return std::max(WaitStates, checkSetRegHazards(MI));
299 
300   if (isRFE(MI->getOpcode()))
301     return std::max(WaitStates, checkRFEHazards(MI));
302 
303   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
304                                            isSMovRel(MI->getOpcode())))
305     return std::max(WaitStates, checkReadM0Hazards(MI));
306 
307   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
308     return std::max(WaitStates, checkReadM0Hazards(MI));
309 
310   if (SIInstrInfo::isMAI(*MI))
311     return std::max(WaitStates, checkMAIHazards(MI));
312 
313   if (SIInstrInfo::isVMEM(*MI) ||
314       SIInstrInfo::isFLAT(*MI) ||
315       SIInstrInfo::isDS(*MI))
316     return std::max(WaitStates, checkMAILdStHazards(MI));
317 
318   return WaitStates;
319 }
320 
321 void GCNHazardRecognizer::EmitNoop() {
322   EmittedInstrs.push_front(nullptr);
323 }
324 
325 void GCNHazardRecognizer::AdvanceCycle() {
326   // When the scheduler detects a stall, it will call AdvanceCycle() without
327   // emitting any instructions.
328   if (!CurrCycleInstr) {
329     EmittedInstrs.push_front(nullptr);
330     return;
331   }
332 
333   // Do not track non-instructions which do not affect the wait states.
334   // If included, these instructions can lead to buffer overflow such that
335   // detectable hazards are missed.
336   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
337       CurrCycleInstr->isKill()) {
338     CurrCycleInstr = nullptr;
339     return;
340   }
341 
342   if (CurrCycleInstr->isBundle()) {
343     processBundle();
344     return;
345   }
346 
347   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
348 
349   // Keep track of emitted instructions
350   EmittedInstrs.push_front(CurrCycleInstr);
351 
352   // Add a nullptr for each additional wait state after the first.  Make sure
353   // not to add more than getMaxLookAhead() items to the list, since we
354   // truncate the list to that size right after this loop.
355   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
356        i < e; ++i) {
357     EmittedInstrs.push_front(nullptr);
358   }
359 
360   // getMaxLookahead() is the largest number of wait states we will ever need
361   // to insert, so there is no point in keeping track of more than that many
362   // wait states.
363   EmittedInstrs.resize(getMaxLookAhead());
364 
365   CurrCycleInstr = nullptr;
366 }
367 
368 void GCNHazardRecognizer::RecedeCycle() {
369   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
370 }
371 
372 //===----------------------------------------------------------------------===//
373 // Helper Functions
374 //===----------------------------------------------------------------------===//
375 
376 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
377 
378 // Returns a minimum wait states since \p I walking all predecessors.
379 // Only scans until \p IsExpired does not return true.
380 // Can only be run in a hazard recognizer mode.
381 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
382                               MachineBasicBlock *MBB,
383                               MachineBasicBlock::reverse_instr_iterator I,
384                               int WaitStates,
385                               IsExpiredFn IsExpired,
386                               DenseSet<const MachineBasicBlock *> &Visited) {
387   for (auto E = MBB->instr_rend(); I != E; ++I) {
388     // Don't add WaitStates for parent BUNDLE instructions.
389     if (I->isBundle())
390       continue;
391 
392     if (IsHazard(&*I))
393       return WaitStates;
394 
395     if (I->isInlineAsm() || I->isMetaInstruction())
396       continue;
397 
398     WaitStates += SIInstrInfo::getNumWaitStates(*I);
399 
400     if (IsExpired(&*I, WaitStates))
401       return std::numeric_limits<int>::max();
402   }
403 
404   int MinWaitStates = WaitStates;
405   bool Found = false;
406   for (MachineBasicBlock *Pred : MBB->predecessors()) {
407     if (!Visited.insert(Pred).second)
408       continue;
409 
410     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
411                                WaitStates, IsExpired, Visited);
412 
413     if (W == std::numeric_limits<int>::max())
414       continue;
415 
416     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
417     if (IsExpired(nullptr, MinWaitStates))
418       return MinWaitStates;
419 
420     Found = true;
421   }
422 
423   if (Found)
424     return MinWaitStates;
425 
426   return std::numeric_limits<int>::max();
427 }
428 
429 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
430                               MachineInstr *MI,
431                               IsExpiredFn IsExpired) {
432   DenseSet<const MachineBasicBlock *> Visited;
433   return getWaitStatesSince(IsHazard, MI->getParent(),
434                             std::next(MI->getReverseIterator()),
435                             0, IsExpired, Visited);
436 }
437 
438 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
439   if (IsHazardRecognizerMode) {
440     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
441       return WaitStates >= Limit;
442     };
443     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
444   }
445 
446   int WaitStates = 0;
447   for (MachineInstr *MI : EmittedInstrs) {
448     if (MI) {
449       if (IsHazard(MI))
450         return WaitStates;
451 
452       if (MI->isInlineAsm())
453         continue;
454     }
455     ++WaitStates;
456 
457     if (WaitStates >= Limit)
458       break;
459   }
460   return std::numeric_limits<int>::max();
461 }
462 
463 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
464                                                IsHazardFn IsHazardDef,
465                                                int Limit) {
466   const SIRegisterInfo *TRI = ST.getRegisterInfo();
467 
468   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
469     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
470   };
471 
472   return getWaitStatesSince(IsHazardFn, Limit);
473 }
474 
475 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
476                                                   int Limit) {
477   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
478     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
479   };
480 
481   return getWaitStatesSince(IsHazardFn, Limit);
482 }
483 
484 //===----------------------------------------------------------------------===//
485 // No-op Hazard Detection
486 //===----------------------------------------------------------------------===//
487 
488 static void addRegUnits(const SIRegisterInfo &TRI,
489                         BitVector &BV, unsigned Reg) {
490   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
491     BV.set(*RUI);
492 }
493 
494 static void addRegsToSet(const SIRegisterInfo &TRI,
495                          iterator_range<MachineInstr::const_mop_iterator> Ops,
496                          BitVector &Set) {
497   for (const MachineOperand &Op : Ops) {
498     if (Op.isReg())
499       addRegUnits(TRI, Set, Op.getReg());
500   }
501 }
502 
503 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
504   // XXX: Do we need to worry about implicit operands
505   addRegsToSet(TRI, MI.defs(), ClauseDefs);
506   addRegsToSet(TRI, MI.uses(), ClauseUses);
507 }
508 
509 static bool breaksSMEMSoftClause(MachineInstr *MI) {
510   return !SIInstrInfo::isSMRD(*MI);
511 }
512 
513 static bool breaksVMEMSoftClause(MachineInstr *MI) {
514   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
515 }
516 
517 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
518   // SMEM soft clause are only present on VI+, and only matter if xnack is
519   // enabled.
520   if (!ST.isXNACKEnabled())
521     return 0;
522 
523   bool IsSMRD = TII.isSMRD(*MEM);
524 
525   resetClause();
526 
527   // A soft-clause is any group of consecutive SMEM instructions.  The
528   // instructions in this group may return out of order and/or may be
529   // replayed (i.e. the same instruction issued more than once).
530   //
531   // In order to handle these situations correctly we need to make sure that
532   // when a clause has more than one instruction, no instruction in the clause
533   // writes to a register that is read by another instruction in the clause
534   // (including itself). If we encounter this situaion, we need to break the
535   // clause by inserting a non SMEM instruction.
536 
537   for (MachineInstr *MI : EmittedInstrs) {
538     // When we hit a non-SMEM instruction then we have passed the start of the
539     // clause and we can stop.
540     if (!MI)
541       break;
542 
543     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
544       break;
545 
546     addClauseInst(*MI);
547   }
548 
549   if (ClauseDefs.none())
550     return 0;
551 
552   // We need to make sure not to put loads and stores in the same clause if they
553   // use the same address. For now, just start a new clause whenever we see a
554   // store.
555   if (MEM->mayStore())
556     return 1;
557 
558   addClauseInst(*MEM);
559 
560   // If the set of defs and uses intersect then we cannot add this instruction
561   // to the clause, so we have a hazard.
562   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
563 }
564 
565 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
566   int WaitStatesNeeded = 0;
567 
568   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
569 
570   // This SMRD hazard only affects SI.
571   if (!ST.hasSMRDReadVALUDefHazard())
572     return WaitStatesNeeded;
573 
574   // A read of an SGPR by SMRD instruction requires 4 wait states when the
575   // SGPR was written by a VALU instruction.
576   int SmrdSgprWaitStates = 4;
577   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
578   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
579 
580   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
581 
582   for (const MachineOperand &Use : SMRD->uses()) {
583     if (!Use.isReg())
584       continue;
585     int WaitStatesNeededForUse =
586         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
587                                                    SmrdSgprWaitStates);
588     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
589 
590     // This fixes what appears to be undocumented hardware behavior in SI where
591     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
592     // needs some number of nops in between. We don't know how many we need, but
593     // let's use 4. This wasn't discovered before probably because the only
594     // case when this happens is when we expand a 64-bit pointer into a full
595     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
596     // probably never encountered in the closed-source land.
597     if (IsBufferSMRD) {
598       int WaitStatesNeededForUse =
599         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
600                                                    IsBufferHazardDefFn,
601                                                    SmrdSgprWaitStates);
602       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
603     }
604   }
605 
606   return WaitStatesNeeded;
607 }
608 
609 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
610   if (!ST.hasVMEMReadSGPRVALUDefHazard())
611     return 0;
612 
613   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
614 
615   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
616   // SGPR was written by a VALU Instruction.
617   const int VmemSgprWaitStates = 5;
618   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
619   for (const MachineOperand &Use : VMEM->uses()) {
620     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
621       continue;
622 
623     int WaitStatesNeededForUse =
624         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
625                                                    VmemSgprWaitStates);
626     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
627   }
628   return WaitStatesNeeded;
629 }
630 
631 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
632   const SIRegisterInfo *TRI = ST.getRegisterInfo();
633   const SIInstrInfo *TII = ST.getInstrInfo();
634 
635   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
636   int DppVgprWaitStates = 2;
637   int DppExecWaitStates = 5;
638   int WaitStatesNeeded = 0;
639   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
640 
641   for (const MachineOperand &Use : DPP->uses()) {
642     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
643       continue;
644     int WaitStatesNeededForUse =
645         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
646                               [](MachineInstr *) { return true; },
647                               DppVgprWaitStates);
648     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
649   }
650 
651   WaitStatesNeeded = std::max(
652       WaitStatesNeeded,
653       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
654                                                 DppExecWaitStates));
655 
656   return WaitStatesNeeded;
657 }
658 
659 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
660   const SIInstrInfo *TII = ST.getInstrInfo();
661 
662   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
663   // instruction.
664   const int DivFMasWaitStates = 4;
665   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
666   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
667                                                DivFMasWaitStates);
668 
669   return DivFMasWaitStates - WaitStatesNeeded;
670 }
671 
672 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
673   const SIInstrInfo *TII = ST.getInstrInfo();
674   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
675 
676   const int GetRegWaitStates = 2;
677   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
678     return GetRegHWReg == getHWReg(TII, *MI);
679   };
680   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
681 
682   return GetRegWaitStates - WaitStatesNeeded;
683 }
684 
685 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
686   const SIInstrInfo *TII = ST.getInstrInfo();
687   unsigned HWReg = getHWReg(TII, *SetRegInstr);
688 
689   const int SetRegWaitStates = ST.getSetRegWaitStates();
690   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
691     return HWReg == getHWReg(TII, *MI);
692   };
693   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
694   return SetRegWaitStates - WaitStatesNeeded;
695 }
696 
697 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
698   if (!MI.mayStore())
699     return -1;
700 
701   const SIInstrInfo *TII = ST.getInstrInfo();
702   unsigned Opcode = MI.getOpcode();
703   const MCInstrDesc &Desc = MI.getDesc();
704 
705   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
706   int VDataRCID = -1;
707   if (VDataIdx != -1)
708     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
709 
710   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
711     // There is no hazard if the instruction does not use vector regs
712     // (like wbinvl1)
713     if (VDataIdx == -1)
714       return -1;
715     // For MUBUF/MTBUF instructions this hazard only exists if the
716     // instruction is not using a register in the soffset field.
717     const MachineOperand *SOffset =
718         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
719     // If we have no soffset operand, then assume this field has been
720     // hardcoded to zero.
721     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
722         (!SOffset || !SOffset->isReg()))
723       return VDataIdx;
724   }
725 
726   // MIMG instructions create a hazard if they don't use a 256-bit T# and
727   // the store size is greater than 8 bytes and they have more than two bits
728   // of their dmask set.
729   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
730   if (TII->isMIMG(MI)) {
731     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
732     assert(SRsrcIdx != -1 &&
733            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
734     (void)SRsrcIdx;
735   }
736 
737   if (TII->isFLAT(MI)) {
738     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
739     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
740       return DataIdx;
741   }
742 
743   return -1;
744 }
745 
746 int
747 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
748                                             const MachineRegisterInfo &MRI) {
749   // Helper to check for the hazard where VMEM instructions that store more than
750   // 8 bytes can have there store data over written by the next instruction.
751   const SIRegisterInfo *TRI = ST.getRegisterInfo();
752 
753   const int VALUWaitStates = 1;
754   int WaitStatesNeeded = 0;
755 
756   if (!TRI->isVGPR(MRI, Def.getReg()))
757     return WaitStatesNeeded;
758   Register Reg = Def.getReg();
759   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
760     int DataIdx = createsVALUHazard(*MI);
761     return DataIdx >= 0 &&
762     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
763   };
764   int WaitStatesNeededForDef =
765     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
766   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
767 
768   return WaitStatesNeeded;
769 }
770 
771 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
772   // This checks for the hazard where VMEM instructions that store more than
773   // 8 bytes can have there store data over written by the next instruction.
774   if (!ST.has12DWordStoreHazard())
775     return 0;
776 
777   const MachineRegisterInfo &MRI = MF.getRegInfo();
778   int WaitStatesNeeded = 0;
779 
780   for (const MachineOperand &Def : VALU->defs()) {
781     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
782   }
783 
784   return WaitStatesNeeded;
785 }
786 
787 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
788   // This checks for hazards associated with inline asm statements.
789   // Since inline asms can contain just about anything, we use this
790   // to call/leverage other check*Hazard routines. Note that
791   // this function doesn't attempt to address all possible inline asm
792   // hazards (good luck), but is a collection of what has been
793   // problematic thus far.
794 
795   // see checkVALUHazards()
796   if (!ST.has12DWordStoreHazard())
797     return 0;
798 
799   const MachineRegisterInfo &MRI = MF.getRegInfo();
800   int WaitStatesNeeded = 0;
801 
802   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
803        I != E; ++I) {
804     const MachineOperand &Op = IA->getOperand(I);
805     if (Op.isReg() && Op.isDef()) {
806       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
807     }
808   }
809 
810   return WaitStatesNeeded;
811 }
812 
813 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
814   const SIInstrInfo *TII = ST.getInstrInfo();
815   const SIRegisterInfo *TRI = ST.getRegisterInfo();
816   const MachineRegisterInfo &MRI = MF.getRegInfo();
817 
818   const MachineOperand *LaneSelectOp =
819       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
820 
821   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
822     return 0;
823 
824   Register LaneSelectReg = LaneSelectOp->getReg();
825   auto IsHazardFn = [TII] (MachineInstr *MI) {
826     return TII->isVALU(*MI);
827   };
828 
829   const int RWLaneWaitStates = 4;
830   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
831                                               RWLaneWaitStates);
832   return RWLaneWaitStates - WaitStatesSince;
833 }
834 
835 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
836   if (!ST.hasRFEHazards())
837     return 0;
838 
839   const SIInstrInfo *TII = ST.getInstrInfo();
840 
841   const int RFEWaitStates = 1;
842 
843   auto IsHazardFn = [TII] (MachineInstr *MI) {
844     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
845   };
846   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
847   return RFEWaitStates - WaitStatesNeeded;
848 }
849 
850 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
851   const SIInstrInfo *TII = ST.getInstrInfo();
852   const int SMovRelWaitStates = 1;
853   auto IsHazardFn = [TII] (MachineInstr *MI) {
854     return TII->isSALU(*MI);
855   };
856   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
857                                                    SMovRelWaitStates);
858 }
859 
860 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
861   fixVMEMtoScalarWriteHazards(MI);
862   fixVcmpxPermlaneHazards(MI);
863   fixSMEMtoVectorWriteHazards(MI);
864   fixVcmpxExecWARHazard(MI);
865   fixLdsBranchVmemWARHazard(MI);
866 }
867 
868 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
869   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
870     return false;
871 
872   const SIInstrInfo *TII = ST.getInstrInfo();
873   auto IsHazardFn = [TII] (MachineInstr *MI) {
874     return TII->isVOPC(*MI);
875   };
876 
877   auto IsExpiredFn = [] (MachineInstr *MI, int) {
878     if (!MI)
879       return false;
880     unsigned Opc = MI->getOpcode();
881     return SIInstrInfo::isVALU(*MI) &&
882            Opc != AMDGPU::V_NOP_e32 &&
883            Opc != AMDGPU::V_NOP_e64 &&
884            Opc != AMDGPU::V_NOP_sdwa;
885   };
886 
887   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
888       std::numeric_limits<int>::max())
889     return false;
890 
891   // V_NOP will be discarded by SQ.
892   // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
893   // which is always a VGPR and available.
894   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
895   Register Reg = Src0->getReg();
896   bool IsUndef = Src0->isUndef();
897   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
898           TII->get(AMDGPU::V_MOV_B32_e32))
899     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
900     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
901 
902   return true;
903 }
904 
905 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
906   if (!ST.hasVMEMtoScalarWriteHazard())
907     return false;
908 
909   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
910     return false;
911 
912   if (MI->getNumDefs() == 0)
913     return false;
914 
915   const SIRegisterInfo *TRI = ST.getRegisterInfo();
916 
917   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
918     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
919         !SIInstrInfo::isFLAT(*I))
920       return false;
921 
922     for (const MachineOperand &Def : MI->defs()) {
923       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
924       if (!Op)
925         continue;
926       return true;
927     }
928     return false;
929   };
930 
931   auto IsExpiredFn = [](MachineInstr *MI, int) {
932     return MI && (SIInstrInfo::isVALU(*MI) ||
933                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
934                    !MI->getOperand(0).getImm()) ||
935                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
936                    MI->getOperand(0).getImm() == 0xffe3));
937   };
938 
939   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
940       std::numeric_limits<int>::max())
941     return false;
942 
943   const SIInstrInfo *TII = ST.getInstrInfo();
944   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
945           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
946       .addImm(0xffe3);
947   return true;
948 }
949 
950 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
951   if (!ST.hasSMEMtoVectorWriteHazard())
952     return false;
953 
954   if (!SIInstrInfo::isVALU(*MI))
955     return false;
956 
957   unsigned SDSTName;
958   switch (MI->getOpcode()) {
959   case AMDGPU::V_READLANE_B32:
960   case AMDGPU::V_READLANE_B32_gfx10:
961   case AMDGPU::V_READFIRSTLANE_B32:
962     SDSTName = AMDGPU::OpName::vdst;
963     break;
964   default:
965     SDSTName = AMDGPU::OpName::sdst;
966     break;
967   }
968 
969   const SIInstrInfo *TII = ST.getInstrInfo();
970   const SIRegisterInfo *TRI = ST.getRegisterInfo();
971   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
972   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
973   if (!SDST) {
974     for (const auto &MO : MI->implicit_operands()) {
975       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
976         SDST = &MO;
977         break;
978       }
979     }
980   }
981 
982   if (!SDST)
983     return false;
984 
985   const Register SDSTReg = SDST->getReg();
986   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
987     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
988   };
989 
990   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
991     if (MI) {
992       if (TII->isSALU(*MI)) {
993         switch (MI->getOpcode()) {
994         case AMDGPU::S_SETVSKIP:
995         case AMDGPU::S_VERSION:
996         case AMDGPU::S_WAITCNT_VSCNT:
997         case AMDGPU::S_WAITCNT_VMCNT:
998         case AMDGPU::S_WAITCNT_EXPCNT:
999           // These instructions cannot not mitigate the hazard.
1000           return false;
1001         case AMDGPU::S_WAITCNT_LGKMCNT:
1002           // Reducing lgkmcnt count to 0 always mitigates the hazard.
1003           return (MI->getOperand(1).getImm() == 0) &&
1004                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1005         case AMDGPU::S_WAITCNT: {
1006           const int64_t Imm = MI->getOperand(0).getImm();
1007           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1008           return (Decoded.LgkmCnt == 0);
1009         }
1010         default:
1011           // SOPP instructions cannot mitigate the hazard.
1012           if (TII->isSOPP(*MI))
1013             return false;
1014           // At this point the SALU can be assumed to mitigate the hazard
1015           // because either:
1016           // (a) it is independent of the at risk SMEM (breaking chain),
1017           // or
1018           // (b) it is dependent on the SMEM, in which case an appropriate
1019           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1020           //     SMEM instruction.
1021           return true;
1022         }
1023       }
1024     }
1025     return false;
1026   };
1027 
1028   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1029       std::numeric_limits<int>::max())
1030     return false;
1031 
1032   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1033           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1034       .addImm(0);
1035   return true;
1036 }
1037 
1038 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1039   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1040     return false;
1041 
1042   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1043   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1044     return false;
1045 
1046   auto IsHazardFn = [TRI] (MachineInstr *I) {
1047     if (SIInstrInfo::isVALU(*I))
1048       return false;
1049     return I->readsRegister(AMDGPU::EXEC, TRI);
1050   };
1051 
1052   const SIInstrInfo *TII = ST.getInstrInfo();
1053   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1054     if (!MI)
1055       return false;
1056     if (SIInstrInfo::isVALU(*MI)) {
1057       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1058         return true;
1059       for (auto MO : MI->implicit_operands())
1060         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1061           return true;
1062     }
1063     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1064         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1065       return true;
1066     return false;
1067   };
1068 
1069   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1070       std::numeric_limits<int>::max())
1071     return false;
1072 
1073   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1074           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1075     .addImm(0xfffe);
1076   return true;
1077 }
1078 
1079 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1080   if (!ST.hasLdsBranchVmemWARHazard())
1081     return false;
1082 
1083   auto IsHazardInst = [] (const MachineInstr *MI) {
1084     if (SIInstrInfo::isDS(*MI))
1085       return 1;
1086     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1087       return 2;
1088     return 0;
1089   };
1090 
1091   auto InstType = IsHazardInst(MI);
1092   if (!InstType)
1093     return false;
1094 
1095   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1096     return I && (IsHazardInst(I) ||
1097                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1098                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1099                   !I->getOperand(1).getImm()));
1100   };
1101 
1102   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1103     if (!I->isBranch())
1104       return false;
1105 
1106     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1107       auto InstType2 = IsHazardInst(I);
1108       return InstType2 && InstType != InstType2;
1109     };
1110 
1111     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1112       if (!I)
1113         return false;
1114 
1115       auto InstType2 = IsHazardInst(I);
1116       if (InstType == InstType2)
1117         return true;
1118 
1119       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1120              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1121              !I->getOperand(1).getImm();
1122     };
1123 
1124     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1125            std::numeric_limits<int>::max();
1126   };
1127 
1128   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1129       std::numeric_limits<int>::max())
1130     return false;
1131 
1132   const SIInstrInfo *TII = ST.getInstrInfo();
1133   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1134           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1135     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1136     .addImm(0);
1137 
1138   return true;
1139 }
1140 
1141 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1142   int NSAtoVMEMWaitStates = 1;
1143 
1144   if (!ST.hasNSAtoVMEMBug())
1145     return 0;
1146 
1147   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1148     return 0;
1149 
1150   const SIInstrInfo *TII = ST.getInstrInfo();
1151   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1152   if (!Offset || (Offset->getImm() & 6) == 0)
1153     return 0;
1154 
1155   auto IsHazardFn = [TII] (MachineInstr *I) {
1156     if (!SIInstrInfo::isMIMG(*I))
1157       return false;
1158     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1159     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1160            TII->getInstSizeInBytes(*I) >= 16;
1161   };
1162 
1163   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1164 }
1165 
1166 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1167   int FPAtomicToDenormModeWaitStates = 3;
1168 
1169   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1170     return 0;
1171 
1172   auto IsHazardFn = [] (MachineInstr *I) {
1173     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1174       return false;
1175     return SIInstrInfo::isFPAtomic(*I);
1176   };
1177 
1178   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1179     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1180       return true;
1181 
1182     switch (MI->getOpcode()) {
1183     case AMDGPU::S_WAITCNT:
1184     case AMDGPU::S_WAITCNT_VSCNT:
1185     case AMDGPU::S_WAITCNT_VMCNT:
1186     case AMDGPU::S_WAITCNT_EXPCNT:
1187     case AMDGPU::S_WAITCNT_LGKMCNT:
1188     case AMDGPU::S_WAITCNT_IDLE:
1189       return true;
1190     default:
1191       break;
1192     }
1193 
1194     return false;
1195   };
1196 
1197 
1198   return FPAtomicToDenormModeWaitStates -
1199          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1200 }
1201 
1202 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1203   assert(SIInstrInfo::isMAI(*MI));
1204 
1205   int WaitStatesNeeded = 0;
1206   unsigned Opc = MI->getOpcode();
1207 
1208   auto IsVALUFn = [] (MachineInstr *MI) {
1209     return SIInstrInfo::isVALU(*MI);
1210   };
1211 
1212   if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1213     const int LegacyVALUWritesVGPRWaitStates = 2;
1214     const int VALUWritesExecWaitStates = 4;
1215     const int MaxWaitStates = 4;
1216 
1217     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1218       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1219     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1220 
1221     if (WaitStatesNeeded < MaxWaitStates) {
1222       for (const MachineOperand &Use : MI->explicit_uses()) {
1223         const int MaxWaitStates = 2;
1224 
1225         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1226           continue;
1227 
1228         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1229           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1230         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1231 
1232         if (WaitStatesNeeded == MaxWaitStates)
1233           break;
1234       }
1235     }
1236   }
1237 
1238   auto IsMFMAFn = [] (MachineInstr *MI) {
1239     return SIInstrInfo::isMAI(*MI) &&
1240            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1241            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1242   };
1243 
1244   for (const MachineOperand &Op : MI->explicit_operands()) {
1245     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1246       continue;
1247 
1248     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1249       continue;
1250 
1251     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1252     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1253     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1254     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1255     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1256     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1257     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1258     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1259     const int MaxWaitStates = 18;
1260     Register Reg = Op.getReg();
1261     unsigned HazardDefLatency = 0;
1262 
1263     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1264                               (MachineInstr *MI) {
1265       if (!IsMFMAFn(MI))
1266         return false;
1267       Register DstReg = MI->getOperand(0).getReg();
1268       if (DstReg == Reg)
1269         return false;
1270       HazardDefLatency = std::max(HazardDefLatency,
1271                                   TSchedModel.computeInstrLatency(MI));
1272       return TRI.regsOverlap(DstReg, Reg);
1273     };
1274 
1275     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1276                                                    MaxWaitStates);
1277     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1278     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1279     int OpNo = MI->getOperandNo(&Op);
1280     if (OpNo == SrcCIdx) {
1281       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1282     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1283       switch (HazardDefLatency) {
1284       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1285                break;
1286       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1287                break;
1288       case 16: LLVM_FALLTHROUGH;
1289       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1290                break;
1291       }
1292     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1293       switch (HazardDefLatency) {
1294       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1295                break;
1296       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1297                break;
1298       case 16: LLVM_FALLTHROUGH;
1299       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1300                break;
1301       }
1302     }
1303 
1304     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1305     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1306 
1307     if (WaitStatesNeeded == MaxWaitStates)
1308       return WaitStatesNeeded; // Early exit.
1309 
1310     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1311       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1312         return false;
1313       Register DstReg = MI->getOperand(0).getReg();
1314       return TRI.regsOverlap(Reg, DstReg);
1315     };
1316 
1317     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1318     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1319     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1320     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1321     if (OpNo == SrcCIdx)
1322       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1323     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1324       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1325 
1326     WaitStatesNeededForUse = NeedWaitStates -
1327       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1328     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1329 
1330     if (WaitStatesNeeded == MaxWaitStates)
1331       return WaitStatesNeeded; // Early exit.
1332   }
1333 
1334   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1335     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1336     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1337     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1338     const int MaxWaitStates = 13;
1339     Register DstReg = MI->getOperand(0).getReg();
1340     unsigned HazardDefLatency = 0;
1341 
1342     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1343                          (MachineInstr *MI) {
1344       if (!IsMFMAFn(MI))
1345         return false;
1346       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1347       HazardDefLatency = std::max(HazardDefLatency,
1348                                   TSchedModel.computeInstrLatency(MI));
1349       return TRI.regsOverlap(Reg, DstReg);
1350     };
1351 
1352     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1353     int NeedWaitStates;
1354     switch (HazardDefLatency) {
1355     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1356              break;
1357     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1358              break;
1359     case 16: LLVM_FALLTHROUGH;
1360     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1361              break;
1362     }
1363 
1364     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1365     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1366   }
1367 
1368   return WaitStatesNeeded;
1369 }
1370 
1371 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1372   if (!ST.hasMAIInsts())
1373     return 0;
1374 
1375   int WaitStatesNeeded = 0;
1376 
1377   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1378     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1379   };
1380 
1381   for (const MachineOperand &Op : MI->explicit_uses()) {
1382     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1383       continue;
1384 
1385     Register Reg = Op.getReg();
1386 
1387     const int AccVgprReadLdStWaitStates = 2;
1388     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1389     const int MaxWaitStates = 2;
1390 
1391     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1392       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1393     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1394 
1395     if (WaitStatesNeeded == MaxWaitStates)
1396       return WaitStatesNeeded; // Early exit.
1397 
1398     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
1399       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32 &&
1400           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1401         return false;
1402       auto IsVALUFn = [] (MachineInstr *MI) {
1403         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1404       };
1405       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1406              std::numeric_limits<int>::max();
1407     };
1408 
1409     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1410       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1411     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1412   }
1413 
1414   return WaitStatesNeeded;
1415 }
1416 
1417 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1418   if (!SU->isInstr())
1419     return false;
1420 
1421   MachineInstr *MAI = nullptr;
1422   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1423     MAI = nullptr;
1424     if (SIInstrInfo::isMAI(*MI) &&
1425         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1426         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1427       MAI = MI;
1428     return MAI != nullptr;
1429   };
1430 
1431   MachineInstr *MI = SU->getInstr();
1432   if (IsMFMAFn(MI)) {
1433     int W = getWaitStatesSince(IsMFMAFn, 16);
1434     if (MAI)
1435       return W < (int)TSchedModel.computeInstrLatency(MAI);
1436   }
1437 
1438   return false;
1439 }
1440