1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33 
34 using namespace llvm;
35 
36 //===----------------------------------------------------------------------===//
37 // Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39 
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41   IsHazardRecognizerMode(false),
42   CurrCycleInstr(nullptr),
43   MF(MF),
44   ST(MF.getSubtarget<GCNSubtarget>()),
45   TII(*ST.getInstrInfo()),
46   TRI(TII.getRegisterInfo()),
47   ClauseUses(TRI.getNumRegUnits()),
48   ClauseDefs(TRI.getNumRegUnits()) {
49   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50   TSchedModel.init(&ST);
51 }
52 
53 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54   EmitInstruction(SU->getInstr());
55 }
56 
57 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58   CurrCycleInstr = MI;
59 }
60 
61 static bool isDivFMas(unsigned Opcode) {
62   return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63 }
64 
65 static bool isSGetReg(unsigned Opcode) {
66   return Opcode == AMDGPU::S_GETREG_B32;
67 }
68 
69 static bool isSSetReg(unsigned Opcode) {
70   switch (Opcode) {
71   case AMDGPU::S_SETREG_B32:
72   case AMDGPU::S_SETREG_B32_mode:
73   case AMDGPU::S_SETREG_IMM32_B32:
74   case AMDGPU::S_SETREG_IMM32_B32_mode:
75     return true;
76   }
77   return false;
78 }
79 
80 static bool isRWLane(unsigned Opcode) {
81   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
82 }
83 
84 static bool isRFE(unsigned Opcode) {
85   return Opcode == AMDGPU::S_RFE_B64;
86 }
87 
88 static bool isSMovRel(unsigned Opcode) {
89   switch (Opcode) {
90   case AMDGPU::S_MOVRELS_B32:
91   case AMDGPU::S_MOVRELS_B64:
92   case AMDGPU::S_MOVRELD_B32:
93   case AMDGPU::S_MOVRELD_B64:
94     return true;
95   default:
96     return false;
97   }
98 }
99 
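// Returns true for instructions that access GDS or send messages / trace
// data, all of which read M0 implicitly.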
100 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
101                                     const MachineInstr &MI) {
102   if (TII.isAlwaysGDS(MI.getOpcode()))
103     return true;
104 
105   switch (MI.getOpcode()) {
106   case AMDGPU::S_SENDMSG:
107   case AMDGPU::S_SENDMSGHALT:
108   case AMDGPU::S_TTRACEDATA:
109     return true;
110   // These DS opcodes don't support GDS.
111   case AMDGPU::DS_NOP:
112   case AMDGPU::DS_PERMUTE_B32:
113   case AMDGPU::DS_BPERMUTE_B32:
114     return false;
115   default:
116     if (TII.isDS(MI.getOpcode())) {
117       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
118                                            AMDGPU::OpName::gds);
119       if (MI.getOperand(GDS).getImm())
120         return true;
121     }
122     return false;
123   }
124 }
125 
126 static bool isPermlane(const MachineInstr &MI) {
127   unsigned Opcode = MI.getOpcode();
128   return Opcode == AMDGPU::V_PERMLANE16_B32 ||
129          Opcode == AMDGPU::V_PERMLANEX16_B32;
130 }
131 
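// Extracts the hardware register ID from the simm16 operand of an
// s_getreg/s_setreg instruction (offset and size fields are masked off).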
132 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
133   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
134                                                      AMDGPU::OpName::simm16);
135   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
136 }
137 
138 ScheduleHazardRecognizer::HazardType
139 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
140   MachineInstr *MI = SU->getInstr();
141   if (MI->isBundle())
142    return NoHazard;
143 
144   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
145     return NoopHazard;
146 
147   // FIXME: Should flat be considered vmem?
148   if ((SIInstrInfo::isVMEM(*MI) ||
149        SIInstrInfo::isFLAT(*MI))
150       && checkVMEMHazards(MI) > 0)
151     return NoopHazard;
152 
153   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
154     return NoopHazard;
155 
156   if (checkFPAtomicToDenormModeHazard(MI) > 0)
157     return NoopHazard;
158 
159   if (ST.hasNoDataDepHazard())
160     return NoHazard;
161 
162   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
163     return NoopHazard;
164 
165   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
166     return NoopHazard;
167 
168   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
169     return NoopHazard;
170 
171   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
172     return NoopHazard;
173 
174   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
175     return NoopHazard;
176 
177   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
178     return NoopHazard;
179 
180   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
181     return NoopHazard;
182 
183   if (ST.hasReadM0MovRelInterpHazard() &&
184       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
185       checkReadM0Hazards(MI) > 0)
186     return NoopHazard;
187 
188   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
189       checkReadM0Hazards(MI) > 0)
190     return NoopHazard;
191 
192   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
193     return NoopHazard;
194 
195   if ((SIInstrInfo::isVMEM(*MI) ||
196        SIInstrInfo::isFLAT(*MI) ||
197        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
198     return NoopHazard;
199 
200   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
201     return NoopHazard;
202 
203   return NoHazard;
204 }
205 
206 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
207   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
208       .addImm(0);
209 }
210 
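// Walks the instructions inside the bundle headed by CurrCycleInstr, inserts
// the s_nops each one needs directly into the bundle, and records them in
// EmittedInstrs so that later lookback queries see through the bundle.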
211 void GCNHazardRecognizer::processBundle() {
212   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
213   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
214   // Check bundled MachineInstr's for hazards.
215   for (; MI != E && MI->isInsideBundle(); ++MI) {
216     CurrCycleInstr = &*MI;
217     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
218 
219     if (IsHazardRecognizerMode)
220       fixHazards(CurrCycleInstr);
221 
222     for (unsigned i = 0; i < WaitStates; ++i)
223       insertNoopInBundle(CurrCycleInstr, TII);
224 
225 // It's unnecessary to track more than MaxLookAhead instructions. Since we
226     // include the bundled MI directly after, only add a maximum of
227     // (MaxLookAhead - 1) noops to EmittedInstrs.
228     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
229       EmittedInstrs.push_front(nullptr);
230 
231     EmittedInstrs.push_front(CurrCycleInstr);
232     EmittedInstrs.resize(MaxLookAhead);
233   }
234   CurrCycleInstr = nullptr;
235 }
236 
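// Entry point for hazard recognizer mode: computes the wait states needed
// before \p MI and applies the instruction-based hazard fixes.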
237 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
238   IsHazardRecognizerMode = true;
239   CurrCycleInstr = MI;
240   unsigned W = PreEmitNoopsCommon(MI);
241   fixHazards(MI);
242   CurrCycleInstr = nullptr;
243   return W;
244 }
245 
246 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
247   if (MI->isBundle())
248     return 0;
249 
250   int WaitStates = 0;
251 
252   if (SIInstrInfo::isSMRD(*MI))
253     return std::max(WaitStates, checkSMRDHazards(MI));
254 
255   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
256     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
257 
258   if (ST.hasNSAtoVMEMBug())
259     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
260 
261   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
262 
263   if (ST.hasNoDataDepHazard())
264     return WaitStates;
265 
266   if (SIInstrInfo::isVALU(*MI))
267     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
268 
269   if (SIInstrInfo::isDPP(*MI))
270     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
271 
272   if (isDivFMas(MI->getOpcode()))
273     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
274 
275   if (isRWLane(MI->getOpcode()))
276     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
277 
278   if (MI->isInlineAsm())
279     return std::max(WaitStates, checkInlineAsmHazards(MI));
280 
281   if (isSGetReg(MI->getOpcode()))
282     return std::max(WaitStates, checkGetRegHazards(MI));
283 
284   if (isSSetReg(MI->getOpcode()))
285     return std::max(WaitStates, checkSetRegHazards(MI));
286 
287   if (isRFE(MI->getOpcode()))
288     return std::max(WaitStates, checkRFEHazards(MI));
289 
290   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
291                                            isSMovRel(MI->getOpcode())))
292     return std::max(WaitStates, checkReadM0Hazards(MI));
293 
294   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
295     return std::max(WaitStates, checkReadM0Hazards(MI));
296 
297   if (SIInstrInfo::isMAI(*MI))
298     return std::max(WaitStates, checkMAIHazards(MI));
299 
300   if (SIInstrInfo::isVMEM(*MI) ||
301       SIInstrInfo::isFLAT(*MI) ||
302       SIInstrInfo::isDS(*MI))
303     return std::max(WaitStates, checkMAILdStHazards(MI));
304 
305   return WaitStates;
306 }
307 
308 void GCNHazardRecognizer::EmitNoop() {
309   EmittedInstrs.push_front(nullptr);
310 }
311 
312 void GCNHazardRecognizer::AdvanceCycle() {
313   // When the scheduler detects a stall, it will call AdvanceCycle() without
314   // emitting any instructions.
315   if (!CurrCycleInstr)
316     return;
317 
318   // Do not track non-instructions which do not affect the wait states.
319 // If included, these instructions can push real instructions out of the
320 // lookahead buffer so that detectable hazards are missed.
321   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
322       CurrCycleInstr->isKill())
323     return;
324 
325   if (CurrCycleInstr->isBundle()) {
326     processBundle();
327     return;
328   }
329 
330   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
331 
332   // Keep track of emitted instructions
333   EmittedInstrs.push_front(CurrCycleInstr);
334 
335   // Add a nullptr for each additional wait state after the first.  Make sure
336   // not to add more than getMaxLookAhead() items to the list, since we
337   // truncate the list to that size right after this loop.
338   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
339        i < e; ++i) {
340     EmittedInstrs.push_front(nullptr);
341   }
342 
343 // getMaxLookAhead() is the largest number of wait states we will ever need
344   // to insert, so there is no point in keeping track of more than that many
345   // wait states.
346   EmittedInstrs.resize(getMaxLookAhead());
347 
348   CurrCycleInstr = nullptr;
349 }
350 
351 void GCNHazardRecognizer::RecedeCycle() {
352   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
353 }
354 
355 //===----------------------------------------------------------------------===//
356 // Helper Functions
357 //===----------------------------------------------------------------------===//
358 
359 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
360 
361 // Returns the minimum number of wait states since \p I, walking all
362 // predecessors. Scanning stops once \p IsExpired returns true.
363 // Can only be run in hazard recognizer mode.
364 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
365                               MachineBasicBlock *MBB,
366                               MachineBasicBlock::reverse_instr_iterator I,
367                               int WaitStates,
368                               IsExpiredFn IsExpired,
369                               DenseSet<const MachineBasicBlock *> &Visited) {
370   for (auto E = MBB->instr_rend(); I != E; ++I) {
371     // Don't add WaitStates for parent BUNDLE instructions.
372     if (I->isBundle())
373       continue;
374 
375     if (IsHazard(&*I))
376       return WaitStates;
377 
378     if (I->isInlineAsm() || I->isMetaInstruction())
379       continue;
380 
381     WaitStates += SIInstrInfo::getNumWaitStates(*I);
382 
383     if (IsExpired(&*I, WaitStates))
384       return std::numeric_limits<int>::max();
385   }
386 
387   int MinWaitStates = WaitStates;
388   bool Found = false;
389   for (MachineBasicBlock *Pred : MBB->predecessors()) {
390     if (!Visited.insert(Pred).second)
391       continue;
392 
393     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
394                                WaitStates, IsExpired, Visited);
395 
396     if (W == std::numeric_limits<int>::max())
397       continue;
398 
399     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
400     if (IsExpired(nullptr, MinWaitStates))
401       return MinWaitStates;
402 
403     Found = true;
404   }
405 
406   if (Found)
407     return MinWaitStates;
408 
409   return std::numeric_limits<int>::max();
410 }
411 
412 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
413                               MachineInstr *MI,
414                               IsExpiredFn IsExpired) {
415   DenseSet<const MachineBasicBlock *> Visited;
416   return getWaitStatesSince(IsHazard, MI->getParent(),
417                             std::next(MI->getReverseIterator()),
418                             0, IsExpired, Visited);
419 }
420 
421 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
422   if (IsHazardRecognizerMode) {
423     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
424       return WaitStates >= Limit;
425     };
426     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
427   }
428 
429   int WaitStates = 0;
430   for (MachineInstr *MI : EmittedInstrs) {
431     if (MI) {
432       if (IsHazard(MI))
433         return WaitStates;
434 
435       if (MI->isInlineAsm())
436         continue;
437     }
438     ++WaitStates;
439 
440     if (WaitStates >= Limit)
441       break;
442   }
443   return std::numeric_limits<int>::max();
444 }
445 
446 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
447                                                IsHazardFn IsHazardDef,
448                                                int Limit) {
449   const SIRegisterInfo *TRI = ST.getRegisterInfo();
450 
451   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
452     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
453   };
454 
455   return getWaitStatesSince(IsHazardFn, Limit);
456 }
457 
458 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
459                                                   int Limit) {
460   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
461     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
462   };
463 
464   return getWaitStatesSince(IsHazardFn, Limit);
465 }
466 
467 //===----------------------------------------------------------------------===//
468 // No-op Hazard Detection
469 //===----------------------------------------------------------------------===//
470 
471 static void addRegUnits(const SIRegisterInfo &TRI,
472                         BitVector &BV, unsigned Reg) {
473   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
474     BV.set(*RUI);
475 }
476 
477 static void addRegsToSet(const SIRegisterInfo &TRI,
478                          iterator_range<MachineInstr::const_mop_iterator> Ops,
479                          BitVector &Set) {
480   for (const MachineOperand &Op : Ops) {
481     if (Op.isReg())
482       addRegUnits(TRI, Set, Op.getReg());
483   }
484 }
485 
486 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
487 // XXX: Do we need to worry about implicit operands?
488   addRegsToSet(TRI, MI.defs(), ClauseDefs);
489   addRegsToSet(TRI, MI.uses(), ClauseUses);
490 }
491 
492 static bool breaksSMEMSoftClause(MachineInstr *MI) {
493   return !SIInstrInfo::isSMRD(*MI);
494 }
495 
496 static bool breaksVMEMSoftClause(MachineInstr *MI) {
497   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
498 }
499 
500 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
501 // SMEM soft clauses are only present on VI+, and only matter if XNACK is
502 // enabled.
503   if (!ST.isXNACKEnabled())
504     return 0;
505 
506   bool IsSMRD = TII.isSMRD(*MEM);
507 
508   resetClause();
509 
510   // A soft-clause is any group of consecutive SMEM instructions.  The
511   // instructions in this group may return out of order and/or may be
512   // replayed (i.e. the same instruction issued more than once).
513   //
514   // In order to handle these situations correctly we need to make sure that
515   // when a clause has more than one instruction, no instruction in the clause
516   // writes to a register that is read by another instruction in the clause
517 // (including itself). If we encounter this situation, we need to break the
518 // clause by inserting a non-SMEM instruction.
519 
520   for (MachineInstr *MI : EmittedInstrs) {
521     // When we hit a non-SMEM instruction then we have passed the start of the
522     // clause and we can stop.
523     if (!MI)
524       break;
525 
526     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
527       break;
528 
529     addClauseInst(*MI);
530   }
531 
532   if (ClauseDefs.none())
533     return 0;
534 
535   // We need to make sure not to put loads and stores in the same clause if they
536   // use the same address. For now, just start a new clause whenever we see a
537   // store.
538   if (MEM->mayStore())
539     return 1;
540 
541   addClauseInst(*MEM);
542 
543   // If the set of defs and uses intersect then we cannot add this instruction
544   // to the clause, so we have a hazard.
545   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
546 }
547 
548 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
549   int WaitStatesNeeded = 0;
550 
551   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
552 
553   // This SMRD hazard only affects SI.
554   if (!ST.hasSMRDReadVALUDefHazard())
555     return WaitStatesNeeded;
556 
557 // A read of an SGPR by an SMRD instruction requires 4 wait states when the
558   // SGPR was written by a VALU instruction.
559   int SmrdSgprWaitStates = 4;
560   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
561   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
562 
563   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
564 
565   for (const MachineOperand &Use : SMRD->uses()) {
566     if (!Use.isReg())
567       continue;
568     int WaitStatesNeededForUse =
569         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
570                                                    SmrdSgprWaitStates);
571     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
572 
573 // This fixes what appears to be undocumented hardware behavior in SI where
574 // an s_mov writing a descriptor and an s_buffer_load_dword reading that
575 // descriptor need some number of nops in between. We don't know how many are
576 // needed, but let's use 4. This probably wasn't discovered before because the
577 // only case in which it happens is when we expand a 64-bit pointer into a
578 // full descriptor and use s_buffer_load_dword instead of s_load_dword, which
579 // was probably never encountered in closed-source land.
580     if (IsBufferSMRD) {
581       int WaitStatesNeededForUse =
582         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
583                                                    IsBufferHazardDefFn,
584                                                    SmrdSgprWaitStates);
585       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
586     }
587   }
588 
589   return WaitStatesNeeded;
590 }
591 
592 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
593   if (!ST.hasVMEMReadSGPRVALUDefHazard())
594     return 0;
595 
596   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
597 
598   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
599 // SGPR was written by a VALU instruction.
600   const int VmemSgprWaitStates = 5;
601   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
602   for (const MachineOperand &Use : VMEM->uses()) {
603     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
604       continue;
605 
606     int WaitStatesNeededForUse =
607         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
608                                                    VmemSgprWaitStates);
609     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
610   }
611   return WaitStatesNeeded;
612 }
613 
614 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
615   const SIRegisterInfo *TRI = ST.getRegisterInfo();
616   const SIInstrInfo *TII = ST.getInstrInfo();
617 
618   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
619   int DppVgprWaitStates = 2;
620   int DppExecWaitStates = 5;
621   int WaitStatesNeeded = 0;
622   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
623 
624   for (const MachineOperand &Use : DPP->uses()) {
625     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
626       continue;
627     int WaitStatesNeededForUse =
628         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
629                               [](MachineInstr *) { return true; },
630                               DppVgprWaitStates);
631     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
632   }
633 
634   WaitStatesNeeded = std::max(
635       WaitStatesNeeded,
636       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
637                                                 DppExecWaitStates));
638 
639   return WaitStatesNeeded;
640 }
641 
642 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
643   const SIInstrInfo *TII = ST.getInstrInfo();
644 
645   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
646   // instruction.
647   const int DivFMasWaitStates = 4;
648   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
649   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
650                                                DivFMasWaitStates);
651 
652   return DivFMasWaitStates - WaitStatesNeeded;
653 }
654 
655 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
656   const SIInstrInfo *TII = ST.getInstrInfo();
657   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
658 
659   const int GetRegWaitStates = 2;
660   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
661     return GetRegHWReg == getHWReg(TII, *MI);
662   };
663   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
664 
665   return GetRegWaitStates - WaitStatesNeeded;
666 }
667 
668 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
669   const SIInstrInfo *TII = ST.getInstrInfo();
670   unsigned HWReg = getHWReg(TII, *SetRegInstr);
671 
672   const int SetRegWaitStates = ST.getSetRegWaitStates();
673   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
674     return HWReg == getHWReg(TII, *MI);
675   };
676   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
677   return SetRegWaitStates - WaitStatesNeeded;
678 }
679 
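// If \p MI is a store affected by the VALU write hazard (it stores more than
// 8 bytes of data), returns the index of its data operand; otherwise -1.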
680 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
681   if (!MI.mayStore())
682     return -1;
683 
684   const SIInstrInfo *TII = ST.getInstrInfo();
685   unsigned Opcode = MI.getOpcode();
686   const MCInstrDesc &Desc = MI.getDesc();
687 
688   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
689   int VDataRCID = -1;
690   if (VDataIdx != -1)
691     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
692 
693   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
694     // There is no hazard if the instruction does not use vector regs
695     // (like wbinvl1)
696     if (VDataIdx == -1)
697       return -1;
698     // For MUBUF/MTBUF instructions this hazard only exists if the
699     // instruction is not using a register in the soffset field.
700     const MachineOperand *SOffset =
701         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
702     // If we have no soffset operand, then assume this field has been
703     // hardcoded to zero.
704     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
705         (!SOffset || !SOffset->isReg()))
706       return VDataIdx;
707   }
708 
709   // MIMG instructions create a hazard if they don't use a 256-bit T# and
710   // the store size is greater than 8 bytes and they have more than two bits
711   // of their dmask set.
712   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
713   if (TII->isMIMG(MI)) {
714     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
715     assert(SRsrcIdx != -1 &&
716            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
717     (void)SRsrcIdx;
718   }
719 
720   if (TII->isFLAT(MI)) {
721     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
722     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
723       return DataIdx;
724   }
725 
726   return -1;
727 }
728 
729 int
730 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
731                                             const MachineRegisterInfo &MRI) {
732 // Helper to check for the hazard where VMEM instructions that store more
733 // than 8 bytes can have their store data overwritten by the next instruction.
734   const SIRegisterInfo *TRI = ST.getRegisterInfo();
735 
736   const int VALUWaitStates = 1;
737   int WaitStatesNeeded = 0;
738 
739   if (!TRI->isVGPR(MRI, Def.getReg()))
740     return WaitStatesNeeded;
741   Register Reg = Def.getReg();
742   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
743     int DataIdx = createsVALUHazard(*MI);
744     return DataIdx >= 0 &&
745     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
746   };
747   int WaitStatesNeededForDef =
748     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
749   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
750 
751   return WaitStatesNeeded;
752 }
753 
754 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
755 // This checks for the hazard where VMEM instructions that store more than
756 // 8 bytes can have their store data overwritten by the next instruction.
757   if (!ST.has12DWordStoreHazard())
758     return 0;
759 
760   const MachineRegisterInfo &MRI = MF.getRegInfo();
761   int WaitStatesNeeded = 0;
762 
763   for (const MachineOperand &Def : VALU->defs()) {
764     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
765   }
766 
767   return WaitStatesNeeded;
768 }
769 
770 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
771   // This checks for hazards associated with inline asm statements.
772   // Since inline asms can contain just about anything, we use this
773   // to call/leverage other check*Hazard routines. Note that
774   // this function doesn't attempt to address all possible inline asm
775   // hazards (good luck), but is a collection of what has been
776   // problematic thus far.
777 
778   // see checkVALUHazards()
779   if (!ST.has12DWordStoreHazard())
780     return 0;
781 
782   const MachineRegisterInfo &MRI = MF.getRegInfo();
783   int WaitStatesNeeded = 0;
784 
785   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
786        I != E; ++I) {
787     const MachineOperand &Op = IA->getOperand(I);
788     if (Op.isReg() && Op.isDef()) {
789       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
790     }
791   }
792 
793   return WaitStatesNeeded;
794 }
795 
796 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
797   const SIInstrInfo *TII = ST.getInstrInfo();
798   const SIRegisterInfo *TRI = ST.getRegisterInfo();
799   const MachineRegisterInfo &MRI = MF.getRegInfo();
800 
801   const MachineOperand *LaneSelectOp =
802       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
803 
804   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
805     return 0;
806 
807   Register LaneSelectReg = LaneSelectOp->getReg();
808   auto IsHazardFn = [TII] (MachineInstr *MI) {
809     return TII->isVALU(*MI);
810   };
811 
812   const int RWLaneWaitStates = 4;
813   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
814                                               RWLaneWaitStates);
815   return RWLaneWaitStates - WaitStatesSince;
816 }
817 
818 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
819   if (!ST.hasRFEHazards())
820     return 0;
821 
822   const SIInstrInfo *TII = ST.getInstrInfo();
823 
824   const int RFEWaitStates = 1;
825 
826   auto IsHazardFn = [TII] (MachineInstr *MI) {
827     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
828   };
829   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
830   return RFEWaitStates - WaitStatesNeeded;
831 }
832 
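// Instructions reading M0 (v_interp, s_movrel*, sendmsg/GDS) need a wait
// state after an SALU write of M0.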
833 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
834   const SIInstrInfo *TII = ST.getInstrInfo();
835   const int SMovRelWaitStates = 1;
836   auto IsHazardFn = [TII] (MachineInstr *MI) {
837     return TII->isSALU(*MI);
838   };
839   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
840                                                    SMovRelWaitStates);
841 }
842 
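// Hazards that are resolved by inserting or rewriting instructions, rather
// than by emitting s_nops, are handled here.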
843 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
844   fixVMEMtoScalarWriteHazards(MI);
845   fixVcmpxPermlaneHazards(MI);
846   fixSMEMtoVectorWriteHazards(MI);
847   fixVcmpxExecWARHazard(MI);
848   fixLdsBranchVmemWARHazard(MI);
849 }
850 
851 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
852   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
853     return false;
854 
855   const SIInstrInfo *TII = ST.getInstrInfo();
856   auto IsHazardFn = [TII] (MachineInstr *MI) {
857     return TII->isVOPC(*MI);
858   };
859 
860   auto IsExpiredFn = [] (MachineInstr *MI, int) {
861     if (!MI)
862       return false;
863     unsigned Opc = MI->getOpcode();
864     return SIInstrInfo::isVALU(*MI) &&
865            Opc != AMDGPU::V_NOP_e32 &&
866            Opc != AMDGPU::V_NOP_e64 &&
867            Opc != AMDGPU::V_NOP_sdwa;
868   };
869 
870   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
871       std::numeric_limits<int>::max())
872     return false;
873 
874   // V_NOP will be discarded by SQ.
875 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
876   // which is always a VGPR and available.
877   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
878   Register Reg = Src0->getReg();
879   bool IsUndef = Src0->isUndef();
880   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
881           TII->get(AMDGPU::V_MOV_B32_e32))
882     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
883     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
884 
885   return true;
886 }
887 
888 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
889   if (!ST.hasVMEMtoScalarWriteHazard())
890     return false;
891 
892   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
893     return false;
894 
895   if (MI->getNumDefs() == 0)
896     return false;
897 
898   const SIRegisterInfo *TRI = ST.getRegisterInfo();
899 
900   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
901     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
902         !SIInstrInfo::isFLAT(*I))
903       return false;
904 
905     for (const MachineOperand &Def : MI->defs()) {
906       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
907       if (!Op)
908         continue;
909       return true;
910     }
911     return false;
912   };
913 
914   auto IsExpiredFn = [](MachineInstr *MI, int) {
915     return MI && (SIInstrInfo::isVALU(*MI) ||
916                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
917                    !MI->getOperand(0).getImm()) ||
918                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
919                    MI->getOperand(0).getImm() == 0xffe3));
920   };
921 
922   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
923       std::numeric_limits<int>::max())
924     return false;
925 
926   const SIInstrInfo *TII = ST.getInstrInfo();
927   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
928           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
929       .addImm(0xffe3);
930   return true;
931 }
932 
933 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
934   if (!ST.hasSMEMtoVectorWriteHazard())
935     return false;
936 
937   if (!SIInstrInfo::isVALU(*MI))
938     return false;
939 
940   unsigned SDSTName;
941   switch (MI->getOpcode()) {
942   case AMDGPU::V_READLANE_B32:
943   case AMDGPU::V_READLANE_B32_gfx10:
944   case AMDGPU::V_READFIRSTLANE_B32:
945     SDSTName = AMDGPU::OpName::vdst;
946     break;
947   default:
948     SDSTName = AMDGPU::OpName::sdst;
949     break;
950   }
951 
952   const SIInstrInfo *TII = ST.getInstrInfo();
953   const SIRegisterInfo *TRI = ST.getRegisterInfo();
954   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
955   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
956   if (!SDST) {
957     for (const auto &MO : MI->implicit_operands()) {
958       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
959         SDST = &MO;
960         break;
961       }
962     }
963   }
964 
965   if (!SDST)
966     return false;
967 
968   const Register SDSTReg = SDST->getReg();
969   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
970     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
971   };
972 
973   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
974     if (MI) {
975       if (TII->isSALU(*MI)) {
976         switch (MI->getOpcode()) {
977         case AMDGPU::S_SETVSKIP:
978         case AMDGPU::S_VERSION:
979         case AMDGPU::S_WAITCNT_VSCNT:
980         case AMDGPU::S_WAITCNT_VMCNT:
981         case AMDGPU::S_WAITCNT_EXPCNT:
982 // These instructions cannot mitigate the hazard.
983           return false;
984         case AMDGPU::S_WAITCNT_LGKMCNT:
985 // Reducing lgkmcnt to 0 always mitigates the hazard.
986           return (MI->getOperand(1).getImm() == 0) &&
987                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
988         case AMDGPU::S_WAITCNT: {
989           const int64_t Imm = MI->getOperand(0).getImm();
990           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
991           return (Decoded.LgkmCnt == 0);
992         }
993         default:
994           // SOPP instructions cannot mitigate the hazard.
995           if (TII->isSOPP(*MI))
996             return false;
997           // At this point the SALU can be assumed to mitigate the hazard
998           // because either:
999           // (a) it is independent of the at risk SMEM (breaking chain),
1000           // or
1001           // (b) it is dependent on the SMEM, in which case an appropriate
1002           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1003           //     SMEM instruction.
1004           return true;
1005         }
1006       }
1007     }
1008     return false;
1009   };
1010 
1011   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1012       std::numeric_limits<int>::max())
1013     return false;
1014 
1015   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1016           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1017       .addImm(0);
1018   return true;
1019 }
1020 
1021 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1022   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1023     return false;
1024 
1025   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1026   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1027     return false;
1028 
1029   auto IsHazardFn = [TRI] (MachineInstr *I) {
1030     if (SIInstrInfo::isVALU(*I))
1031       return false;
1032     return I->readsRegister(AMDGPU::EXEC, TRI);
1033   };
1034 
1035   const SIInstrInfo *TII = ST.getInstrInfo();
1036   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1037     if (!MI)
1038       return false;
1039     if (SIInstrInfo::isVALU(*MI)) {
1040       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1041         return true;
1042       for (auto MO : MI->implicit_operands())
1043         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1044           return true;
1045     }
1046     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1047         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1048       return true;
1049     return false;
1050   };
1051 
1052   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1053       std::numeric_limits<int>::max())
1054     return false;
1055 
1056   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1057           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1058     .addImm(0xfffe);
1059   return true;
1060 }
1061 
1062 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1063   if (!ST.hasLdsBranchVmemWARHazard())
1064     return false;
1065 
1066   auto IsHazardInst = [] (const MachineInstr *MI) {
1067     if (SIInstrInfo::isDS(*MI))
1068       return 1;
1069     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1070       return 2;
1071     return 0;
1072   };
1073 
1074   auto InstType = IsHazardInst(MI);
1075   if (!InstType)
1076     return false;
1077 
1078   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1079     return I && (IsHazardInst(I) ||
1080                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1081                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1082                   !I->getOperand(1).getImm()));
1083   };
1084 
1085   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1086     if (!I->isBranch())
1087       return false;
1088 
1089     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1090       auto InstType2 = IsHazardInst(I);
1091       return InstType2 && InstType != InstType2;
1092     };
1093 
1094     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1095       if (!I)
1096         return false;
1097 
1098       auto InstType2 = IsHazardInst(I);
1099       if (InstType == InstType2)
1100         return true;
1101 
1102       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1103              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1104              !I->getOperand(1).getImm();
1105     };
1106 
1107     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1108            std::numeric_limits<int>::max();
1109   };
1110 
1111   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1112       std::numeric_limits<int>::max())
1113     return false;
1114 
1115   const SIInstrInfo *TII = ST.getInstrInfo();
1116   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1117           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1118     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1119     .addImm(0);
1120 
1121   return true;
1122 }
1123 
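// On subtargets with the NSA-to-VMEM bug, a MUBUF/MTBUF instruction whose
// immediate offset has bit 1 or 2 set needs a wait state after a large
// (>= 16 byte) NSA-encoded MIMG instruction.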
1124 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1125   int NSAtoVMEMWaitStates = 1;
1126 
1127   if (!ST.hasNSAtoVMEMBug())
1128     return 0;
1129 
1130   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1131     return 0;
1132 
1133   const SIInstrInfo *TII = ST.getInstrInfo();
1134   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1135   if (!Offset || (Offset->getImm() & 6) == 0)
1136     return 0;
1137 
1138   auto IsHazardFn = [TII] (MachineInstr *I) {
1139     if (!SIInstrInfo::isMIMG(*I))
1140       return false;
1141     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1142     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1143            TII->getInstSizeInBytes(*I) >= 16;
1144   };
1145 
1146   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1147 }
1148 
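// s_denorm_mode needs up to 3 wait states after an FP atomic VMEM/FLAT
// instruction unless an intervening s_waitcnt or VALU instruction already
// covers the distance.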
1149 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1150   int FPAtomicToDenormModeWaitStates = 3;
1151 
1152   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1153     return 0;
1154 
1155   auto IsHazardFn = [] (MachineInstr *I) {
1156     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1157       return false;
1158     return SIInstrInfo::isFPAtomic(*I);
1159   };
1160 
1161   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1162     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1163       return true;
1164 
1165     switch (MI->getOpcode()) {
1166     case AMDGPU::S_WAITCNT:
1167     case AMDGPU::S_WAITCNT_VSCNT:
1168     case AMDGPU::S_WAITCNT_VMCNT:
1169     case AMDGPU::S_WAITCNT_EXPCNT:
1170     case AMDGPU::S_WAITCNT_LGKMCNT:
1171     case AMDGPU::S_WAITCNT_IDLE:
1172       return true;
1173     default:
1174       break;
1175     }
1176 
1177     return false;
1178   };
1179 
1180 
1181   return FPAtomicToDenormModeWaitStates -
1182          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1183 }
1184 
1185 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1186   assert(SIInstrInfo::isMAI(*MI));
1187 
1188   int WaitStatesNeeded = 0;
1189   unsigned Opc = MI->getOpcode();
1190 
1191   auto IsVALUFn = [] (MachineInstr *MI) {
1192     return SIInstrInfo::isVALU(*MI);
1193   };
1194 
1195   if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1196     const int LegacyVALUWritesVGPRWaitStates = 2;
1197     const int VALUWritesExecWaitStates = 4;
1198     const int MaxWaitStates = 4;
1199 
1200     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1201       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1202     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1203 
1204     if (WaitStatesNeeded < MaxWaitStates) {
1205       for (const MachineOperand &Use : MI->explicit_uses()) {
1206         const int MaxWaitStates = 2;
1207 
1208         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1209           continue;
1210 
1211         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1212           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1213         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1214 
1215         if (WaitStatesNeeded == MaxWaitStates)
1216           break;
1217       }
1218     }
1219   }
1220 
1221   auto IsMFMAFn = [] (MachineInstr *MI) {
1222     return SIInstrInfo::isMAI(*MI) &&
1223            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1224            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1225   };
1226 
1227   for (const MachineOperand &Op : MI->explicit_operands()) {
1228     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1229       continue;
1230 
1231     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1232       continue;
1233 
1234     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1235     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1236     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1237     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1238     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1239     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1240     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1241     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1242     const int MaxWaitStates = 18;
1243     Register Reg = Op.getReg();
1244     unsigned HazardDefLatency = 0;
1245 
1246     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1247                               (MachineInstr *MI) {
1248       if (!IsMFMAFn(MI))
1249         return false;
1250       Register DstReg = MI->getOperand(0).getReg();
1251       if (DstReg == Reg)
1252         return false;
1253       HazardDefLatency = std::max(HazardDefLatency,
1254                                   TSchedModel.computeInstrLatency(MI));
1255       return TRI.regsOverlap(DstReg, Reg);
1256     };
1257 
1258     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1259                                                    MaxWaitStates);
1260     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1261     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1262     int OpNo = MI->getOperandNo(&Op);
1263     if (OpNo == SrcCIdx) {
1264       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1265     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1266       switch (HazardDefLatency) {
1267       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1268                break;
1269       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1270                break;
1271       case 16: LLVM_FALLTHROUGH;
1272       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1273                break;
1274       }
1275     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1276       switch (HazardDefLatency) {
1277       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1278                break;
1279       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1280                break;
1281       case 16: LLVM_FALLTHROUGH;
1282       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1283                break;
1284       }
1285     }
1286 
1287     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1288     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1289 
1290     if (WaitStatesNeeded == MaxWaitStates)
1291       return WaitStatesNeeded; // Early exit.
1292 
1293     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1294       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1295         return false;
1296       Register DstReg = MI->getOperand(0).getReg();
1297       return TRI.regsOverlap(Reg, DstReg);
1298     };
1299 
1300     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1301     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1302     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1303     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1304     if (OpNo == SrcCIdx)
1305       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1306     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1307       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1308 
1309     WaitStatesNeededForUse = NeedWaitStates -
1310       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1311     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1312 
1313     if (WaitStatesNeeded == MaxWaitStates)
1314       return WaitStatesNeeded; // Early exit.
1315   }
1316 
1317   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1318     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1319     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1320     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1321     const int MaxWaitStates = 13;
1322     Register DstReg = MI->getOperand(0).getReg();
1323     unsigned HazardDefLatency = 0;
1324 
1325     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1326                          (MachineInstr *MI) {
1327       if (!IsMFMAFn(MI))
1328         return false;
1329       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1330       HazardDefLatency = std::max(HazardDefLatency,
1331                                   TSchedModel.computeInstrLatency(MI));
1332       return TRI.regsOverlap(Reg, DstReg);
1333     };
1334 
1335     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1336     int NeedWaitStates;
1337     switch (HazardDefLatency) {
1338     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1339              break;
1340     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1341              break;
1342     case 16: LLVM_FALLTHROUGH;
1343     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1344              break;
1345     }
1346 
1347     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1348     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1349   }
1350 
1351   return WaitStatesNeeded;
1352 }
1353 
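// Checks wait states for VMEM/FLAT/DS instructions whose VGPR operands were
// recently written by v_accvgpr_read.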
1354 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1355   if (!ST.hasMAIInsts())
1356     return 0;
1357 
1358   int WaitStatesNeeded = 0;
1359 
1360   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1361     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1362   };
1363 
1364   for (const MachineOperand &Op : MI->explicit_uses()) {
1365     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1366       continue;
1367 
1368     Register Reg = Op.getReg();
1369 
1370     const int AccVgprReadLdStWaitStates = 2;
1371     const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1372     const int MaxWaitStates = 2;
1373 
1374     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1375       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1376     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1377 
1378     if (WaitStatesNeeded == MaxWaitStates)
1379       return WaitStatesNeeded; // Early exit.
1380 
1381     auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1382       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1383         return false;
1384       auto IsVALUFn = [] (MachineInstr *MI) {
1385         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1386       };
1387       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1388              std::numeric_limits<int>::max();
1389     };
1390 
1391     WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1392       getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1393     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1394   }
1395 
1396   return WaitStatesNeeded;
1397 }
1398 
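// When the candidate \p SU is an MFMA, asks the scheduler to prefer another
// candidate if a previous MFMA is still within its latency window.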
1399 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1400   if (!SU->isInstr())
1401     return false;
1402 
1403   MachineInstr *MAI = nullptr;
1404   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1405     MAI = nullptr;
1406     if (SIInstrInfo::isMAI(*MI) &&
1407         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1408         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1409       MAI = MI;
1410     return MAI != nullptr;
1411   };
1412 
1413   MachineInstr *MI = SU->getInstr();
1414   if (IsMFMAFn(MI)) {
1415     int W = getWaitStatesSince(IsMFMAFn, 16);
1416     if (MAI)
1417       return W < (int)TSchedModel.computeInstrLatency(MAI);
1418   }
1419 
1420   return false;
1421 }
1422