xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements hazard recognizers for scheduling on GCN processors.
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric #include "GCNHazardRecognizer.h"
14e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
150b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1681ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
17*0fca6ea1SDimitry Andric #include "llvm/CodeGen/MachineFrameInfo.h"
180b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
190b57cec5SDimitry Andric #include "llvm/CodeGen/ScheduleDAG.h"
2006c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h"
210b57cec5SDimitry Andric 
220b57cec5SDimitry Andric using namespace llvm;
230b57cec5SDimitry Andric 
2481ad6265SDimitry Andric namespace {
2581ad6265SDimitry Andric 
2681ad6265SDimitry Andric struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
2781ad6265SDimitry Andric   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
2881ad6265SDimitry Andric 
2981ad6265SDimitry Andric   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
3081ad6265SDimitry Andric     if (Arg.getAsInteger(0, Value))
3181ad6265SDimitry Andric       return O.error("'" + Arg + "' value invalid for uint argument!");
3281ad6265SDimitry Andric 
3381ad6265SDimitry Andric     if (Value > 100)
3481ad6265SDimitry Andric       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
3581ad6265SDimitry Andric 
3681ad6265SDimitry Andric     return false;
3781ad6265SDimitry Andric   }
3881ad6265SDimitry Andric };
3981ad6265SDimitry Andric 
4081ad6265SDimitry Andric } // end anonymous namespace
4181ad6265SDimitry Andric 
4281ad6265SDimitry Andric static cl::opt<unsigned, false, MFMAPaddingRatioParser>
4381ad6265SDimitry Andric     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
4481ad6265SDimitry Andric                      cl::desc("Fill a percentage of the latency between "
4581ad6265SDimitry Andric                               "neighboring MFMA with s_nops."));
4681ad6265SDimitry Andric 
470b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4881ad6265SDimitry Andric // Hazard Recognizer Implementation
490b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
500b57cec5SDimitry Andric 
51fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52fe6060f1SDimitry Andric                                                  const GCNSubtarget &ST);
53fe6060f1SDimitry Andric 
540b57cec5SDimitry Andric GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
550b57cec5SDimitry Andric   IsHazardRecognizerMode(false),
560b57cec5SDimitry Andric   CurrCycleInstr(nullptr),
570b57cec5SDimitry Andric   MF(MF),
580b57cec5SDimitry Andric   ST(MF.getSubtarget<GCNSubtarget>()),
590b57cec5SDimitry Andric   TII(*ST.getInstrInfo()),
600b57cec5SDimitry Andric   TRI(TII.getRegisterInfo()),
610b57cec5SDimitry Andric   ClauseUses(TRI.getNumRegUnits()),
620b57cec5SDimitry Andric   ClauseDefs(TRI.getNumRegUnits()) {
63fe6060f1SDimitry Andric   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
640b57cec5SDimitry Andric   TSchedModel.init(&ST);
65fe6060f1SDimitry Andric   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
660b57cec5SDimitry Andric }
670b57cec5SDimitry Andric 
68e8d8bef9SDimitry Andric void GCNHazardRecognizer::Reset() {
69e8d8bef9SDimitry Andric   EmittedInstrs.clear();
70e8d8bef9SDimitry Andric }
71e8d8bef9SDimitry Andric 
720b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
730b57cec5SDimitry Andric   EmitInstruction(SU->getInstr());
740b57cec5SDimitry Andric }
750b57cec5SDimitry Andric 
760b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
770b57cec5SDimitry Andric   CurrCycleInstr = MI;
780b57cec5SDimitry Andric }
790b57cec5SDimitry Andric 
800b57cec5SDimitry Andric static bool isDivFMas(unsigned Opcode) {
81e8d8bef9SDimitry Andric   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
820b57cec5SDimitry Andric }
830b57cec5SDimitry Andric 
840b57cec5SDimitry Andric static bool isSGetReg(unsigned Opcode) {
850b57cec5SDimitry Andric   return Opcode == AMDGPU::S_GETREG_B32;
860b57cec5SDimitry Andric }
870b57cec5SDimitry Andric 
880b57cec5SDimitry Andric static bool isSSetReg(unsigned Opcode) {
89e8d8bef9SDimitry Andric   switch (Opcode) {
90e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_B32:
91e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_B32_mode:
92e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_IMM32_B32:
93e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_IMM32_B32_mode:
94e8d8bef9SDimitry Andric     return true;
95e8d8bef9SDimitry Andric   }
96e8d8bef9SDimitry Andric   return false;
970b57cec5SDimitry Andric }
980b57cec5SDimitry Andric 
990b57cec5SDimitry Andric static bool isRWLane(unsigned Opcode) {
1000b57cec5SDimitry Andric   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
1010b57cec5SDimitry Andric }
1020b57cec5SDimitry Andric 
1030b57cec5SDimitry Andric static bool isRFE(unsigned Opcode) {
1040b57cec5SDimitry Andric   return Opcode == AMDGPU::S_RFE_B64;
1050b57cec5SDimitry Andric }
1060b57cec5SDimitry Andric 
1070b57cec5SDimitry Andric static bool isSMovRel(unsigned Opcode) {
1080b57cec5SDimitry Andric   switch (Opcode) {
1090b57cec5SDimitry Andric   case AMDGPU::S_MOVRELS_B32:
1100b57cec5SDimitry Andric   case AMDGPU::S_MOVRELS_B64:
1110b57cec5SDimitry Andric   case AMDGPU::S_MOVRELD_B32:
1120b57cec5SDimitry Andric   case AMDGPU::S_MOVRELD_B64:
1130b57cec5SDimitry Andric     return true;
1140b57cec5SDimitry Andric   default:
1150b57cec5SDimitry Andric     return false;
1160b57cec5SDimitry Andric   }
1170b57cec5SDimitry Andric }
1180b57cec5SDimitry Andric 
119fe6060f1SDimitry Andric static bool isDGEMM(unsigned Opcode) {
12081ad6265SDimitry Andric   return AMDGPU::getMAIIsDGEMM(Opcode);
121fe6060f1SDimitry Andric }
122fe6060f1SDimitry Andric 
123fe6060f1SDimitry Andric static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
124fe6060f1SDimitry Andric   unsigned Opcode = MI.getOpcode();
125fe6060f1SDimitry Andric 
126fe6060f1SDimitry Andric   if (!SIInstrInfo::isMAI(MI) ||
127fe6060f1SDimitry Andric       isDGEMM(Opcode) ||
128fe6060f1SDimitry Andric       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
129fe6060f1SDimitry Andric       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
130fe6060f1SDimitry Andric     return false;
131fe6060f1SDimitry Andric 
13281ad6265SDimitry Andric   if (!ST.hasGFX940Insts())
133fe6060f1SDimitry Andric     return true;
13481ad6265SDimitry Andric 
13581ad6265SDimitry Andric   return AMDGPU::getMAIIsGFX940XDL(Opcode);
136fe6060f1SDimitry Andric }
137fe6060f1SDimitry Andric 
1380b57cec5SDimitry Andric static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
1390b57cec5SDimitry Andric                                     const MachineInstr &MI) {
1400b57cec5SDimitry Andric   if (TII.isAlwaysGDS(MI.getOpcode()))
1410b57cec5SDimitry Andric     return true;
1420b57cec5SDimitry Andric 
1430b57cec5SDimitry Andric   switch (MI.getOpcode()) {
1440b57cec5SDimitry Andric   case AMDGPU::S_SENDMSG:
1450b57cec5SDimitry Andric   case AMDGPU::S_SENDMSGHALT:
1460b57cec5SDimitry Andric   case AMDGPU::S_TTRACEDATA:
1470b57cec5SDimitry Andric     return true;
1480b57cec5SDimitry Andric   // These DS opcodes don't support GDS.
1490b57cec5SDimitry Andric   case AMDGPU::DS_NOP:
1500b57cec5SDimitry Andric   case AMDGPU::DS_PERMUTE_B32:
1510b57cec5SDimitry Andric   case AMDGPU::DS_BPERMUTE_B32:
1520b57cec5SDimitry Andric     return false;
1530b57cec5SDimitry Andric   default:
1540b57cec5SDimitry Andric     if (TII.isDS(MI.getOpcode())) {
1550b57cec5SDimitry Andric       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1560b57cec5SDimitry Andric                                            AMDGPU::OpName::gds);
1570b57cec5SDimitry Andric       if (MI.getOperand(GDS).getImm())
1580b57cec5SDimitry Andric         return true;
1590b57cec5SDimitry Andric     }
1600b57cec5SDimitry Andric     return false;
1610b57cec5SDimitry Andric   }
1620b57cec5SDimitry Andric }
1630b57cec5SDimitry Andric 
1640b57cec5SDimitry Andric static bool isPermlane(const MachineInstr &MI) {
1650b57cec5SDimitry Andric   unsigned Opcode = MI.getOpcode();
166e8d8bef9SDimitry Andric   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
1677a6dacacSDimitry Andric          Opcode == AMDGPU::V_PERMLANE64_B32 ||
1685f757f3fSDimitry Andric          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
1695f757f3fSDimitry Andric          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
1705f757f3fSDimitry Andric          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
1710b57cec5SDimitry Andric }
1720b57cec5SDimitry Andric 
17381ad6265SDimitry Andric static bool isLdsDma(const MachineInstr &MI) {
17481ad6265SDimitry Andric   return SIInstrInfo::isVALU(MI) &&
17581ad6265SDimitry Andric          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
17681ad6265SDimitry Andric }
17781ad6265SDimitry Andric 
1780b57cec5SDimitry Andric static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
1790b57cec5SDimitry Andric   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
1800b57cec5SDimitry Andric                                                      AMDGPU::OpName::simm16);
181*0fca6ea1SDimitry Andric   return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
1820b57cec5SDimitry Andric }
1830b57cec5SDimitry Andric 
1840b57cec5SDimitry Andric ScheduleHazardRecognizer::HazardType
1850b57cec5SDimitry Andric GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
1860b57cec5SDimitry Andric   MachineInstr *MI = SU->getInstr();
187e8d8bef9SDimitry Andric   // If we are not in "HazardRecognizerMode" and therefore not being run from
188e8d8bef9SDimitry Andric   // the scheduler, track possible stalls from hazards but don't insert noops.
189e8d8bef9SDimitry Andric   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
190e8d8bef9SDimitry Andric 
1910b57cec5SDimitry Andric   if (MI->isBundle())
1920b57cec5SDimitry Andric    return NoHazard;
1930b57cec5SDimitry Andric 
1940b57cec5SDimitry Andric   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
195e8d8bef9SDimitry Andric     return HazardType;
1960b57cec5SDimitry Andric 
1970b57cec5SDimitry Andric   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
198e8d8bef9SDimitry Andric     return HazardType;
1990b57cec5SDimitry Andric 
2000b57cec5SDimitry Andric   if (checkFPAtomicToDenormModeHazard(MI) > 0)
201e8d8bef9SDimitry Andric     return HazardType;
2020b57cec5SDimitry Andric 
2030b57cec5SDimitry Andric   if (ST.hasNoDataDepHazard())
2040b57cec5SDimitry Andric     return NoHazard;
2050b57cec5SDimitry Andric 
206fe6060f1SDimitry Andric   // FIXME: Should flat be considered vmem?
207fe6060f1SDimitry Andric   if ((SIInstrInfo::isVMEM(*MI) ||
208fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI))
209fe6060f1SDimitry Andric       && checkVMEMHazards(MI) > 0)
210fe6060f1SDimitry Andric     return HazardType;
211fe6060f1SDimitry Andric 
2120b57cec5SDimitry Andric   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
213e8d8bef9SDimitry Andric     return HazardType;
2140b57cec5SDimitry Andric 
2150b57cec5SDimitry Andric   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
216e8d8bef9SDimitry Andric     return HazardType;
2170b57cec5SDimitry Andric 
2180b57cec5SDimitry Andric   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
219e8d8bef9SDimitry Andric     return HazardType;
2200b57cec5SDimitry Andric 
2210b57cec5SDimitry Andric   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
222e8d8bef9SDimitry Andric     return HazardType;
2230b57cec5SDimitry Andric 
224fe6060f1SDimitry Andric   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
225fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
226fe6060f1SDimitry Andric        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
227fe6060f1SDimitry Andric     return HazardType;
228fe6060f1SDimitry Andric 
2290b57cec5SDimitry Andric   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
230e8d8bef9SDimitry Andric     return HazardType;
2310b57cec5SDimitry Andric 
2320b57cec5SDimitry Andric   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
233e8d8bef9SDimitry Andric     return HazardType;
2340b57cec5SDimitry Andric 
2350b57cec5SDimitry Andric   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
236e8d8bef9SDimitry Andric     return HazardType;
2370b57cec5SDimitry Andric 
23881ad6265SDimitry Andric   if (((ST.hasReadM0MovRelInterpHazard() &&
239bdd1243dSDimitry Andric         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
240bdd1243dSDimitry Andric          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
241bdd1243dSDimitry Andric          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
24281ad6265SDimitry Andric        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
24381ad6265SDimitry Andric        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
24481ad6265SDimitry Andric        (ST.hasReadM0LdsDirectHazard() &&
245*0fca6ea1SDimitry Andric         MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
2460b57cec5SDimitry Andric       checkReadM0Hazards(MI) > 0)
247e8d8bef9SDimitry Andric     return HazardType;
2480b57cec5SDimitry Andric 
2490b57cec5SDimitry Andric   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
250e8d8bef9SDimitry Andric     return HazardType;
2510b57cec5SDimitry Andric 
252e8d8bef9SDimitry Andric   if ((SIInstrInfo::isVMEM(*MI) ||
253e8d8bef9SDimitry Andric        SIInstrInfo::isFLAT(*MI) ||
254e8d8bef9SDimitry Andric        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
255e8d8bef9SDimitry Andric     return HazardType;
2560b57cec5SDimitry Andric 
2570b57cec5SDimitry Andric   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
258e8d8bef9SDimitry Andric     return HazardType;
2590b57cec5SDimitry Andric 
2600b57cec5SDimitry Andric   return NoHazard;
2610b57cec5SDimitry Andric }
2620b57cec5SDimitry Andric 
263e8d8bef9SDimitry Andric static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
264e8d8bef9SDimitry Andric                                 unsigned Quantity) {
265e8d8bef9SDimitry Andric   while (Quantity > 0) {
266e8d8bef9SDimitry Andric     unsigned Arg = std::min(Quantity, 8u);
267e8d8bef9SDimitry Andric     Quantity -= Arg;
2680b57cec5SDimitry Andric     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
269e8d8bef9SDimitry Andric         .addImm(Arg - 1);
270e8d8bef9SDimitry Andric   }
2710b57cec5SDimitry Andric }
2720b57cec5SDimitry Andric 
27381ad6265SDimitry Andric unsigned
27481ad6265SDimitry Andric GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
27581ad6265SDimitry Andric   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
27681ad6265SDimitry Andric   assert(TSchedModel.getWriteProcResBegin(SC) !=
27781ad6265SDimitry Andric          TSchedModel.getWriteProcResEnd(SC));
2785f757f3fSDimitry Andric   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
27981ad6265SDimitry Andric }
28081ad6265SDimitry Andric 
2810b57cec5SDimitry Andric void GCNHazardRecognizer::processBundle() {
2820b57cec5SDimitry Andric   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
2830b57cec5SDimitry Andric   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
2840b57cec5SDimitry Andric   // Check bundled MachineInstr's for hazards.
2850b57cec5SDimitry Andric   for (; MI != E && MI->isInsideBundle(); ++MI) {
2860b57cec5SDimitry Andric     CurrCycleInstr = &*MI;
2870b57cec5SDimitry Andric     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
2880b57cec5SDimitry Andric 
289e8d8bef9SDimitry Andric     if (IsHazardRecognizerMode) {
2900b57cec5SDimitry Andric       fixHazards(CurrCycleInstr);
2910b57cec5SDimitry Andric 
292e8d8bef9SDimitry Andric       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
293e8d8bef9SDimitry Andric     }
2940b57cec5SDimitry Andric 
2950b57cec5SDimitry Andric     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
2960b57cec5SDimitry Andric     // include the bundled MI directly after, only add a maximum of
2970b57cec5SDimitry Andric     // (MaxLookAhead - 1) noops to EmittedInstrs.
2980b57cec5SDimitry Andric     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
2990b57cec5SDimitry Andric       EmittedInstrs.push_front(nullptr);
3000b57cec5SDimitry Andric 
3010b57cec5SDimitry Andric     EmittedInstrs.push_front(CurrCycleInstr);
3020b57cec5SDimitry Andric     EmittedInstrs.resize(MaxLookAhead);
3030b57cec5SDimitry Andric   }
3040b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
3050b57cec5SDimitry Andric }
3060b57cec5SDimitry Andric 
307bdd1243dSDimitry Andric void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
308bdd1243dSDimitry Andric   assert(IsHazardRecognizerMode);
309bdd1243dSDimitry Andric 
310bdd1243dSDimitry Andric   unsigned NumPreNoops = PreEmitNoops(MI);
311bdd1243dSDimitry Andric   EmitNoops(NumPreNoops);
312bdd1243dSDimitry Andric   if (MI->isInsideBundle())
313bdd1243dSDimitry Andric     insertNoopsInBundle(MI, TII, NumPreNoops);
314bdd1243dSDimitry Andric   else
315bdd1243dSDimitry Andric     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
316bdd1243dSDimitry Andric                     NumPreNoops);
317bdd1243dSDimitry Andric   EmitInstruction(MI);
318bdd1243dSDimitry Andric   AdvanceCycle();
319bdd1243dSDimitry Andric }
320bdd1243dSDimitry Andric 
3210b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
3220b57cec5SDimitry Andric   IsHazardRecognizerMode = true;
3230b57cec5SDimitry Andric   CurrCycleInstr = MI;
3240b57cec5SDimitry Andric   unsigned W = PreEmitNoopsCommon(MI);
3250b57cec5SDimitry Andric   fixHazards(MI);
3260b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
3270b57cec5SDimitry Andric   return W;
3280b57cec5SDimitry Andric }
3290b57cec5SDimitry Andric 
3300b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
3310b57cec5SDimitry Andric   if (MI->isBundle())
3320b57cec5SDimitry Andric     return 0;
3330b57cec5SDimitry Andric 
334e8d8bef9SDimitry Andric   int WaitStates = 0;
3350b57cec5SDimitry Andric 
3360b57cec5SDimitry Andric   if (SIInstrInfo::isSMRD(*MI))
3370b57cec5SDimitry Andric     return std::max(WaitStates, checkSMRDHazards(MI));
3380b57cec5SDimitry Andric 
3390b57cec5SDimitry Andric   if (ST.hasNSAtoVMEMBug())
3400b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
3410b57cec5SDimitry Andric 
3420b57cec5SDimitry Andric   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
3430b57cec5SDimitry Andric 
3440b57cec5SDimitry Andric   if (ST.hasNoDataDepHazard())
3450b57cec5SDimitry Andric     return WaitStates;
3460b57cec5SDimitry Andric 
347fe6060f1SDimitry Andric   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
348fe6060f1SDimitry Andric     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
349fe6060f1SDimitry Andric 
3500b57cec5SDimitry Andric   if (SIInstrInfo::isVALU(*MI))
3510b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
3520b57cec5SDimitry Andric 
3530b57cec5SDimitry Andric   if (SIInstrInfo::isDPP(*MI))
3540b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
3550b57cec5SDimitry Andric 
3560b57cec5SDimitry Andric   if (isDivFMas(MI->getOpcode()))
3570b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
3580b57cec5SDimitry Andric 
3590b57cec5SDimitry Andric   if (isRWLane(MI->getOpcode()))
3600b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
3610b57cec5SDimitry Andric 
362fe6060f1SDimitry Andric   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
363fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
364fe6060f1SDimitry Andric        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
365fe6060f1SDimitry Andric     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
366fe6060f1SDimitry Andric 
3670b57cec5SDimitry Andric   if (MI->isInlineAsm())
3680b57cec5SDimitry Andric     return std::max(WaitStates, checkInlineAsmHazards(MI));
3690b57cec5SDimitry Andric 
3700b57cec5SDimitry Andric   if (isSGetReg(MI->getOpcode()))
3710b57cec5SDimitry Andric     return std::max(WaitStates, checkGetRegHazards(MI));
3720b57cec5SDimitry Andric 
3730b57cec5SDimitry Andric   if (isSSetReg(MI->getOpcode()))
3740b57cec5SDimitry Andric     return std::max(WaitStates, checkSetRegHazards(MI));
3750b57cec5SDimitry Andric 
3760b57cec5SDimitry Andric   if (isRFE(MI->getOpcode()))
3770b57cec5SDimitry Andric     return std::max(WaitStates, checkRFEHazards(MI));
3780b57cec5SDimitry Andric 
37981ad6265SDimitry Andric   if ((ST.hasReadM0MovRelInterpHazard() &&
380bdd1243dSDimitry Andric        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
381bdd1243dSDimitry Andric         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
382bdd1243dSDimitry Andric         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
38381ad6265SDimitry Andric       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
38481ad6265SDimitry Andric       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
385*0fca6ea1SDimitry Andric       (ST.hasReadM0LdsDirectHazard() &&
386*0fca6ea1SDimitry Andric        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
3870b57cec5SDimitry Andric     return std::max(WaitStates, checkReadM0Hazards(MI));
3880b57cec5SDimitry Andric 
3890b57cec5SDimitry Andric   if (SIInstrInfo::isMAI(*MI))
3900b57cec5SDimitry Andric     return std::max(WaitStates, checkMAIHazards(MI));
3910b57cec5SDimitry Andric 
392e8d8bef9SDimitry Andric   if (SIInstrInfo::isVMEM(*MI) ||
393e8d8bef9SDimitry Andric       SIInstrInfo::isFLAT(*MI) ||
394e8d8bef9SDimitry Andric       SIInstrInfo::isDS(*MI))
3950b57cec5SDimitry Andric     return std::max(WaitStates, checkMAILdStHazards(MI));
3960b57cec5SDimitry Andric 
3970b57cec5SDimitry Andric   return WaitStates;
3980b57cec5SDimitry Andric }
3990b57cec5SDimitry Andric 
4000b57cec5SDimitry Andric void GCNHazardRecognizer::EmitNoop() {
4010b57cec5SDimitry Andric   EmittedInstrs.push_front(nullptr);
4020b57cec5SDimitry Andric }
4030b57cec5SDimitry Andric 
4040b57cec5SDimitry Andric void GCNHazardRecognizer::AdvanceCycle() {
4050b57cec5SDimitry Andric   // When the scheduler detects a stall, it will call AdvanceCycle() without
4060b57cec5SDimitry Andric   // emitting any instructions.
407e8d8bef9SDimitry Andric   if (!CurrCycleInstr) {
408e8d8bef9SDimitry Andric     EmittedInstrs.push_front(nullptr);
4090b57cec5SDimitry Andric     return;
410e8d8bef9SDimitry Andric   }
4110b57cec5SDimitry Andric 
4120b57cec5SDimitry Andric   if (CurrCycleInstr->isBundle()) {
4130b57cec5SDimitry Andric     processBundle();
4140b57cec5SDimitry Andric     return;
4150b57cec5SDimitry Andric   }
4160b57cec5SDimitry Andric 
4170b57cec5SDimitry Andric   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
418349cc55cSDimitry Andric   if (!NumWaitStates) {
419349cc55cSDimitry Andric     CurrCycleInstr = nullptr;
420349cc55cSDimitry Andric     return;
421349cc55cSDimitry Andric   }
4220b57cec5SDimitry Andric 
4230b57cec5SDimitry Andric   // Keep track of emitted instructions
4240b57cec5SDimitry Andric   EmittedInstrs.push_front(CurrCycleInstr);
4250b57cec5SDimitry Andric 
4260b57cec5SDimitry Andric   // Add a nullptr for each additional wait state after the first.  Make sure
4270b57cec5SDimitry Andric   // not to add more than getMaxLookAhead() items to the list, since we
4280b57cec5SDimitry Andric   // truncate the list to that size right after this loop.
4290b57cec5SDimitry Andric   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
4300b57cec5SDimitry Andric        i < e; ++i) {
4310b57cec5SDimitry Andric     EmittedInstrs.push_front(nullptr);
4320b57cec5SDimitry Andric   }
4330b57cec5SDimitry Andric 
4340b57cec5SDimitry Andric   // getMaxLookahead() is the largest number of wait states we will ever need
4350b57cec5SDimitry Andric   // to insert, so there is no point in keeping track of more than that many
4360b57cec5SDimitry Andric   // wait states.
4370b57cec5SDimitry Andric   EmittedInstrs.resize(getMaxLookAhead());
4380b57cec5SDimitry Andric 
4390b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
4400b57cec5SDimitry Andric }
4410b57cec5SDimitry Andric 
4420b57cec5SDimitry Andric void GCNHazardRecognizer::RecedeCycle() {
4430b57cec5SDimitry Andric   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
4440b57cec5SDimitry Andric }
4450b57cec5SDimitry Andric 
4460b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4470b57cec5SDimitry Andric // Helper Functions
4480b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4490b57cec5SDimitry Andric 
450*0fca6ea1SDimitry Andric using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
45181ad6265SDimitry Andric 
452*0fca6ea1SDimitry Andric using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
453*0fca6ea1SDimitry Andric using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
45481ad6265SDimitry Andric 
45581ad6265SDimitry Andric // Search for a hazard in a block and its predecessors.
45681ad6265SDimitry Andric template <typename StateT>
45781ad6265SDimitry Andric static bool
45881ad6265SDimitry Andric hasHazard(StateT State,
45981ad6265SDimitry Andric           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
46081ad6265SDimitry Andric           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
46181ad6265SDimitry Andric           const MachineBasicBlock *MBB,
46281ad6265SDimitry Andric           MachineBasicBlock::const_reverse_instr_iterator I,
46381ad6265SDimitry Andric           DenseSet<const MachineBasicBlock *> &Visited) {
46481ad6265SDimitry Andric   for (auto E = MBB->instr_rend(); I != E; ++I) {
46581ad6265SDimitry Andric     // No need to look at parent BUNDLE instructions.
46681ad6265SDimitry Andric     if (I->isBundle())
46781ad6265SDimitry Andric       continue;
46881ad6265SDimitry Andric 
46981ad6265SDimitry Andric     switch (IsHazard(State, *I)) {
47081ad6265SDimitry Andric     case HazardFound:
47181ad6265SDimitry Andric       return true;
47281ad6265SDimitry Andric     case HazardExpired:
47381ad6265SDimitry Andric       return false;
47481ad6265SDimitry Andric     default:
47581ad6265SDimitry Andric       // Continue search
47681ad6265SDimitry Andric       break;
47781ad6265SDimitry Andric     }
47881ad6265SDimitry Andric 
47981ad6265SDimitry Andric     if (I->isInlineAsm() || I->isMetaInstruction())
48081ad6265SDimitry Andric       continue;
48181ad6265SDimitry Andric 
48281ad6265SDimitry Andric     UpdateState(State, *I);
48381ad6265SDimitry Andric   }
48481ad6265SDimitry Andric 
48581ad6265SDimitry Andric   for (MachineBasicBlock *Pred : MBB->predecessors()) {
48681ad6265SDimitry Andric     if (!Visited.insert(Pred).second)
48781ad6265SDimitry Andric       continue;
48881ad6265SDimitry Andric 
48981ad6265SDimitry Andric     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
49081ad6265SDimitry Andric                   Visited))
49181ad6265SDimitry Andric       return true;
49281ad6265SDimitry Andric   }
49381ad6265SDimitry Andric 
49481ad6265SDimitry Andric   return false;
49581ad6265SDimitry Andric }
4960b57cec5SDimitry Andric 
4970b57cec5SDimitry Andric // Returns a minimum wait states since \p I walking all predecessors.
4980b57cec5SDimitry Andric // Only scans until \p IsExpired does not return true.
4990b57cec5SDimitry Andric // Can only be run in a hazard recognizer mode.
50081ad6265SDimitry Andric static int getWaitStatesSince(
50181ad6265SDimitry Andric     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
50281ad6265SDimitry Andric     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
50381ad6265SDimitry Andric     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
50481ad6265SDimitry Andric     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
5050b57cec5SDimitry Andric   for (auto E = MBB->instr_rend(); I != E; ++I) {
5060b57cec5SDimitry Andric     // Don't add WaitStates for parent BUNDLE instructions.
5070b57cec5SDimitry Andric     if (I->isBundle())
5080b57cec5SDimitry Andric       continue;
5090b57cec5SDimitry Andric 
510fe6060f1SDimitry Andric     if (IsHazard(*I))
5110b57cec5SDimitry Andric       return WaitStates;
5120b57cec5SDimitry Andric 
513349cc55cSDimitry Andric     if (I->isInlineAsm())
5140b57cec5SDimitry Andric       continue;
5150b57cec5SDimitry Andric 
51681ad6265SDimitry Andric     WaitStates += GetNumWaitStates(*I);
5170b57cec5SDimitry Andric 
518fe6060f1SDimitry Andric     if (IsExpired(*I, WaitStates))
5190b57cec5SDimitry Andric       return std::numeric_limits<int>::max();
5200b57cec5SDimitry Andric   }
5210b57cec5SDimitry Andric 
522fe6060f1SDimitry Andric   int MinWaitStates = std::numeric_limits<int>::max();
5230b57cec5SDimitry Andric   for (MachineBasicBlock *Pred : MBB->predecessors()) {
5240b57cec5SDimitry Andric     if (!Visited.insert(Pred).second)
5250b57cec5SDimitry Andric       continue;
5260b57cec5SDimitry Andric 
52781ad6265SDimitry Andric     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
52881ad6265SDimitry Andric                                IsExpired, Visited, GetNumWaitStates);
5290b57cec5SDimitry Andric 
530fe6060f1SDimitry Andric     MinWaitStates = std::min(MinWaitStates, W);
5310b57cec5SDimitry Andric   }
5320b57cec5SDimitry Andric 
5330b57cec5SDimitry Andric   return MinWaitStates;
5340b57cec5SDimitry Andric }
5350b57cec5SDimitry Andric 
5360b57cec5SDimitry Andric static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
537fe6060f1SDimitry Andric                               const MachineInstr *MI, IsExpiredFn IsExpired) {
5380b57cec5SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
5390b57cec5SDimitry Andric   return getWaitStatesSince(IsHazard, MI->getParent(),
5400b57cec5SDimitry Andric                             std::next(MI->getReverseIterator()),
5410b57cec5SDimitry Andric                             0, IsExpired, Visited);
5420b57cec5SDimitry Andric }
5430b57cec5SDimitry Andric 
5440b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
5450b57cec5SDimitry Andric   if (IsHazardRecognizerMode) {
546fe6060f1SDimitry Andric     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
5470b57cec5SDimitry Andric       return WaitStates >= Limit;
5480b57cec5SDimitry Andric     };
5490b57cec5SDimitry Andric     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
5500b57cec5SDimitry Andric   }
5510b57cec5SDimitry Andric 
5520b57cec5SDimitry Andric   int WaitStates = 0;
5530b57cec5SDimitry Andric   for (MachineInstr *MI : EmittedInstrs) {
5540b57cec5SDimitry Andric     if (MI) {
555fe6060f1SDimitry Andric       if (IsHazard(*MI))
5560b57cec5SDimitry Andric         return WaitStates;
5570b57cec5SDimitry Andric 
5580b57cec5SDimitry Andric       if (MI->isInlineAsm())
5590b57cec5SDimitry Andric         continue;
5600b57cec5SDimitry Andric     }
5610b57cec5SDimitry Andric     ++WaitStates;
5620b57cec5SDimitry Andric 
5630b57cec5SDimitry Andric     if (WaitStates >= Limit)
5640b57cec5SDimitry Andric       break;
5650b57cec5SDimitry Andric   }
5660b57cec5SDimitry Andric   return std::numeric_limits<int>::max();
5670b57cec5SDimitry Andric }
5680b57cec5SDimitry Andric 
5690b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
5700b57cec5SDimitry Andric                                                IsHazardFn IsHazardDef,
5710b57cec5SDimitry Andric                                                int Limit) {
5720b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
5730b57cec5SDimitry Andric 
574fe6060f1SDimitry Andric   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
575fe6060f1SDimitry Andric     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
5760b57cec5SDimitry Andric   };
5770b57cec5SDimitry Andric 
5780b57cec5SDimitry Andric   return getWaitStatesSince(IsHazardFn, Limit);
5790b57cec5SDimitry Andric }
5800b57cec5SDimitry Andric 
5810b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
5820b57cec5SDimitry Andric                                                   int Limit) {
583fe6060f1SDimitry Andric   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
584fe6060f1SDimitry Andric     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
5850b57cec5SDimitry Andric   };
5860b57cec5SDimitry Andric 
5870b57cec5SDimitry Andric   return getWaitStatesSince(IsHazardFn, Limit);
5880b57cec5SDimitry Andric }
5890b57cec5SDimitry Andric 
5900b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5910b57cec5SDimitry Andric // No-op Hazard Detection
5920b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5930b57cec5SDimitry Andric 
594e8d8bef9SDimitry Andric static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
595e8d8bef9SDimitry Andric                         MCRegister Reg) {
59606c3fb27SDimitry Andric   for (MCRegUnit Unit : TRI.regunits(Reg))
59706c3fb27SDimitry Andric     BV.set(Unit);
5980b57cec5SDimitry Andric }
5990b57cec5SDimitry Andric 
6000b57cec5SDimitry Andric static void addRegsToSet(const SIRegisterInfo &TRI,
6010b57cec5SDimitry Andric                          iterator_range<MachineInstr::const_mop_iterator> Ops,
60206c3fb27SDimitry Andric                          BitVector &DefSet, BitVector &UseSet) {
6030b57cec5SDimitry Andric   for (const MachineOperand &Op : Ops) {
6040b57cec5SDimitry Andric     if (Op.isReg())
60506c3fb27SDimitry Andric       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
6060b57cec5SDimitry Andric   }
6070b57cec5SDimitry Andric }
6080b57cec5SDimitry Andric 
6090b57cec5SDimitry Andric void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
61006c3fb27SDimitry Andric   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
6110b57cec5SDimitry Andric }
6120b57cec5SDimitry Andric 
6135ffd83dbSDimitry Andric static bool breaksSMEMSoftClause(MachineInstr *MI) {
6145ffd83dbSDimitry Andric   return !SIInstrInfo::isSMRD(*MI);
6155ffd83dbSDimitry Andric }
6165ffd83dbSDimitry Andric 
6175ffd83dbSDimitry Andric static bool breaksVMEMSoftClause(MachineInstr *MI) {
6185ffd83dbSDimitry Andric   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
6195ffd83dbSDimitry Andric }
6205ffd83dbSDimitry Andric 
6210b57cec5SDimitry Andric int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
6220b57cec5SDimitry Andric   // SMEM soft clause are only present on VI+, and only matter if xnack is
6230b57cec5SDimitry Andric   // enabled.
6240b57cec5SDimitry Andric   if (!ST.isXNACKEnabled())
6250b57cec5SDimitry Andric     return 0;
6260b57cec5SDimitry Andric 
6270b57cec5SDimitry Andric   bool IsSMRD = TII.isSMRD(*MEM);
6280b57cec5SDimitry Andric 
6290b57cec5SDimitry Andric   resetClause();
6300b57cec5SDimitry Andric 
6310b57cec5SDimitry Andric   // A soft-clause is any group of consecutive SMEM instructions.  The
6320b57cec5SDimitry Andric   // instructions in this group may return out of order and/or may be
6330b57cec5SDimitry Andric   // replayed (i.e. the same instruction issued more than once).
6340b57cec5SDimitry Andric   //
6350b57cec5SDimitry Andric   // In order to handle these situations correctly we need to make sure that
6360b57cec5SDimitry Andric   // when a clause has more than one instruction, no instruction in the clause
6370b57cec5SDimitry Andric   // writes to a register that is read by another instruction in the clause
63881ad6265SDimitry Andric   // (including itself). If we encounter this situation, we need to break the
6390b57cec5SDimitry Andric   // clause by inserting a non SMEM instruction.
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   for (MachineInstr *MI : EmittedInstrs) {
6420b57cec5SDimitry Andric     // When we hit a non-SMEM instruction then we have passed the start of the
6430b57cec5SDimitry Andric     // clause and we can stop.
6440b57cec5SDimitry Andric     if (!MI)
6450b57cec5SDimitry Andric       break;
6460b57cec5SDimitry Andric 
6475ffd83dbSDimitry Andric     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
6480b57cec5SDimitry Andric       break;
6490b57cec5SDimitry Andric 
6500b57cec5SDimitry Andric     addClauseInst(*MI);
6510b57cec5SDimitry Andric   }
6520b57cec5SDimitry Andric 
6530b57cec5SDimitry Andric   if (ClauseDefs.none())
6540b57cec5SDimitry Andric     return 0;
6550b57cec5SDimitry Andric 
6560b57cec5SDimitry Andric   // We need to make sure not to put loads and stores in the same clause if they
6570b57cec5SDimitry Andric   // use the same address. For now, just start a new clause whenever we see a
6580b57cec5SDimitry Andric   // store.
6590b57cec5SDimitry Andric   if (MEM->mayStore())
6600b57cec5SDimitry Andric     return 1;
6610b57cec5SDimitry Andric 
6620b57cec5SDimitry Andric   addClauseInst(*MEM);
6630b57cec5SDimitry Andric 
6640b57cec5SDimitry Andric   // If the set of defs and uses intersect then we cannot add this instruction
6650b57cec5SDimitry Andric   // to the clause, so we have a hazard.
6660b57cec5SDimitry Andric   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
6670b57cec5SDimitry Andric }
6680b57cec5SDimitry Andric 
6690b57cec5SDimitry Andric int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
6700b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
6710b57cec5SDimitry Andric 
6720b57cec5SDimitry Andric   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
6730b57cec5SDimitry Andric 
6740b57cec5SDimitry Andric   // This SMRD hazard only affects SI.
6750b57cec5SDimitry Andric   if (!ST.hasSMRDReadVALUDefHazard())
6760b57cec5SDimitry Andric     return WaitStatesNeeded;
6770b57cec5SDimitry Andric 
6780b57cec5SDimitry Andric   // A read of an SGPR by SMRD instruction requires 4 wait states when the
6790b57cec5SDimitry Andric   // SGPR was written by a VALU instruction.
6800b57cec5SDimitry Andric   int SmrdSgprWaitStates = 4;
681fe6060f1SDimitry Andric   auto IsHazardDefFn = [this](const MachineInstr &MI) {
682fe6060f1SDimitry Andric     return TII.isVALU(MI);
683fe6060f1SDimitry Andric   };
684fe6060f1SDimitry Andric   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
685fe6060f1SDimitry Andric     return TII.isSALU(MI);
686fe6060f1SDimitry Andric   };
6870b57cec5SDimitry Andric 
6880b57cec5SDimitry Andric   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
6890b57cec5SDimitry Andric 
6900b57cec5SDimitry Andric   for (const MachineOperand &Use : SMRD->uses()) {
6910b57cec5SDimitry Andric     if (!Use.isReg())
6920b57cec5SDimitry Andric       continue;
6930b57cec5SDimitry Andric     int WaitStatesNeededForUse =
6940b57cec5SDimitry Andric         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
6950b57cec5SDimitry Andric                                                    SmrdSgprWaitStates);
6960b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
6970b57cec5SDimitry Andric 
6980b57cec5SDimitry Andric     // This fixes what appears to be undocumented hardware behavior in SI where
6990b57cec5SDimitry Andric     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
7000b57cec5SDimitry Andric     // needs some number of nops in between. We don't know how many we need, but
7010b57cec5SDimitry Andric     // let's use 4. This wasn't discovered before probably because the only
7020b57cec5SDimitry Andric     // case when this happens is when we expand a 64-bit pointer into a full
7030b57cec5SDimitry Andric     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
7040b57cec5SDimitry Andric     // probably never encountered in the closed-source land.
7050b57cec5SDimitry Andric     if (IsBufferSMRD) {
7060b57cec5SDimitry Andric       int WaitStatesNeededForUse =
7070b57cec5SDimitry Andric         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
7080b57cec5SDimitry Andric                                                    IsBufferHazardDefFn,
7090b57cec5SDimitry Andric                                                    SmrdSgprWaitStates);
7100b57cec5SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7110b57cec5SDimitry Andric     }
7120b57cec5SDimitry Andric   }
7130b57cec5SDimitry Andric 
7140b57cec5SDimitry Andric   return WaitStatesNeeded;
7150b57cec5SDimitry Andric }
7160b57cec5SDimitry Andric 
7170b57cec5SDimitry Andric int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
7180b57cec5SDimitry Andric   if (!ST.hasVMEMReadSGPRVALUDefHazard())
7190b57cec5SDimitry Andric     return 0;
7200b57cec5SDimitry Andric 
7210b57cec5SDimitry Andric   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
7220b57cec5SDimitry Andric 
7230b57cec5SDimitry Andric   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
7240b57cec5SDimitry Andric   // SGPR was written by a VALU Instruction.
7250b57cec5SDimitry Andric   const int VmemSgprWaitStates = 5;
726fe6060f1SDimitry Andric   auto IsHazardDefFn = [this](const MachineInstr &MI) {
727fe6060f1SDimitry Andric     return TII.isVALU(MI);
728fe6060f1SDimitry Andric   };
7290b57cec5SDimitry Andric   for (const MachineOperand &Use : VMEM->uses()) {
730fe6060f1SDimitry Andric     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
7310b57cec5SDimitry Andric       continue;
7320b57cec5SDimitry Andric 
7330b57cec5SDimitry Andric     int WaitStatesNeededForUse =
7340b57cec5SDimitry Andric         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
7350b57cec5SDimitry Andric                                                    VmemSgprWaitStates);
7360b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7370b57cec5SDimitry Andric   }
7380b57cec5SDimitry Andric   return WaitStatesNeeded;
7390b57cec5SDimitry Andric }
7400b57cec5SDimitry Andric 
7410b57cec5SDimitry Andric int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
7420b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
7430b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7440b57cec5SDimitry Andric 
7450b57cec5SDimitry Andric   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
7460b57cec5SDimitry Andric   int DppVgprWaitStates = 2;
7470b57cec5SDimitry Andric   int DppExecWaitStates = 5;
7480b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
749fe6060f1SDimitry Andric   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
750fe6060f1SDimitry Andric     return TII->isVALU(MI);
751fe6060f1SDimitry Andric   };
7520b57cec5SDimitry Andric 
7530b57cec5SDimitry Andric   for (const MachineOperand &Use : DPP->uses()) {
7540b57cec5SDimitry Andric     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
7550b57cec5SDimitry Andric       continue;
7560b57cec5SDimitry Andric     int WaitStatesNeededForUse =
757fe6060f1SDimitry Andric         DppVgprWaitStates - getWaitStatesSinceDef(
758fe6060f1SDimitry Andric                                 Use.getReg(),
759fe6060f1SDimitry Andric                                 [](const MachineInstr &) { return true; },
7600b57cec5SDimitry Andric                                 DppVgprWaitStates);
7610b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7620b57cec5SDimitry Andric   }
7630b57cec5SDimitry Andric 
7640b57cec5SDimitry Andric   WaitStatesNeeded = std::max(
7650b57cec5SDimitry Andric       WaitStatesNeeded,
7660b57cec5SDimitry Andric       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
7670b57cec5SDimitry Andric                                                 DppExecWaitStates));
7680b57cec5SDimitry Andric 
7690b57cec5SDimitry Andric   return WaitStatesNeeded;
7700b57cec5SDimitry Andric }
7710b57cec5SDimitry Andric 
7720b57cec5SDimitry Andric int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
7730b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7740b57cec5SDimitry Andric 
7750b57cec5SDimitry Andric   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
7760b57cec5SDimitry Andric   // instruction.
7770b57cec5SDimitry Andric   const int DivFMasWaitStates = 4;
778fe6060f1SDimitry Andric   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
779fe6060f1SDimitry Andric     return TII->isVALU(MI);
780fe6060f1SDimitry Andric   };
7810b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
7820b57cec5SDimitry Andric                                                DivFMasWaitStates);
7830b57cec5SDimitry Andric 
7840b57cec5SDimitry Andric   return DivFMasWaitStates - WaitStatesNeeded;
7850b57cec5SDimitry Andric }
7860b57cec5SDimitry Andric 
7870b57cec5SDimitry Andric int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
7880b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7890b57cec5SDimitry Andric   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
7900b57cec5SDimitry Andric 
7910b57cec5SDimitry Andric   const int GetRegWaitStates = 2;
792fe6060f1SDimitry Andric   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
793fe6060f1SDimitry Andric     return GetRegHWReg == getHWReg(TII, MI);
7940b57cec5SDimitry Andric   };
7950b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
7960b57cec5SDimitry Andric 
7970b57cec5SDimitry Andric   return GetRegWaitStates - WaitStatesNeeded;
7980b57cec5SDimitry Andric }
7990b57cec5SDimitry Andric 
8000b57cec5SDimitry Andric int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
8010b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
8020b57cec5SDimitry Andric   unsigned HWReg = getHWReg(TII, *SetRegInstr);
8030b57cec5SDimitry Andric 
8040b57cec5SDimitry Andric   const int SetRegWaitStates = ST.getSetRegWaitStates();
805fe6060f1SDimitry Andric   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
806fe6060f1SDimitry Andric     return HWReg == getHWReg(TII, MI);
8070b57cec5SDimitry Andric   };
8080b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
8090b57cec5SDimitry Andric   return SetRegWaitStates - WaitStatesNeeded;
8100b57cec5SDimitry Andric }
8110b57cec5SDimitry Andric 
8120b57cec5SDimitry Andric int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
8130b57cec5SDimitry Andric   if (!MI.mayStore())
8140b57cec5SDimitry Andric     return -1;
8150b57cec5SDimitry Andric 
8160b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
8170b57cec5SDimitry Andric   unsigned Opcode = MI.getOpcode();
8180b57cec5SDimitry Andric   const MCInstrDesc &Desc = MI.getDesc();
8190b57cec5SDimitry Andric 
8200b57cec5SDimitry Andric   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
8210b57cec5SDimitry Andric   int VDataRCID = -1;
8220b57cec5SDimitry Andric   if (VDataIdx != -1)
823bdd1243dSDimitry Andric     VDataRCID = Desc.operands()[VDataIdx].RegClass;
8240b57cec5SDimitry Andric 
8250b57cec5SDimitry Andric   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
8260b57cec5SDimitry Andric     // There is no hazard if the instruction does not use vector regs
8270b57cec5SDimitry Andric     // (like wbinvl1)
8280b57cec5SDimitry Andric     if (VDataIdx == -1)
8290b57cec5SDimitry Andric       return -1;
8300b57cec5SDimitry Andric     // For MUBUF/MTBUF instructions this hazard only exists if the
8310b57cec5SDimitry Andric     // instruction is not using a register in the soffset field.
8320b57cec5SDimitry Andric     const MachineOperand *SOffset =
8330b57cec5SDimitry Andric         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
8340b57cec5SDimitry Andric     // If we have no soffset operand, then assume this field has been
8350b57cec5SDimitry Andric     // hardcoded to zero.
8360b57cec5SDimitry Andric     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
8370b57cec5SDimitry Andric         (!SOffset || !SOffset->isReg()))
8380b57cec5SDimitry Andric       return VDataIdx;
8390b57cec5SDimitry Andric   }
8400b57cec5SDimitry Andric 
8410b57cec5SDimitry Andric   // MIMG instructions create a hazard if they don't use a 256-bit T# and
8420b57cec5SDimitry Andric   // the store size is greater than 8 bytes and they have more than two bits
8430b57cec5SDimitry Andric   // of their dmask set.
8440b57cec5SDimitry Andric   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
8450b57cec5SDimitry Andric   if (TII->isMIMG(MI)) {
8460b57cec5SDimitry Andric     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
8470b57cec5SDimitry Andric     assert(SRsrcIdx != -1 &&
848bdd1243dSDimitry Andric            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
8490b57cec5SDimitry Andric     (void)SRsrcIdx;
8500b57cec5SDimitry Andric   }
8510b57cec5SDimitry Andric 
8520b57cec5SDimitry Andric   if (TII->isFLAT(MI)) {
8530b57cec5SDimitry Andric     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
854bdd1243dSDimitry Andric     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
8550b57cec5SDimitry Andric       return DataIdx;
8560b57cec5SDimitry Andric   }
8570b57cec5SDimitry Andric 
8580b57cec5SDimitry Andric   return -1;
8590b57cec5SDimitry Andric }
8600b57cec5SDimitry Andric 
861e8d8bef9SDimitry Andric int
862e8d8bef9SDimitry Andric GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
8630b57cec5SDimitry Andric                                             const MachineRegisterInfo &MRI) {
8640b57cec5SDimitry Andric   // Helper to check for the hazard where VMEM instructions that store more than
8650b57cec5SDimitry Andric   // 8 bytes can have there store data over written by the next instruction.
8660b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
8670b57cec5SDimitry Andric 
86881ad6265SDimitry Andric   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
8690b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
8700b57cec5SDimitry Andric 
871fe6060f1SDimitry Andric   if (!TRI->isVectorRegister(MRI, Def.getReg()))
8720b57cec5SDimitry Andric     return WaitStatesNeeded;
8738bcb0991SDimitry Andric   Register Reg = Def.getReg();
874fe6060f1SDimitry Andric   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
875fe6060f1SDimitry Andric     int DataIdx = createsVALUHazard(MI);
8760b57cec5SDimitry Andric     return DataIdx >= 0 &&
877fe6060f1SDimitry Andric            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
8780b57cec5SDimitry Andric   };
8790b57cec5SDimitry Andric   int WaitStatesNeededForDef =
8800b57cec5SDimitry Andric     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
8810b57cec5SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
8820b57cec5SDimitry Andric 
8830b57cec5SDimitry Andric   return WaitStatesNeeded;
8840b57cec5SDimitry Andric }
8850b57cec5SDimitry Andric 
8860b57cec5SDimitry Andric int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
88781ad6265SDimitry Andric   int WaitStatesNeeded = 0;
88881ad6265SDimitry Andric 
88981ad6265SDimitry Andric   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
89081ad6265SDimitry Andric     const int TransDefWaitstates = 1;
89181ad6265SDimitry Andric 
89281ad6265SDimitry Andric     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
89381ad6265SDimitry Andric       if (!SIInstrInfo::isTRANS(MI))
89481ad6265SDimitry Andric         return false;
89581ad6265SDimitry Andric       const SIRegisterInfo *TRI = ST.getRegisterInfo();
89681ad6265SDimitry Andric       const SIInstrInfo *TII = ST.getInstrInfo();
89781ad6265SDimitry Andric       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
89881ad6265SDimitry Andric 
89981ad6265SDimitry Andric       for (const MachineOperand &Use : VALU->explicit_uses()) {
90081ad6265SDimitry Andric         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
90181ad6265SDimitry Andric           return true;
90281ad6265SDimitry Andric       }
90381ad6265SDimitry Andric 
90481ad6265SDimitry Andric       return false;
90581ad6265SDimitry Andric     };
90681ad6265SDimitry Andric 
90781ad6265SDimitry Andric     int WaitStatesNeededForDef =
90881ad6265SDimitry Andric         TransDefWaitstates -
90981ad6265SDimitry Andric         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
91081ad6265SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
91181ad6265SDimitry Andric   }
91281ad6265SDimitry Andric 
91381ad6265SDimitry Andric   if (ST.hasDstSelForwardingHazard()) {
91481ad6265SDimitry Andric     const int Shift16DefWaitstates = 1;
91581ad6265SDimitry Andric 
91681ad6265SDimitry Andric     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
91781ad6265SDimitry Andric       if (!SIInstrInfo::isVALU(MI))
91881ad6265SDimitry Andric         return false;
91981ad6265SDimitry Andric       const SIInstrInfo *TII = ST.getInstrInfo();
92081ad6265SDimitry Andric       if (SIInstrInfo::isSDWA(MI)) {
92181ad6265SDimitry Andric         if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
92281ad6265SDimitry Andric           if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
92381ad6265SDimitry Andric             return false;
92481ad6265SDimitry Andric       } else {
925bdd1243dSDimitry Andric         if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
92681ad6265SDimitry Andric             !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
92781ad6265SDimitry Andric                   ->getImm() &
92881ad6265SDimitry Andric               SISrcMods::DST_OP_SEL))
92981ad6265SDimitry Andric           return false;
93081ad6265SDimitry Andric       }
93181ad6265SDimitry Andric       const SIRegisterInfo *TRI = ST.getRegisterInfo();
93281ad6265SDimitry Andric       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
93381ad6265SDimitry Andric         Register Def = Dst->getReg();
93481ad6265SDimitry Andric 
93581ad6265SDimitry Andric         for (const MachineOperand &Use : VALU->explicit_uses()) {
93681ad6265SDimitry Andric           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
93781ad6265SDimitry Andric             return true;
93881ad6265SDimitry Andric         }
93981ad6265SDimitry Andric       }
94081ad6265SDimitry Andric 
94181ad6265SDimitry Andric       return false;
94281ad6265SDimitry Andric     };
94381ad6265SDimitry Andric 
94481ad6265SDimitry Andric     int WaitStatesNeededForDef =
94581ad6265SDimitry Andric         Shift16DefWaitstates -
94681ad6265SDimitry Andric         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
94781ad6265SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
94881ad6265SDimitry Andric   }
94981ad6265SDimitry Andric 
95081ad6265SDimitry Andric   if (ST.hasVDecCoExecHazard()) {
95181ad6265SDimitry Andric     const int VALUWriteSGPRVALUReadWaitstates = 2;
95281ad6265SDimitry Andric     const int VALUWriteEXECRWLane = 4;
95381ad6265SDimitry Andric     const int VALUWriteVGPRReadlaneRead = 1;
95481ad6265SDimitry Andric 
95581ad6265SDimitry Andric     const SIRegisterInfo *TRI = ST.getRegisterInfo();
95681ad6265SDimitry Andric     const MachineRegisterInfo &MRI = MF.getRegInfo();
95781ad6265SDimitry Andric     Register UseReg;
95881ad6265SDimitry Andric     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
95981ad6265SDimitry Andric       if (!SIInstrInfo::isVALU(MI))
96081ad6265SDimitry Andric         return false;
96181ad6265SDimitry Andric       return MI.modifiesRegister(UseReg, TRI);
96281ad6265SDimitry Andric     };
96381ad6265SDimitry Andric 
96481ad6265SDimitry Andric     for (const MachineOperand &Use : VALU->explicit_uses()) {
96581ad6265SDimitry Andric       if (!Use.isReg())
96681ad6265SDimitry Andric         continue;
96781ad6265SDimitry Andric 
96881ad6265SDimitry Andric       UseReg = Use.getReg();
96981ad6265SDimitry Andric       if (TRI->isSGPRReg(MRI, UseReg)) {
97081ad6265SDimitry Andric         int WaitStatesNeededForDef =
97181ad6265SDimitry Andric             VALUWriteSGPRVALUReadWaitstates -
97281ad6265SDimitry Andric             getWaitStatesSince(IsVALUDefSGPRFn,
97381ad6265SDimitry Andric                                VALUWriteSGPRVALUReadWaitstates);
97481ad6265SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
97581ad6265SDimitry Andric       }
97681ad6265SDimitry Andric     }
97781ad6265SDimitry Andric 
97881ad6265SDimitry Andric     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
97981ad6265SDimitry Andric       UseReg = AMDGPU::VCC;
98081ad6265SDimitry Andric       int WaitStatesNeededForDef =
98181ad6265SDimitry Andric           VALUWriteSGPRVALUReadWaitstates -
98281ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
98381ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
98481ad6265SDimitry Andric     }
98581ad6265SDimitry Andric 
98681ad6265SDimitry Andric     switch (VALU->getOpcode()) {
98781ad6265SDimitry Andric     case AMDGPU::V_READLANE_B32:
98881ad6265SDimitry Andric     case AMDGPU::V_READFIRSTLANE_B32: {
98981ad6265SDimitry Andric       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
99081ad6265SDimitry Andric       UseReg = Src->getReg();
99181ad6265SDimitry Andric       int WaitStatesNeededForDef =
99281ad6265SDimitry Andric           VALUWriteVGPRReadlaneRead -
99381ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
99481ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
99581ad6265SDimitry Andric     }
996bdd1243dSDimitry Andric       [[fallthrough]];
99781ad6265SDimitry Andric     case AMDGPU::V_WRITELANE_B32: {
99881ad6265SDimitry Andric       UseReg = AMDGPU::EXEC;
99981ad6265SDimitry Andric       int WaitStatesNeededForDef =
100081ad6265SDimitry Andric           VALUWriteEXECRWLane -
100181ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
100281ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
100381ad6265SDimitry Andric       break;
100481ad6265SDimitry Andric     }
100581ad6265SDimitry Andric     default:
100681ad6265SDimitry Andric       break;
100781ad6265SDimitry Andric     }
100881ad6265SDimitry Andric   }
100981ad6265SDimitry Andric 
10100b57cec5SDimitry Andric   // This checks for the hazard where VMEM instructions that store more than
10110b57cec5SDimitry Andric   // 8 bytes can have there store data over written by the next instruction.
10120b57cec5SDimitry Andric   if (!ST.has12DWordStoreHazard())
101381ad6265SDimitry Andric     return WaitStatesNeeded;
10140b57cec5SDimitry Andric 
10150b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10160b57cec5SDimitry Andric 
10170b57cec5SDimitry Andric   for (const MachineOperand &Def : VALU->defs()) {
10180b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
10190b57cec5SDimitry Andric   }
10200b57cec5SDimitry Andric 
10210b57cec5SDimitry Andric   return WaitStatesNeeded;
10220b57cec5SDimitry Andric }
10230b57cec5SDimitry Andric 
10240b57cec5SDimitry Andric int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10250b57cec5SDimitry Andric   // This checks for hazards associated with inline asm statements.
10260b57cec5SDimitry Andric   // Since inline asms can contain just about anything, we use this
10270b57cec5SDimitry Andric   // to call/leverage other check*Hazard routines. Note that
10280b57cec5SDimitry Andric   // this function doesn't attempt to address all possible inline asm
10290b57cec5SDimitry Andric   // hazards (good luck), but is a collection of what has been
10300b57cec5SDimitry Andric   // problematic thus far.
10310b57cec5SDimitry Andric 
10320b57cec5SDimitry Andric   // see checkVALUHazards()
10330b57cec5SDimitry Andric   if (!ST.has12DWordStoreHazard())
10340b57cec5SDimitry Andric     return 0;
10350b57cec5SDimitry Andric 
10360b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10370b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
10380b57cec5SDimitry Andric 
103906c3fb27SDimitry Andric   for (const MachineOperand &Op :
104006c3fb27SDimitry Andric        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
10410b57cec5SDimitry Andric     if (Op.isReg() && Op.isDef()) {
104206c3fb27SDimitry Andric       WaitStatesNeeded =
104306c3fb27SDimitry Andric           std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
10440b57cec5SDimitry Andric     }
10450b57cec5SDimitry Andric   }
10460b57cec5SDimitry Andric 
10470b57cec5SDimitry Andric   return WaitStatesNeeded;
10480b57cec5SDimitry Andric }
10490b57cec5SDimitry Andric 
10500b57cec5SDimitry Andric int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
10510b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
10520b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
10530b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10540b57cec5SDimitry Andric 
10550b57cec5SDimitry Andric   const MachineOperand *LaneSelectOp =
10560b57cec5SDimitry Andric       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
10570b57cec5SDimitry Andric 
10580b57cec5SDimitry Andric   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
10590b57cec5SDimitry Andric     return 0;
10600b57cec5SDimitry Andric 
10618bcb0991SDimitry Andric   Register LaneSelectReg = LaneSelectOp->getReg();
1062fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
10630b57cec5SDimitry Andric 
10640b57cec5SDimitry Andric   const int RWLaneWaitStates = 4;
10650b57cec5SDimitry Andric   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
10660b57cec5SDimitry Andric                                               RWLaneWaitStates);
10670b57cec5SDimitry Andric   return RWLaneWaitStates - WaitStatesSince;
10680b57cec5SDimitry Andric }
10690b57cec5SDimitry Andric 
10700b57cec5SDimitry Andric int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
10710b57cec5SDimitry Andric   if (!ST.hasRFEHazards())
10720b57cec5SDimitry Andric     return 0;
10730b57cec5SDimitry Andric 
10740b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
10750b57cec5SDimitry Andric 
10760b57cec5SDimitry Andric   const int RFEWaitStates = 1;
10770b57cec5SDimitry Andric 
1078fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) {
1079fe6060f1SDimitry Andric     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
10800b57cec5SDimitry Andric   };
10810b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
10820b57cec5SDimitry Andric   return RFEWaitStates - WaitStatesNeeded;
10830b57cec5SDimitry Andric }
10840b57cec5SDimitry Andric 
10850b57cec5SDimitry Andric int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
10860b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
108781ad6265SDimitry Andric   const int ReadM0WaitStates = 1;
1088fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
108981ad6265SDimitry Andric   return ReadM0WaitStates -
109081ad6265SDimitry Andric          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
10910b57cec5SDimitry Andric }
10920b57cec5SDimitry Andric 
10930b57cec5SDimitry Andric void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
10940b57cec5SDimitry Andric   fixVMEMtoScalarWriteHazards(MI);
10950b57cec5SDimitry Andric   fixVcmpxPermlaneHazards(MI);
10960b57cec5SDimitry Andric   fixSMEMtoVectorWriteHazards(MI);
10970b57cec5SDimitry Andric   fixVcmpxExecWARHazard(MI);
10980b57cec5SDimitry Andric   fixLdsBranchVmemWARHazard(MI);
109981ad6265SDimitry Andric   if (ST.hasLdsDirect()) {
110081ad6265SDimitry Andric     fixLdsDirectVALUHazard(MI);
110181ad6265SDimitry Andric     fixLdsDirectVMEMHazard(MI);
110281ad6265SDimitry Andric   }
110381ad6265SDimitry Andric   fixVALUPartialForwardingHazard(MI);
110481ad6265SDimitry Andric   fixVALUTransUseHazard(MI);
110581ad6265SDimitry Andric   fixWMMAHazards(MI);
1106bdd1243dSDimitry Andric   fixShift64HighRegBug(MI);
1107bdd1243dSDimitry Andric   fixVALUMaskWriteHazard(MI);
1108*0fca6ea1SDimitry Andric   fixRequiredExportPriority(MI);
11090b57cec5SDimitry Andric }
11100b57cec5SDimitry Andric 
11110b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
11120b57cec5SDimitry Andric   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
11130b57cec5SDimitry Andric     return false;
11140b57cec5SDimitry Andric 
11150b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
111681ad6265SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
111781ad6265SDimitry Andric   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
111881ad6265SDimitry Andric     return (TII->isVOPC(MI) ||
111981ad6265SDimitry Andric             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
112081ad6265SDimitry Andric            MI.modifiesRegister(AMDGPU::EXEC, TRI);
112181ad6265SDimitry Andric   };
11220b57cec5SDimitry Andric 
1123fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1124fe6060f1SDimitry Andric     unsigned Opc = MI.getOpcode();
1125fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1126fe6060f1SDimitry Andric            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
11270b57cec5SDimitry Andric   };
11280b57cec5SDimitry Andric 
11290b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11300b57cec5SDimitry Andric       std::numeric_limits<int>::max())
11310b57cec5SDimitry Andric     return false;
11320b57cec5SDimitry Andric 
11330b57cec5SDimitry Andric   // V_NOP will be discarded by SQ.
113481ad6265SDimitry Andric   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
11350b57cec5SDimitry Andric   // which is always a VGPR and available.
11360b57cec5SDimitry Andric   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
11378bcb0991SDimitry Andric   Register Reg = Src0->getReg();
11380b57cec5SDimitry Andric   bool IsUndef = Src0->isUndef();
11390b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
11400b57cec5SDimitry Andric           TII->get(AMDGPU::V_MOV_B32_e32))
11410b57cec5SDimitry Andric     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
11420b57cec5SDimitry Andric     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
11430b57cec5SDimitry Andric 
11440b57cec5SDimitry Andric   return true;
11450b57cec5SDimitry Andric }
11460b57cec5SDimitry Andric 
11470b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
11480b57cec5SDimitry Andric   if (!ST.hasVMEMtoScalarWriteHazard())
11490b57cec5SDimitry Andric     return false;
11507a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
11510b57cec5SDimitry Andric 
11520b57cec5SDimitry Andric   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
11530b57cec5SDimitry Andric     return false;
11540b57cec5SDimitry Andric 
11550b57cec5SDimitry Andric   if (MI->getNumDefs() == 0)
11560b57cec5SDimitry Andric     return false;
11570b57cec5SDimitry Andric 
11580b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
11590b57cec5SDimitry Andric 
1160fe6060f1SDimitry Andric   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1161fe6060f1SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1162fe6060f1SDimitry Andric         !SIInstrInfo::isFLAT(I))
11630b57cec5SDimitry Andric       return false;
11640b57cec5SDimitry Andric 
11650b57cec5SDimitry Andric     for (const MachineOperand &Def : MI->defs()) {
1166fe6060f1SDimitry Andric       const MachineOperand *Op =
1167*0fca6ea1SDimitry Andric           I.findRegisterUseOperand(Def.getReg(), TRI, false);
11680b57cec5SDimitry Andric       if (!Op)
11690b57cec5SDimitry Andric         continue;
11700b57cec5SDimitry Andric       return true;
11710b57cec5SDimitry Andric     }
11720b57cec5SDimitry Andric     return false;
11730b57cec5SDimitry Andric   };
11740b57cec5SDimitry Andric 
1175fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1176fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI) ||
1177fe6060f1SDimitry Andric            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1178fe6060f1SDimitry Andric             !MI.getOperand(0).getImm()) ||
1179fe6060f1SDimitry Andric            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
118006c3fb27SDimitry Andric             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
11810b57cec5SDimitry Andric   };
11820b57cec5SDimitry Andric 
11830b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11840b57cec5SDimitry Andric       std::numeric_limits<int>::max())
11850b57cec5SDimitry Andric     return false;
11860b57cec5SDimitry Andric 
11870b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1188e8d8bef9SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1189e8d8bef9SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
119006c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
11910b57cec5SDimitry Andric   return true;
11920b57cec5SDimitry Andric }
11930b57cec5SDimitry Andric 
11940b57cec5SDimitry Andric bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
11950b57cec5SDimitry Andric   if (!ST.hasSMEMtoVectorWriteHazard())
11960b57cec5SDimitry Andric     return false;
11977a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
11980b57cec5SDimitry Andric 
11990b57cec5SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
12000b57cec5SDimitry Andric     return false;
12010b57cec5SDimitry Andric 
12020b57cec5SDimitry Andric   unsigned SDSTName;
12030b57cec5SDimitry Andric   switch (MI->getOpcode()) {
12040b57cec5SDimitry Andric   case AMDGPU::V_READLANE_B32:
12050b57cec5SDimitry Andric   case AMDGPU::V_READFIRSTLANE_B32:
12060b57cec5SDimitry Andric     SDSTName = AMDGPU::OpName::vdst;
12070b57cec5SDimitry Andric     break;
12080b57cec5SDimitry Andric   default:
12090b57cec5SDimitry Andric     SDSTName = AMDGPU::OpName::sdst;
12100b57cec5SDimitry Andric     break;
12110b57cec5SDimitry Andric   }
12120b57cec5SDimitry Andric 
12130b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
12140b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
12150b57cec5SDimitry Andric   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
12160b57cec5SDimitry Andric   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
12170b57cec5SDimitry Andric   if (!SDST) {
12180b57cec5SDimitry Andric     for (const auto &MO : MI->implicit_operands()) {
1219bdd1243dSDimitry Andric       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
12200b57cec5SDimitry Andric         SDST = &MO;
12210b57cec5SDimitry Andric         break;
12220b57cec5SDimitry Andric       }
12230b57cec5SDimitry Andric     }
12240b57cec5SDimitry Andric   }
12250b57cec5SDimitry Andric 
12260b57cec5SDimitry Andric   if (!SDST)
12270b57cec5SDimitry Andric     return false;
12280b57cec5SDimitry Andric 
12298bcb0991SDimitry Andric   const Register SDSTReg = SDST->getReg();
1230fe6060f1SDimitry Andric   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1231fe6060f1SDimitry Andric     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
12320b57cec5SDimitry Andric   };
12330b57cec5SDimitry Andric 
1234fe6060f1SDimitry Andric   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1235fe6060f1SDimitry Andric     if (TII->isSALU(MI)) {
1236fe6060f1SDimitry Andric       switch (MI.getOpcode()) {
12370b57cec5SDimitry Andric       case AMDGPU::S_SETVSKIP:
12380b57cec5SDimitry Andric       case AMDGPU::S_VERSION:
12390b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_VSCNT:
12400b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_VMCNT:
12410b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_EXPCNT:
12420b57cec5SDimitry Andric         // These instructions cannot not mitigate the hazard.
12430b57cec5SDimitry Andric         return false;
12440b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_LGKMCNT:
12450b57cec5SDimitry Andric         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1246fe6060f1SDimitry Andric         return (MI.getOperand(1).getImm() == 0) &&
1247fe6060f1SDimitry Andric                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
12480b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT: {
1249fe6060f1SDimitry Andric         const int64_t Imm = MI.getOperand(0).getImm();
12500b57cec5SDimitry Andric         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
12517a6dacacSDimitry Andric         // DsCnt corresponds to LGKMCnt here.
12527a6dacacSDimitry Andric         return (Decoded.DsCnt == 0);
12530b57cec5SDimitry Andric       }
12540b57cec5SDimitry Andric       default:
12550b57cec5SDimitry Andric         // SOPP instructions cannot mitigate the hazard.
1256fe6060f1SDimitry Andric         if (TII->isSOPP(MI))
12570b57cec5SDimitry Andric           return false;
12580b57cec5SDimitry Andric         // At this point the SALU can be assumed to mitigate the hazard
12590b57cec5SDimitry Andric         // because either:
12600b57cec5SDimitry Andric         // (a) it is independent of the at risk SMEM (breaking chain),
12610b57cec5SDimitry Andric         // or
12620b57cec5SDimitry Andric         // (b) it is dependent on the SMEM, in which case an appropriate
12630b57cec5SDimitry Andric         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
12640b57cec5SDimitry Andric         //     SMEM instruction.
12650b57cec5SDimitry Andric         return true;
12660b57cec5SDimitry Andric       }
12670b57cec5SDimitry Andric     }
12680b57cec5SDimitry Andric     return false;
12690b57cec5SDimitry Andric   };
12700b57cec5SDimitry Andric 
12710b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
12720b57cec5SDimitry Andric       std::numeric_limits<int>::max())
12730b57cec5SDimitry Andric     return false;
12740b57cec5SDimitry Andric 
12750b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
12760b57cec5SDimitry Andric           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
12770b57cec5SDimitry Andric       .addImm(0);
12780b57cec5SDimitry Andric   return true;
12790b57cec5SDimitry Andric }
12800b57cec5SDimitry Andric 
12810b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
12827a6dacacSDimitry Andric   if (!ST.hasVcmpxExecWARHazard())
12837a6dacacSDimitry Andric     return false;
12847a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
12857a6dacacSDimitry Andric 
12867a6dacacSDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
12870b57cec5SDimitry Andric     return false;
12880b57cec5SDimitry Andric 
12890b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
12900b57cec5SDimitry Andric   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
12910b57cec5SDimitry Andric     return false;
12920b57cec5SDimitry Andric 
1293fe6060f1SDimitry Andric   auto IsHazardFn = [TRI](const MachineInstr &I) {
1294fe6060f1SDimitry Andric     if (SIInstrInfo::isVALU(I))
12950b57cec5SDimitry Andric       return false;
1296fe6060f1SDimitry Andric     return I.readsRegister(AMDGPU::EXEC, TRI);
12970b57cec5SDimitry Andric   };
12980b57cec5SDimitry Andric 
12990b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1300fe6060f1SDimitry Andric   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1301fe6060f1SDimitry Andric     if (SIInstrInfo::isVALU(MI)) {
1302fe6060f1SDimitry Andric       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
13030b57cec5SDimitry Andric         return true;
1304fe6060f1SDimitry Andric       for (auto MO : MI.implicit_operands())
1305bdd1243dSDimitry Andric         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
13060b57cec5SDimitry Andric           return true;
13070b57cec5SDimitry Andric     }
1308fe6060f1SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
130906c3fb27SDimitry Andric         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
13100b57cec5SDimitry Andric       return true;
13110b57cec5SDimitry Andric     return false;
13120b57cec5SDimitry Andric   };
13130b57cec5SDimitry Andric 
13140b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13150b57cec5SDimitry Andric       std::numeric_limits<int>::max())
13160b57cec5SDimitry Andric     return false;
13170b57cec5SDimitry Andric 
13180b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
13190b57cec5SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
132006c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
13210b57cec5SDimitry Andric   return true;
13220b57cec5SDimitry Andric }
13230b57cec5SDimitry Andric 
1324fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1325fe6060f1SDimitry Andric                                                  const GCNSubtarget &ST) {
13260b57cec5SDimitry Andric   if (!ST.hasLdsBranchVmemWARHazard())
13270b57cec5SDimitry Andric     return false;
13280b57cec5SDimitry Andric 
1329fe6060f1SDimitry Andric   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1330fe6060f1SDimitry Andric   // instructions need to appear in the same function.
1331fe6060f1SDimitry Andric   bool HasLds = false;
1332fe6060f1SDimitry Andric   bool HasVmem = false;
1333fe6060f1SDimitry Andric   for (auto &MBB : MF) {
1334fe6060f1SDimitry Andric     for (auto &MI : MBB) {
1335fe6060f1SDimitry Andric       HasLds |= SIInstrInfo::isDS(MI);
1336fe6060f1SDimitry Andric       HasVmem |=
1337fe6060f1SDimitry Andric           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1338fe6060f1SDimitry Andric       if (HasLds && HasVmem)
1339fe6060f1SDimitry Andric         return true;
1340fe6060f1SDimitry Andric     }
1341fe6060f1SDimitry Andric   }
1342fe6060f1SDimitry Andric   return false;
1343fe6060f1SDimitry Andric }
1344fe6060f1SDimitry Andric 
1345bdd1243dSDimitry Andric static bool isStoreCountWaitZero(const MachineInstr &I) {
1346bdd1243dSDimitry Andric   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1347bdd1243dSDimitry Andric          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1348bdd1243dSDimitry Andric          !I.getOperand(1).getImm();
1349bdd1243dSDimitry Andric }
1350bdd1243dSDimitry Andric 
1351fe6060f1SDimitry Andric bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1352fe6060f1SDimitry Andric   if (!RunLdsBranchVmemWARHazardFixup)
1353fe6060f1SDimitry Andric     return false;
1354fe6060f1SDimitry Andric 
1355fe6060f1SDimitry Andric   assert(ST.hasLdsBranchVmemWARHazard());
13567a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
1357fe6060f1SDimitry Andric 
1358fe6060f1SDimitry Andric   auto IsHazardInst = [](const MachineInstr &MI) {
1359fe6060f1SDimitry Andric     if (SIInstrInfo::isDS(MI))
13600b57cec5SDimitry Andric       return 1;
1361fe6060f1SDimitry Andric     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
13620b57cec5SDimitry Andric       return 2;
13630b57cec5SDimitry Andric     return 0;
13640b57cec5SDimitry Andric   };
13650b57cec5SDimitry Andric 
1366fe6060f1SDimitry Andric   auto InstType = IsHazardInst(*MI);
13670b57cec5SDimitry Andric   if (!InstType)
13680b57cec5SDimitry Andric     return false;
13690b57cec5SDimitry Andric 
1370fe6060f1SDimitry Andric   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1371bdd1243dSDimitry Andric     return IsHazardInst(I) || isStoreCountWaitZero(I);
13720b57cec5SDimitry Andric   };
13730b57cec5SDimitry Andric 
1374fe6060f1SDimitry Andric   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1375fe6060f1SDimitry Andric     if (!I.isBranch())
13760b57cec5SDimitry Andric       return false;
13770b57cec5SDimitry Andric 
1378fe6060f1SDimitry Andric     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
13790b57cec5SDimitry Andric       auto InstType2 = IsHazardInst(I);
13800b57cec5SDimitry Andric       return InstType2 && InstType != InstType2;
13810b57cec5SDimitry Andric     };
13820b57cec5SDimitry Andric 
1383fe6060f1SDimitry Andric     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
13840b57cec5SDimitry Andric       auto InstType2 = IsHazardInst(I);
13850b57cec5SDimitry Andric       if (InstType == InstType2)
13860b57cec5SDimitry Andric         return true;
13870b57cec5SDimitry Andric 
1388bdd1243dSDimitry Andric       return isStoreCountWaitZero(I);
13890b57cec5SDimitry Andric     };
13900b57cec5SDimitry Andric 
1391fe6060f1SDimitry Andric     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
13920b57cec5SDimitry Andric            std::numeric_limits<int>::max();
13930b57cec5SDimitry Andric   };
13940b57cec5SDimitry Andric 
13950b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13960b57cec5SDimitry Andric       std::numeric_limits<int>::max())
13970b57cec5SDimitry Andric     return false;
13980b57cec5SDimitry Andric 
13990b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
14000b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
14010b57cec5SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_VSCNT))
14020b57cec5SDimitry Andric     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
14030b57cec5SDimitry Andric     .addImm(0);
14040b57cec5SDimitry Andric 
14050b57cec5SDimitry Andric   return true;
14060b57cec5SDimitry Andric }
14070b57cec5SDimitry Andric 
140881ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
140981ad6265SDimitry Andric   if (!SIInstrInfo::isLDSDIR(*MI))
141081ad6265SDimitry Andric     return false;
141181ad6265SDimitry Andric 
141281ad6265SDimitry Andric   const int NoHazardWaitStates = 15;
141381ad6265SDimitry Andric   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
141481ad6265SDimitry Andric   const Register VDSTReg = VDST->getReg();
141581ad6265SDimitry Andric 
141681ad6265SDimitry Andric   bool VisitedTrans = false;
141781ad6265SDimitry Andric   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
141881ad6265SDimitry Andric     if (!SIInstrInfo::isVALU(I))
141981ad6265SDimitry Andric       return false;
142081ad6265SDimitry Andric     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
142181ad6265SDimitry Andric     // Cover both WAR and WAW
142281ad6265SDimitry Andric     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
142381ad6265SDimitry Andric   };
142481ad6265SDimitry Andric   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
142581ad6265SDimitry Andric     if (WaitStates >= NoHazardWaitStates)
142681ad6265SDimitry Andric       return true;
142781ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
142881ad6265SDimitry Andric     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
142981ad6265SDimitry Andric            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
143081ad6265SDimitry Andric   };
143181ad6265SDimitry Andric   auto GetWaitStatesFn = [](const MachineInstr &MI) {
143281ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) ? 1 : 0;
143381ad6265SDimitry Andric   };
143481ad6265SDimitry Andric 
143581ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
143681ad6265SDimitry Andric   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
143781ad6265SDimitry Andric                                     std::next(MI->getReverseIterator()), 0,
143881ad6265SDimitry Andric                                     IsExpiredFn, Visited, GetWaitStatesFn);
143981ad6265SDimitry Andric 
144081ad6265SDimitry Andric   // Transcendentals can execute in parallel to other VALUs.
144181ad6265SDimitry Andric   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
144281ad6265SDimitry Andric   if (VisitedTrans)
144381ad6265SDimitry Andric     Count = 0;
144481ad6265SDimitry Andric 
144581ad6265SDimitry Andric   MachineOperand *WaitVdstOp =
144681ad6265SDimitry Andric       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
144781ad6265SDimitry Andric   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
144881ad6265SDimitry Andric 
144981ad6265SDimitry Andric   return true;
145081ad6265SDimitry Andric }
145181ad6265SDimitry Andric 
145281ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
145381ad6265SDimitry Andric   if (!SIInstrInfo::isLDSDIR(*MI))
145481ad6265SDimitry Andric     return false;
145581ad6265SDimitry Andric 
145681ad6265SDimitry Andric   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
145781ad6265SDimitry Andric   const Register VDSTReg = VDST->getReg();
145881ad6265SDimitry Andric 
145981ad6265SDimitry Andric   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
146081ad6265SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
146181ad6265SDimitry Andric         !SIInstrInfo::isDS(I))
146281ad6265SDimitry Andric       return false;
146381ad6265SDimitry Andric     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
146481ad6265SDimitry Andric   };
1465297eecfbSDimitry Andric   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
14667a6dacacSDimitry Andric   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
14677a6dacacSDimitry Andric   // according to the type of VMEM instruction.
1468297eecfbSDimitry Andric   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
146981ad6265SDimitry Andric     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
147081ad6265SDimitry Andric            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
147181ad6265SDimitry Andric            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1472297eecfbSDimitry Andric             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1473297eecfbSDimitry Andric            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1474297eecfbSDimitry Andric             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
147581ad6265SDimitry Andric   };
147681ad6265SDimitry Andric 
147781ad6265SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
147881ad6265SDimitry Andric       std::numeric_limits<int>::max())
147981ad6265SDimitry Andric     return false;
148081ad6265SDimitry Andric 
1481297eecfbSDimitry Andric   if (LdsdirCanWait) {
1482297eecfbSDimitry Andric     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1483297eecfbSDimitry Andric   } else {
148481ad6265SDimitry Andric     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
148581ad6265SDimitry Andric             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
148606c3fb27SDimitry Andric         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1487297eecfbSDimitry Andric   }
148881ad6265SDimitry Andric 
148981ad6265SDimitry Andric   return true;
149081ad6265SDimitry Andric }
149181ad6265SDimitry Andric 
149281ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
149381ad6265SDimitry Andric   if (!ST.hasVALUPartialForwardingHazard())
149481ad6265SDimitry Andric     return false;
14957a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
14967a6dacacSDimitry Andric 
14977a6dacacSDimitry Andric   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
149881ad6265SDimitry Andric     return false;
149981ad6265SDimitry Andric 
150081ad6265SDimitry Andric   SmallSetVector<Register, 4> SrcVGPRs;
150181ad6265SDimitry Andric 
150281ad6265SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
150381ad6265SDimitry Andric     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
150481ad6265SDimitry Andric       SrcVGPRs.insert(Use.getReg());
150581ad6265SDimitry Andric   }
150681ad6265SDimitry Andric 
150781ad6265SDimitry Andric   // Only applies with >= 2 unique VGPR sources
150881ad6265SDimitry Andric   if (SrcVGPRs.size() <= 1)
150981ad6265SDimitry Andric     return false;
151081ad6265SDimitry Andric 
151181ad6265SDimitry Andric   // Look for the following pattern:
151281ad6265SDimitry Andric   //   Va <- VALU [PreExecPos]
151381ad6265SDimitry Andric   //   intv1
151481ad6265SDimitry Andric   //   Exec <- SALU [ExecPos]
151581ad6265SDimitry Andric   //   intv2
151681ad6265SDimitry Andric   //   Vb <- VALU [PostExecPos]
151781ad6265SDimitry Andric   //   intv3
151881ad6265SDimitry Andric   //   MI Va, Vb (WaitState = 0)
151981ad6265SDimitry Andric   //
152081ad6265SDimitry Andric   // Where:
152181ad6265SDimitry Andric   // intv1 + intv2 <= 2 VALUs
152281ad6265SDimitry Andric   // intv3 <= 4 VALUs
152381ad6265SDimitry Andric   //
152481ad6265SDimitry Andric   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
152581ad6265SDimitry Andric 
152681ad6265SDimitry Andric   const int Intv1plus2MaxVALUs = 2;
152781ad6265SDimitry Andric   const int Intv3MaxVALUs = 4;
152881ad6265SDimitry Andric   const int IntvMaxVALUs = 6;
152981ad6265SDimitry Andric   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
153081ad6265SDimitry Andric 
153181ad6265SDimitry Andric   struct StateType {
153281ad6265SDimitry Andric     SmallDenseMap<Register, int, 4> DefPos;
153381ad6265SDimitry Andric     int ExecPos = std::numeric_limits<int>::max();
153481ad6265SDimitry Andric     int VALUs = 0;
153581ad6265SDimitry Andric   };
153681ad6265SDimitry Andric 
153781ad6265SDimitry Andric   StateType State;
153881ad6265SDimitry Andric 
153981ad6265SDimitry Andric   // This overloads expiry testing with all the hazard detection
154081ad6265SDimitry Andric   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
154181ad6265SDimitry Andric     // Too many VALU states have passed
154281ad6265SDimitry Andric     if (State.VALUs > NoHazardVALUWaitStates)
154381ad6265SDimitry Andric       return HazardExpired;
154481ad6265SDimitry Andric 
154581ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
154681ad6265SDimitry Andric     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
154781ad6265SDimitry Andric         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
154881ad6265SDimitry Andric         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
154906c3fb27SDimitry Andric          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
155081ad6265SDimitry Andric       return HazardExpired;
155181ad6265SDimitry Andric 
155281ad6265SDimitry Andric     // Track registers writes
155381ad6265SDimitry Andric     bool Changed = false;
155481ad6265SDimitry Andric     if (SIInstrInfo::isVALU(I)) {
155581ad6265SDimitry Andric       for (Register Src : SrcVGPRs) {
155681ad6265SDimitry Andric         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
155781ad6265SDimitry Andric           State.DefPos[Src] = State.VALUs;
155881ad6265SDimitry Andric           Changed = true;
155981ad6265SDimitry Andric         }
156081ad6265SDimitry Andric       }
156181ad6265SDimitry Andric     } else if (SIInstrInfo::isSALU(I)) {
156281ad6265SDimitry Andric       if (State.ExecPos == std::numeric_limits<int>::max()) {
156381ad6265SDimitry Andric         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
156481ad6265SDimitry Andric           State.ExecPos = State.VALUs;
156581ad6265SDimitry Andric           Changed = true;
156681ad6265SDimitry Andric         }
156781ad6265SDimitry Andric       }
156881ad6265SDimitry Andric     }
156981ad6265SDimitry Andric 
157081ad6265SDimitry Andric     // Early expiration: too many VALUs in intv3
157181ad6265SDimitry Andric     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
157281ad6265SDimitry Andric       return HazardExpired;
157381ad6265SDimitry Andric 
157481ad6265SDimitry Andric     // Only evaluate state if something changed
157581ad6265SDimitry Andric     if (!Changed)
157681ad6265SDimitry Andric       return NoHazardFound;
157781ad6265SDimitry Andric 
157881ad6265SDimitry Andric     // Determine positions of VALUs pre/post exec change
157981ad6265SDimitry Andric     if (State.ExecPos == std::numeric_limits<int>::max())
158081ad6265SDimitry Andric       return NoHazardFound;
158181ad6265SDimitry Andric 
158281ad6265SDimitry Andric     int PreExecPos = std::numeric_limits<int>::max();
158381ad6265SDimitry Andric     int PostExecPos = std::numeric_limits<int>::max();
158481ad6265SDimitry Andric 
158581ad6265SDimitry Andric     for (auto Entry : State.DefPos) {
158681ad6265SDimitry Andric       int DefVALUs = Entry.second;
158781ad6265SDimitry Andric       if (DefVALUs != std::numeric_limits<int>::max()) {
158881ad6265SDimitry Andric         if (DefVALUs >= State.ExecPos)
158981ad6265SDimitry Andric           PreExecPos = std::min(PreExecPos, DefVALUs);
1590*0fca6ea1SDimitry Andric         else
159181ad6265SDimitry Andric           PostExecPos = std::min(PostExecPos, DefVALUs);
159281ad6265SDimitry Andric       }
159381ad6265SDimitry Andric     }
159481ad6265SDimitry Andric 
159581ad6265SDimitry Andric     // Need a VALUs post exec change
159681ad6265SDimitry Andric     if (PostExecPos == std::numeric_limits<int>::max())
159781ad6265SDimitry Andric       return NoHazardFound;
159881ad6265SDimitry Andric 
159981ad6265SDimitry Andric     // Too many VALUs in intv3?
160081ad6265SDimitry Andric     int Intv3VALUs = PostExecPos;
160181ad6265SDimitry Andric     if (Intv3VALUs > Intv3MaxVALUs)
160281ad6265SDimitry Andric       return HazardExpired;
160381ad6265SDimitry Andric 
160481ad6265SDimitry Andric     // Too many VALUs in intv2?
160581ad6265SDimitry Andric     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
160681ad6265SDimitry Andric     if (Intv2VALUs > Intv1plus2MaxVALUs)
160781ad6265SDimitry Andric       return HazardExpired;
160881ad6265SDimitry Andric 
160981ad6265SDimitry Andric     // Need a VALUs pre exec change
161081ad6265SDimitry Andric     if (PreExecPos == std::numeric_limits<int>::max())
161181ad6265SDimitry Andric       return NoHazardFound;
161281ad6265SDimitry Andric 
161381ad6265SDimitry Andric     // Too many VALUs in intv1?
161481ad6265SDimitry Andric     int Intv1VALUs = PreExecPos - State.ExecPos;
161581ad6265SDimitry Andric     if (Intv1VALUs > Intv1plus2MaxVALUs)
161681ad6265SDimitry Andric       return HazardExpired;
161781ad6265SDimitry Andric 
161881ad6265SDimitry Andric     // Too many VALUs in intv1 + intv2
161981ad6265SDimitry Andric     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
162081ad6265SDimitry Andric       return HazardExpired;
162181ad6265SDimitry Andric 
162281ad6265SDimitry Andric     return HazardFound;
162381ad6265SDimitry Andric   };
162481ad6265SDimitry Andric   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
162581ad6265SDimitry Andric     if (SIInstrInfo::isVALU(MI))
162681ad6265SDimitry Andric       State.VALUs += 1;
162781ad6265SDimitry Andric   };
162881ad6265SDimitry Andric 
162981ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
163081ad6265SDimitry Andric   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
163181ad6265SDimitry Andric                             std::next(MI->getReverseIterator()), Visited))
163281ad6265SDimitry Andric     return false;
163381ad6265SDimitry Andric 
163481ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
163581ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
163681ad6265SDimitry Andric       .addImm(0x0fff);
163781ad6265SDimitry Andric 
163881ad6265SDimitry Andric   return true;
163981ad6265SDimitry Andric }
164081ad6265SDimitry Andric 
164181ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
164281ad6265SDimitry Andric   if (!ST.hasVALUTransUseHazard())
164381ad6265SDimitry Andric     return false;
16447a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
16457a6dacacSDimitry Andric 
164681ad6265SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
164781ad6265SDimitry Andric     return false;
164881ad6265SDimitry Andric 
164981ad6265SDimitry Andric   SmallSet<Register, 4> SrcVGPRs;
165081ad6265SDimitry Andric 
165181ad6265SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
165281ad6265SDimitry Andric     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
165381ad6265SDimitry Andric       SrcVGPRs.insert(Use.getReg());
165481ad6265SDimitry Andric   }
165581ad6265SDimitry Andric 
165681ad6265SDimitry Andric   // Look for the following pattern:
165781ad6265SDimitry Andric   //   Va <- TRANS VALU
165881ad6265SDimitry Andric   //   intv
165981ad6265SDimitry Andric   //   MI Va (WaitState = 0)
166081ad6265SDimitry Andric   //
166181ad6265SDimitry Andric   // Where:
166281ad6265SDimitry Andric   // intv <= 5 VALUs / 1 TRANS
166381ad6265SDimitry Andric   //
166481ad6265SDimitry Andric   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
166581ad6265SDimitry Andric 
166681ad6265SDimitry Andric   const int IntvMaxVALUs = 5;
166781ad6265SDimitry Andric   const int IntvMaxTRANS = 1;
166881ad6265SDimitry Andric 
166981ad6265SDimitry Andric   struct StateType {
167081ad6265SDimitry Andric     int VALUs = 0;
167181ad6265SDimitry Andric     int TRANS = 0;
167281ad6265SDimitry Andric   };
167381ad6265SDimitry Andric 
167481ad6265SDimitry Andric   StateType State;
167581ad6265SDimitry Andric 
167681ad6265SDimitry Andric   // This overloads expiry testing with all the hazard detection
167781ad6265SDimitry Andric   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
167881ad6265SDimitry Andric     // Too many VALU states have passed
167981ad6265SDimitry Andric     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
168081ad6265SDimitry Andric       return HazardExpired;
168181ad6265SDimitry Andric 
168281ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
168381ad6265SDimitry Andric     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
168481ad6265SDimitry Andric         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
168581ad6265SDimitry Andric         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
168681ad6265SDimitry Andric          I.getOperand(0).getImm() == 0x0fff))
168781ad6265SDimitry Andric       return HazardExpired;
168881ad6265SDimitry Andric 
168981ad6265SDimitry Andric     // Track registers writes
169081ad6265SDimitry Andric     if (SIInstrInfo::isTRANS(I)) {
169181ad6265SDimitry Andric       for (Register Src : SrcVGPRs) {
169281ad6265SDimitry Andric         if (I.modifiesRegister(Src, &TRI)) {
169381ad6265SDimitry Andric           return HazardFound;
169481ad6265SDimitry Andric         }
169581ad6265SDimitry Andric       }
169681ad6265SDimitry Andric     }
169781ad6265SDimitry Andric 
169881ad6265SDimitry Andric     return NoHazardFound;
169981ad6265SDimitry Andric   };
170081ad6265SDimitry Andric   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
170181ad6265SDimitry Andric     if (SIInstrInfo::isVALU(MI))
170281ad6265SDimitry Andric       State.VALUs += 1;
170381ad6265SDimitry Andric     if (SIInstrInfo::isTRANS(MI))
170481ad6265SDimitry Andric       State.TRANS += 1;
170581ad6265SDimitry Andric   };
170681ad6265SDimitry Andric 
170781ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
170881ad6265SDimitry Andric   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
170981ad6265SDimitry Andric                             std::next(MI->getReverseIterator()), Visited))
171081ad6265SDimitry Andric     return false;
171181ad6265SDimitry Andric 
171281ad6265SDimitry Andric   // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
171306c3fb27SDimitry Andric   // avoided.
171481ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
171581ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
171606c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
171781ad6265SDimitry Andric 
171881ad6265SDimitry Andric   return true;
171981ad6265SDimitry Andric }
172081ad6265SDimitry Andric 
172181ad6265SDimitry Andric bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1722b3edf446SDimitry Andric   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
172381ad6265SDimitry Andric     return false;
172481ad6265SDimitry Andric 
172581ad6265SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
172681ad6265SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
172781ad6265SDimitry Andric 
1728b3edf446SDimitry Andric   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1729b3edf446SDimitry Andric     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
173081ad6265SDimitry Andric       return false;
173181ad6265SDimitry Andric 
1732*0fca6ea1SDimitry Andric     // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1733*0fca6ea1SDimitry Andric     // with the dest(matrix D) of the previous wmma.
173481ad6265SDimitry Andric     const Register CurSrc0Reg =
173581ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
173681ad6265SDimitry Andric     const Register CurSrc1Reg =
173781ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
173881ad6265SDimitry Andric 
173981ad6265SDimitry Andric     const Register PrevDstReg =
174081ad6265SDimitry Andric         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
174181ad6265SDimitry Andric 
174281ad6265SDimitry Andric     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
174381ad6265SDimitry Andric         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
174481ad6265SDimitry Andric       return true;
174581ad6265SDimitry Andric     }
174681ad6265SDimitry Andric 
1747b3edf446SDimitry Andric     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1748b3edf446SDimitry Andric     // but Index can't overlap with PrevDstReg.
1749b3edf446SDimitry Andric     if (AMDGPU::isGFX12Plus(ST)) {
1750b3edf446SDimitry Andric       if (SIInstrInfo::isSWMMAC(*MI)) {
1751b3edf446SDimitry Andric         const Register CurIndex =
1752b3edf446SDimitry Andric             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1753b3edf446SDimitry Andric         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1754b3edf446SDimitry Andric           return true;
1755b3edf446SDimitry Andric       }
1756b3edf446SDimitry Andric       return false;
1757b3edf446SDimitry Andric     }
1758b3edf446SDimitry Andric 
175981ad6265SDimitry Andric     return false;
176081ad6265SDimitry Andric   };
176181ad6265SDimitry Andric 
176281ad6265SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &I, int) {
176381ad6265SDimitry Andric     return SIInstrInfo::isVALU(I);
176481ad6265SDimitry Andric   };
176581ad6265SDimitry Andric 
176681ad6265SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
176781ad6265SDimitry Andric       std::numeric_limits<int>::max())
176881ad6265SDimitry Andric     return false;
176981ad6265SDimitry Andric 
177081ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
177181ad6265SDimitry Andric 
177281ad6265SDimitry Andric   return true;
177381ad6265SDimitry Andric }
177481ad6265SDimitry Andric 
1775bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1776bdd1243dSDimitry Andric   if (!ST.hasShift64HighRegBug())
1777bdd1243dSDimitry Andric     return false;
17787a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
1779bdd1243dSDimitry Andric 
1780bdd1243dSDimitry Andric   switch (MI->getOpcode()) {
1781bdd1243dSDimitry Andric   default:
1782bdd1243dSDimitry Andric     return false;
1783bdd1243dSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64:
1784bdd1243dSDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64:
1785bdd1243dSDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64:
1786bdd1243dSDimitry Andric     break;
1787bdd1243dSDimitry Andric   }
1788bdd1243dSDimitry Andric 
1789bdd1243dSDimitry Andric   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1790bdd1243dSDimitry Andric   if (!Amt->isReg())
1791bdd1243dSDimitry Andric     return false;
1792bdd1243dSDimitry Andric 
1793bdd1243dSDimitry Andric   Register AmtReg = Amt->getReg();
1794bdd1243dSDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
1795bdd1243dSDimitry Andric   // Check if this is a last VGPR in the allocation block.
1796bdd1243dSDimitry Andric   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1797bdd1243dSDimitry Andric     return false;
1798bdd1243dSDimitry Andric 
1799bdd1243dSDimitry Andric   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1800bdd1243dSDimitry Andric     return false;
1801bdd1243dSDimitry Andric 
1802bdd1243dSDimitry Andric   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1803bdd1243dSDimitry Andric   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1804bdd1243dSDimitry Andric   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1805bdd1243dSDimitry Andric   bool Overlapped = OverlappedSrc || OverlappedDst;
1806bdd1243dSDimitry Andric 
1807bdd1243dSDimitry Andric   assert(!OverlappedDst || !OverlappedSrc ||
1808bdd1243dSDimitry Andric          Src1->getReg() == MI->getOperand(0).getReg());
1809bdd1243dSDimitry Andric   assert(ST.needsAlignedVGPRs());
1810bdd1243dSDimitry Andric   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1811bdd1243dSDimitry Andric 
1812bdd1243dSDimitry Andric   Register NewReg;
1813bdd1243dSDimitry Andric   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1814bdd1243dSDimitry Andric                                    : AMDGPU::VGPR_32RegClass) {
1815bdd1243dSDimitry Andric     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1816bdd1243dSDimitry Andric       NewReg = Reg;
1817bdd1243dSDimitry Andric       break;
1818bdd1243dSDimitry Andric     }
1819bdd1243dSDimitry Andric   }
1820bdd1243dSDimitry Andric 
1821bdd1243dSDimitry Andric   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1822bdd1243dSDimitry Andric                                : NewReg;
1823bdd1243dSDimitry Andric   Register NewAmtLo;
1824bdd1243dSDimitry Andric 
1825bdd1243dSDimitry Andric   if (Overlapped)
1826bdd1243dSDimitry Andric     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1827bdd1243dSDimitry Andric 
1828bdd1243dSDimitry Andric   DebugLoc DL = MI->getDebugLoc();
1829bdd1243dSDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
1830bdd1243dSDimitry Andric   // Insert a full wait count because found register might be pending a wait.
1831bdd1243dSDimitry Andric   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1832bdd1243dSDimitry Andric       .addImm(0);
1833bdd1243dSDimitry Andric 
1834bdd1243dSDimitry Andric   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1835bdd1243dSDimitry Andric   if (Overlapped)
1836bdd1243dSDimitry Andric     runOnInstruction(
1837bdd1243dSDimitry Andric         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1838bdd1243dSDimitry Andric             .addDef(AmtReg - 1)
1839bdd1243dSDimitry Andric             .addReg(AmtReg - 1, RegState::Undef)
1840bdd1243dSDimitry Andric             .addReg(NewAmtLo, RegState::Undef));
1841bdd1243dSDimitry Andric   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1842bdd1243dSDimitry Andric                        .addDef(AmtReg)
1843bdd1243dSDimitry Andric                        .addReg(AmtReg, RegState::Undef)
1844bdd1243dSDimitry Andric                        .addReg(NewAmt, RegState::Undef));
1845bdd1243dSDimitry Andric 
1846bdd1243dSDimitry Andric   // Instructions emitted after the current instruction will be processed by the
1847bdd1243dSDimitry Andric   // parent loop of the hazard recognizer in a natural way.
1848bdd1243dSDimitry Andric   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1849bdd1243dSDimitry Andric           AmtReg)
1850bdd1243dSDimitry Andric       .addDef(NewAmt)
1851bdd1243dSDimitry Andric       .addReg(NewAmt)
1852bdd1243dSDimitry Andric       .addReg(AmtReg);
1853bdd1243dSDimitry Andric   if (Overlapped)
1854bdd1243dSDimitry Andric     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1855bdd1243dSDimitry Andric             AmtReg - 1)
1856bdd1243dSDimitry Andric         .addDef(NewAmtLo)
1857bdd1243dSDimitry Andric         .addReg(NewAmtLo)
1858bdd1243dSDimitry Andric         .addReg(AmtReg - 1);
1859bdd1243dSDimitry Andric 
1860bdd1243dSDimitry Andric   // Re-running hazard recognizer on the modified instruction is not necessary,
1861bdd1243dSDimitry Andric   // inserted V_SWAP_B32 has already both read and write new registers so
1862bdd1243dSDimitry Andric   // hazards related to these register has already been handled.
1863bdd1243dSDimitry Andric   Amt->setReg(NewAmt);
1864bdd1243dSDimitry Andric   Amt->setIsKill(false);
1865bdd1243dSDimitry Andric   // We do not update liveness, so verifier may see it as undef.
1866bdd1243dSDimitry Andric   Amt->setIsUndef();
1867bdd1243dSDimitry Andric   if (OverlappedDst)
1868bdd1243dSDimitry Andric     MI->getOperand(0).setReg(NewReg);
1869bdd1243dSDimitry Andric   if (OverlappedSrc) {
1870bdd1243dSDimitry Andric     Src1->setReg(NewReg);
1871bdd1243dSDimitry Andric     Src1->setIsKill(false);
1872bdd1243dSDimitry Andric     Src1->setIsUndef();
1873bdd1243dSDimitry Andric   }
1874bdd1243dSDimitry Andric 
1875bdd1243dSDimitry Andric   return true;
1876bdd1243dSDimitry Andric }
1877bdd1243dSDimitry Andric 
18780b57cec5SDimitry Andric int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
18790b57cec5SDimitry Andric   int NSAtoVMEMWaitStates = 1;
18800b57cec5SDimitry Andric 
18810b57cec5SDimitry Andric   if (!ST.hasNSAtoVMEMBug())
18820b57cec5SDimitry Andric     return 0;
18830b57cec5SDimitry Andric 
18840b57cec5SDimitry Andric   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
18850b57cec5SDimitry Andric     return 0;
18860b57cec5SDimitry Andric 
18870b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
18880b57cec5SDimitry Andric   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
18890b57cec5SDimitry Andric   if (!Offset || (Offset->getImm() & 6) == 0)
18900b57cec5SDimitry Andric     return 0;
18910b57cec5SDimitry Andric 
1892fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &I) {
1893fe6060f1SDimitry Andric     if (!SIInstrInfo::isMIMG(I))
18940b57cec5SDimitry Andric       return false;
1895fe6060f1SDimitry Andric     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
18960b57cec5SDimitry Andric     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1897fe6060f1SDimitry Andric            TII->getInstSizeInBytes(I) >= 16;
18980b57cec5SDimitry Andric   };
18990b57cec5SDimitry Andric 
19000b57cec5SDimitry Andric   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
19010b57cec5SDimitry Andric }
19020b57cec5SDimitry Andric 
19030b57cec5SDimitry Andric int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
19040b57cec5SDimitry Andric   int FPAtomicToDenormModeWaitStates = 3;
19050b57cec5SDimitry Andric 
1906bdd1243dSDimitry Andric   if (!ST.hasFPAtomicToDenormModeHazard())
1907bdd1243dSDimitry Andric     return 0;
19087a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
1909bdd1243dSDimitry Andric 
19100b57cec5SDimitry Andric   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
19110b57cec5SDimitry Andric     return 0;
19120b57cec5SDimitry Andric 
1913fe6060f1SDimitry Andric   auto IsHazardFn = [](const MachineInstr &I) {
1914fe6060f1SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
19150b57cec5SDimitry Andric       return false;
1916fe6060f1SDimitry Andric     return SIInstrInfo::isFPAtomic(I);
19170b57cec5SDimitry Andric   };
19180b57cec5SDimitry Andric 
1919fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1920fe6060f1SDimitry Andric     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
19210b57cec5SDimitry Andric       return true;
19220b57cec5SDimitry Andric 
1923fe6060f1SDimitry Andric     switch (MI.getOpcode()) {
19240b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT:
19250b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_VSCNT:
19260b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_VMCNT:
19270b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_EXPCNT:
19280b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_LGKMCNT:
1929e8d8bef9SDimitry Andric     case AMDGPU::S_WAIT_IDLE:
19300b57cec5SDimitry Andric       return true;
19310b57cec5SDimitry Andric     default:
19320b57cec5SDimitry Andric       break;
19330b57cec5SDimitry Andric     }
19340b57cec5SDimitry Andric 
19350b57cec5SDimitry Andric     return false;
19360b57cec5SDimitry Andric   };
19370b57cec5SDimitry Andric 
19380b57cec5SDimitry Andric   return FPAtomicToDenormModeWaitStates -
19390b57cec5SDimitry Andric          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
19400b57cec5SDimitry Andric }
19410b57cec5SDimitry Andric 
19420b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
19430b57cec5SDimitry Andric   assert(SIInstrInfo::isMAI(*MI));
19440b57cec5SDimitry Andric 
1945fe6060f1SDimitry Andric   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1946fe6060f1SDimitry Andric }
1947fe6060f1SDimitry Andric 
194881ad6265SDimitry Andric int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
194981ad6265SDimitry Andric   // Early exit if no padding is requested.
195081ad6265SDimitry Andric   if (MFMAPaddingRatio == 0)
195181ad6265SDimitry Andric     return 0;
195281ad6265SDimitry Andric 
195381ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
195481ad6265SDimitry Andric   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
195581ad6265SDimitry Andric     return 0;
195681ad6265SDimitry Andric 
195781ad6265SDimitry Andric   int NeighborMFMALatency = 0;
195881ad6265SDimitry Andric   auto IsNeighboringMFMA = [&NeighborMFMALatency,
195981ad6265SDimitry Andric                             this](const MachineInstr &MI) {
196081ad6265SDimitry Andric     if (!SIInstrInfo::isMFMA(MI))
196181ad6265SDimitry Andric       return false;
196281ad6265SDimitry Andric 
196381ad6265SDimitry Andric     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
196481ad6265SDimitry Andric     return true;
196581ad6265SDimitry Andric   };
196681ad6265SDimitry Andric 
196781ad6265SDimitry Andric   const int MaxMFMAPipelineWaitStates = 16;
196881ad6265SDimitry Andric   int WaitStatesSinceNeighborMFMA =
196981ad6265SDimitry Andric       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
197081ad6265SDimitry Andric 
197181ad6265SDimitry Andric   int NeighborMFMAPaddingNeeded =
197281ad6265SDimitry Andric       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
197381ad6265SDimitry Andric       WaitStatesSinceNeighborMFMA;
197481ad6265SDimitry Andric 
197581ad6265SDimitry Andric   return std::max(0, NeighborMFMAPaddingNeeded);
197681ad6265SDimitry Andric }
197781ad6265SDimitry Andric 
1978fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
19790b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
19800b57cec5SDimitry Andric   unsigned Opc = MI->getOpcode();
19810b57cec5SDimitry Andric 
1982fe6060f1SDimitry Andric   auto IsVALUFn = [](const MachineInstr &MI) {
1983bdd1243dSDimitry Andric     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
19840b57cec5SDimitry Andric   };
19850b57cec5SDimitry Andric 
1986e8d8bef9SDimitry Andric   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
19870b57cec5SDimitry Andric     const int LegacyVALUWritesVGPRWaitStates = 2;
19880b57cec5SDimitry Andric     const int VALUWritesExecWaitStates = 4;
19890b57cec5SDimitry Andric     const int MaxWaitStates = 4;
19900b57cec5SDimitry Andric 
19910b57cec5SDimitry Andric     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
19920b57cec5SDimitry Andric       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
19930b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
19940b57cec5SDimitry Andric 
19950b57cec5SDimitry Andric     if (WaitStatesNeeded < MaxWaitStates) {
19960b57cec5SDimitry Andric       for (const MachineOperand &Use : MI->explicit_uses()) {
19970b57cec5SDimitry Andric         const int MaxWaitStates = 2;
19980b57cec5SDimitry Andric 
19990b57cec5SDimitry Andric         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
20000b57cec5SDimitry Andric           continue;
20010b57cec5SDimitry Andric 
20020b57cec5SDimitry Andric         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
20030b57cec5SDimitry Andric           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
20040b57cec5SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20050b57cec5SDimitry Andric 
20060b57cec5SDimitry Andric         if (WaitStatesNeeded == MaxWaitStates)
20070b57cec5SDimitry Andric           break;
20080b57cec5SDimitry Andric       }
20090b57cec5SDimitry Andric     }
20100b57cec5SDimitry Andric   }
20110b57cec5SDimitry Andric 
20120b57cec5SDimitry Andric   for (const MachineOperand &Op : MI->explicit_operands()) {
20130b57cec5SDimitry Andric     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
20140b57cec5SDimitry Andric       continue;
20150b57cec5SDimitry Andric 
2016e8d8bef9SDimitry Andric     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
20170b57cec5SDimitry Andric       continue;
20180b57cec5SDimitry Andric 
20190b57cec5SDimitry Andric     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
20200b57cec5SDimitry Andric     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
20210b57cec5SDimitry Andric     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
20220b57cec5SDimitry Andric     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
20230b57cec5SDimitry Andric     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
20240b57cec5SDimitry Andric     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
20250b57cec5SDimitry Andric     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
20260b57cec5SDimitry Andric     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
20270b57cec5SDimitry Andric     const int MaxWaitStates = 18;
20288bcb0991SDimitry Andric     Register Reg = Op.getReg();
20290b57cec5SDimitry Andric     unsigned HazardDefLatency = 0;
20300b57cec5SDimitry Andric 
203181ad6265SDimitry Andric     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2032fe6060f1SDimitry Andric                                this](const MachineInstr &MI) {
203381ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
20340b57cec5SDimitry Andric         return false;
2035fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
20360b57cec5SDimitry Andric       if (DstReg == Reg)
20370b57cec5SDimitry Andric         return false;
2038fe6060f1SDimitry Andric       HazardDefLatency =
2039fe6060f1SDimitry Andric           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
20400b57cec5SDimitry Andric       return TRI.regsOverlap(DstReg, Reg);
20410b57cec5SDimitry Andric     };
20420b57cec5SDimitry Andric 
20430b57cec5SDimitry Andric     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
20440b57cec5SDimitry Andric                                                    MaxWaitStates);
20450b57cec5SDimitry Andric     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
20460b57cec5SDimitry Andric     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
204706c3fb27SDimitry Andric     int OpNo = Op.getOperandNo();
20480b57cec5SDimitry Andric     if (OpNo == SrcCIdx) {
20490b57cec5SDimitry Andric       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2050e8d8bef9SDimitry Andric     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
20510b57cec5SDimitry Andric       switch (HazardDefLatency) {
20520b57cec5SDimitry Andric       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
20530b57cec5SDimitry Andric                break;
20540b57cec5SDimitry Andric       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
20550b57cec5SDimitry Andric                break;
2056bdd1243dSDimitry Andric       case 16: [[fallthrough]];
20570b57cec5SDimitry Andric       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
20580b57cec5SDimitry Andric                break;
20590b57cec5SDimitry Andric       }
2060e8d8bef9SDimitry Andric     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
20610b57cec5SDimitry Andric       switch (HazardDefLatency) {
20620b57cec5SDimitry Andric       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
20630b57cec5SDimitry Andric                break;
20640b57cec5SDimitry Andric       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
20650b57cec5SDimitry Andric                break;
2066bdd1243dSDimitry Andric       case 16: [[fallthrough]];
20670b57cec5SDimitry Andric       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
20680b57cec5SDimitry Andric                break;
20690b57cec5SDimitry Andric       }
20700b57cec5SDimitry Andric     }
20710b57cec5SDimitry Andric 
20720b57cec5SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
20730b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20740b57cec5SDimitry Andric 
20750b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
20760b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
20770b57cec5SDimitry Andric 
2078fe6060f1SDimitry Andric     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2079fe6060f1SDimitry Andric       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
20800b57cec5SDimitry Andric         return false;
2081fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
20820b57cec5SDimitry Andric       return TRI.regsOverlap(Reg, DstReg);
20830b57cec5SDimitry Andric     };
20840b57cec5SDimitry Andric 
20850b57cec5SDimitry Andric     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
20860b57cec5SDimitry Andric     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
20870b57cec5SDimitry Andric     const int AccVGPRWriteAccVgprReadWaitStates = 3;
20880b57cec5SDimitry Andric     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
20890b57cec5SDimitry Andric     if (OpNo == SrcCIdx)
20900b57cec5SDimitry Andric       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2091e8d8bef9SDimitry Andric     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
20920b57cec5SDimitry Andric       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
20930b57cec5SDimitry Andric 
20940b57cec5SDimitry Andric     WaitStatesNeededForUse = NeedWaitStates -
20950b57cec5SDimitry Andric       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
20960b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20970b57cec5SDimitry Andric 
20980b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
20990b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
21000b57cec5SDimitry Andric   }
21010b57cec5SDimitry Andric 
2102e8d8bef9SDimitry Andric   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
21030b57cec5SDimitry Andric     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
21040b57cec5SDimitry Andric     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
21050b57cec5SDimitry Andric     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
21060b57cec5SDimitry Andric     const int MaxWaitStates = 13;
21078bcb0991SDimitry Andric     Register DstReg = MI->getOperand(0).getReg();
21080b57cec5SDimitry Andric     unsigned HazardDefLatency = 0;
21090b57cec5SDimitry Andric 
211081ad6265SDimitry Andric     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2111fe6060f1SDimitry Andric                          this](const MachineInstr &MI) {
211281ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
21130b57cec5SDimitry Andric         return false;
2114fe6060f1SDimitry Andric       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2115fe6060f1SDimitry Andric       HazardDefLatency =
2116fe6060f1SDimitry Andric           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
21170b57cec5SDimitry Andric       return TRI.regsOverlap(Reg, DstReg);
21180b57cec5SDimitry Andric     };
21190b57cec5SDimitry Andric 
21200b57cec5SDimitry Andric     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
21210b57cec5SDimitry Andric     int NeedWaitStates;
21220b57cec5SDimitry Andric     switch (HazardDefLatency) {
21230b57cec5SDimitry Andric     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
21240b57cec5SDimitry Andric              break;
21250b57cec5SDimitry Andric     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
21260b57cec5SDimitry Andric              break;
2127bdd1243dSDimitry Andric     case 16: [[fallthrough]];
21280b57cec5SDimitry Andric     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
21290b57cec5SDimitry Andric              break;
21300b57cec5SDimitry Andric     }
21310b57cec5SDimitry Andric 
21320b57cec5SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
21330b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
21340b57cec5SDimitry Andric   }
21350b57cec5SDimitry Andric 
213681ad6265SDimitry Andric   // Pad neighboring MFMA with noops for better inter-wave performance.
213781ad6265SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
213881ad6265SDimitry Andric 
21390b57cec5SDimitry Andric   return WaitStatesNeeded;
21400b57cec5SDimitry Andric }
21410b57cec5SDimitry Andric 
/// GFX940 wait states used by checkMAIHazards90A() when the earlier MFMA is
/// an XDL op whose destination partially overlaps the SrcC operand of the
/// current MFMA.
///
/// \param NumPasses pass count of the earlier MFMA (the caller passes its
///        scheduling-model latency).
/// \returns NumPasses + 1.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 3
  // 4 pass -> 5
  // 8 pass -> 9
  // 16 pass -> 17
  return NumPasses + 1;
}
2150*0fca6ea1SDimitry Andric 
/// GFX940 wait states used by checkMAIHazards90A() when the earlier MFMA is
/// not an XDL op and its destination partially overlaps the SrcC operand of
/// the current MFMA.
///
/// \param NumPasses pass count of the earlier MFMA (the caller passes its
///        scheduling-model latency).
/// \returns NumPasses unchanged.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 2
  // 4 pass -> 4
  // 8 pass -> 8
  // 16 pass -> 16
  return NumPasses;
}
2159*0fca6ea1SDimitry Andric 
/// GFX940 wait states used by checkMAIHazards90A() when the earlier MFMA is
/// not an XDL op and its destination overlaps an operand of the current MFMA
/// other than SrcC.
///
/// \param NumPasses pass count of the earlier MFMA (the caller passes its
///        scheduling-model latency).
/// \returns NumPasses + 2.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2168*0fca6ea1SDimitry Andric 
/// GFX940 wait states used by checkMAIHazards90A() when the earlier MFMA is
/// an XDL op and its destination overlaps an operand of the current MFMA
/// other than SrcC.
///
/// \param NumPasses pass count of the earlier MFMA (the caller passes its
///        scheduling-model latency).
/// \returns NumPasses + 3.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}
2176*0fca6ea1SDimitry Andric 
2177fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2178fe6060f1SDimitry Andric   int WaitStatesNeeded = 0;
2179fe6060f1SDimitry Andric   unsigned Opc = MI->getOpcode();
2180fe6060f1SDimitry Andric 
218181ad6265SDimitry Andric   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
218281ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2183fe6060f1SDimitry Andric   };
2184fe6060f1SDimitry Andric 
218581ad6265SDimitry Andric   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
218681ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
218781ad6265SDimitry Andric            !SIInstrInfo::isDOT(MI);
2188fe6060f1SDimitry Andric   };
2189fe6060f1SDimitry Andric 
219081ad6265SDimitry Andric   if (!SIInstrInfo::isMFMA(*MI))
2191fe6060f1SDimitry Andric     return WaitStatesNeeded;
2192fe6060f1SDimitry Andric 
2193fe6060f1SDimitry Andric   const int VALUWritesExecWaitStates = 4;
2194fe6060f1SDimitry Andric   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2195fe6060f1SDimitry Andric     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2196fe6060f1SDimitry Andric                           VALUWritesExecWaitStates);
2197fe6060f1SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2198fe6060f1SDimitry Andric 
2199fe6060f1SDimitry Andric   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2200fe6060f1SDimitry Andric 
2201fe6060f1SDimitry Andric   // Loop for both DGEMM and S/HGEMM 2nd instruction.
2202fe6060f1SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
2203fe6060f1SDimitry Andric     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2204fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2205fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2206fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2207fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2208fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2209fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2210fe6060f1SDimitry Andric     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2211fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2212fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2213fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2214fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2215fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2216fe6060f1SDimitry Andric     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2217fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
221881ad6265SDimitry Andric     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2219fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2220fe6060f1SDimitry Andric 
2221fe6060f1SDimitry Andric     if (!Use.isReg())
2222fe6060f1SDimitry Andric       continue;
222304eeddc0SDimitry Andric     Register Reg = Use.getReg();
2224fe6060f1SDimitry Andric     bool FullReg;
2225fe6060f1SDimitry Andric     const MachineInstr *MI1;
2226fe6060f1SDimitry Andric 
222781ad6265SDimitry Andric     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2228fe6060f1SDimitry Andric                                this](const MachineInstr &MI) {
222981ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
2230fe6060f1SDimitry Andric         return false;
2231fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
2232fe6060f1SDimitry Andric       FullReg = (DstReg == Reg);
2233fe6060f1SDimitry Andric       MI1 = &MI;
2234fe6060f1SDimitry Andric       return TRI.regsOverlap(DstReg, Reg);
2235fe6060f1SDimitry Andric     };
2236fe6060f1SDimitry Andric 
2237fe6060f1SDimitry Andric     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2238fe6060f1SDimitry Andric       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2239fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2240fe6060f1SDimitry Andric 
22414824e7fdSDimitry Andric     int NumWaitStates =
22424824e7fdSDimitry Andric         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2243fe6060f1SDimitry Andric     if (NumWaitStates == std::numeric_limits<int>::max())
2244fe6060f1SDimitry Andric       continue;
2245fe6060f1SDimitry Andric 
224606c3fb27SDimitry Andric     int OpNo = Use.getOperandNo();
2247fe6060f1SDimitry Andric     unsigned Opc1 = MI1->getOpcode();
2248fe6060f1SDimitry Andric     int NeedWaitStates = 0;
2249fe6060f1SDimitry Andric     if (OpNo == SrcCIdx) {
225081ad6265SDimitry Andric       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2251fe6060f1SDimitry Andric         NeedWaitStates = 0;
2252fe6060f1SDimitry Andric       } else if (FullReg) {
2253fe6060f1SDimitry Andric         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254fe6060f1SDimitry Andric              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2255fe6060f1SDimitry Andric             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2256fe6060f1SDimitry Andric              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2257fe6060f1SDimitry Andric           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
225881ad6265SDimitry Andric         else if (ST.hasGFX940Insts() &&
225981ad6265SDimitry Andric                  TSchedModel.computeInstrLatency(MI1) == 2)
226081ad6265SDimitry Andric           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2261fe6060f1SDimitry Andric       } else {
2262fe6060f1SDimitry Andric         switch (Opc1) {
2263fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2264fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
226504eeddc0SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
226604eeddc0SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2267fe6060f1SDimitry Andric           if (!isXDL(ST, *MI))
2268fe6060f1SDimitry Andric             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2269fe6060f1SDimitry Andric           break;
2270fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2271fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2272fe6060f1SDimitry Andric           if (!isXDL(ST, *MI))
2273fe6060f1SDimitry Andric             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2274fe6060f1SDimitry Andric           break;
2275fe6060f1SDimitry Andric         default:
2276*0fca6ea1SDimitry Andric           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2277*0fca6ea1SDimitry Andric           if (ST.hasGFX940Insts()) {
2278*0fca6ea1SDimitry Andric             if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
227981ad6265SDimitry Andric               break;
2280*0fca6ea1SDimitry Andric 
2281*0fca6ea1SDimitry Andric             NeedWaitStates =
2282*0fca6ea1SDimitry Andric                 isXDL(ST, *MI1)
2283*0fca6ea1SDimitry Andric                     ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284*0fca6ea1SDimitry Andric                           NumPasses)
2285*0fca6ea1SDimitry Andric                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2286*0fca6ea1SDimitry Andric                           NumPasses);
2287*0fca6ea1SDimitry Andric             break;
2288*0fca6ea1SDimitry Andric           }
2289*0fca6ea1SDimitry Andric 
2290*0fca6ea1SDimitry Andric           switch (NumPasses) {
2291fe6060f1SDimitry Andric           case 2:
2292*0fca6ea1SDimitry Andric             NeedWaitStates =
2293*0fca6ea1SDimitry Andric                 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2294fe6060f1SDimitry Andric                              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2295fe6060f1SDimitry Andric             break;
2296fe6060f1SDimitry Andric           case 8:
2297*0fca6ea1SDimitry Andric             NeedWaitStates =
2298*0fca6ea1SDimitry Andric                 isDGEMM(Opc)
2299fe6060f1SDimitry Andric                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2300fe6060f1SDimitry Andric                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2301fe6060f1SDimitry Andric             break;
2302*0fca6ea1SDimitry Andric           case 16:
2303*0fca6ea1SDimitry Andric             NeedWaitStates =
2304*0fca6ea1SDimitry Andric                 isDGEMM(Opc)
2305fe6060f1SDimitry Andric                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2306fe6060f1SDimitry Andric                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2307*0fca6ea1SDimitry Andric             break;
2308*0fca6ea1SDimitry Andric           default:
2309*0fca6ea1SDimitry Andric             llvm_unreachable("unexpected number of passes");
2310fe6060f1SDimitry Andric           }
2311fe6060f1SDimitry Andric         }
2312fe6060f1SDimitry Andric       }
2313fe6060f1SDimitry Andric     } else {
2314fe6060f1SDimitry Andric       switch (Opc1) {
2315fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2316fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
231704eeddc0SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
231804eeddc0SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2319fe6060f1SDimitry Andric         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2320fe6060f1SDimitry Andric         break;
2321fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2322fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2323fe6060f1SDimitry Andric         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2324fe6060f1SDimitry Andric         break;
2325fe6060f1SDimitry Andric       default:
2326*0fca6ea1SDimitry Andric         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2327*0fca6ea1SDimitry Andric 
2328*0fca6ea1SDimitry Andric         if (ST.hasGFX940Insts()) {
2329*0fca6ea1SDimitry Andric           NeedWaitStates =
2330*0fca6ea1SDimitry Andric               isXDL(ST, *MI1)
2331*0fca6ea1SDimitry Andric                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2332*0fca6ea1SDimitry Andric                         NumPasses)
2333*0fca6ea1SDimitry Andric                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2334*0fca6ea1SDimitry Andric                         NumPasses);
2335*0fca6ea1SDimitry Andric           break;
2336*0fca6ea1SDimitry Andric         }
2337*0fca6ea1SDimitry Andric 
2338*0fca6ea1SDimitry Andric         switch (NumPasses) {
2339fe6060f1SDimitry Andric         case 2:
2340*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
234181ad6265SDimitry Andric           break;
234281ad6265SDimitry Andric         case 4:
2343*0fca6ea1SDimitry Andric           llvm_unreachable("unexpected number of passes for mfma");
2344fe6060f1SDimitry Andric         case 8:
2345*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2346fe6060f1SDimitry Andric           break;
2347*0fca6ea1SDimitry Andric         case 16:
2348fe6060f1SDimitry Andric         default:
2349*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2350fe6060f1SDimitry Andric         }
2351fe6060f1SDimitry Andric       }
2352fe6060f1SDimitry Andric     }
2353fe6060f1SDimitry Andric     if (WaitStatesNeeded >= NeedWaitStates)
2354fe6060f1SDimitry Andric       continue;
2355fe6060f1SDimitry Andric 
2356fe6060f1SDimitry Andric     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2357fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2358fe6060f1SDimitry Andric 
2359fe6060f1SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
2360fe6060f1SDimitry Andric       break;
2361fe6060f1SDimitry Andric   }
2362fe6060f1SDimitry Andric 
2363*0fca6ea1SDimitry Andric   // Pad neighboring MFMA with noops for better inter-wave performance.
2364*0fca6ea1SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2365*0fca6ea1SDimitry Andric 
2366fe6060f1SDimitry Andric   return WaitStatesNeeded;
2367fe6060f1SDimitry Andric }
2368fe6060f1SDimitry Andric 
23690b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2370349cc55cSDimitry Andric   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2371fe6060f1SDimitry Andric   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
23720b57cec5SDimitry Andric     return 0;
23730b57cec5SDimitry Andric 
23740b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
23750b57cec5SDimitry Andric 
2376fe6060f1SDimitry Andric   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2377fe6060f1SDimitry Andric     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
23780b57cec5SDimitry Andric   };
23790b57cec5SDimitry Andric 
23800b57cec5SDimitry Andric   for (const MachineOperand &Op : MI->explicit_uses()) {
23810b57cec5SDimitry Andric     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
23820b57cec5SDimitry Andric       continue;
23830b57cec5SDimitry Andric 
23848bcb0991SDimitry Andric     Register Reg = Op.getReg();
23850b57cec5SDimitry Andric 
23860b57cec5SDimitry Andric     const int AccVgprReadLdStWaitStates = 2;
2387e8d8bef9SDimitry Andric     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
23880b57cec5SDimitry Andric     const int MaxWaitStates = 2;
23890b57cec5SDimitry Andric 
23900b57cec5SDimitry Andric     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
23910b57cec5SDimitry Andric       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
23920b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
23930b57cec5SDimitry Andric 
23940b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
23950b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
23960b57cec5SDimitry Andric 
2397fe6060f1SDimitry Andric     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2398fe6060f1SDimitry Andric       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2399fe6060f1SDimitry Andric           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
24000b57cec5SDimitry Andric         return false;
2401fe6060f1SDimitry Andric       auto IsVALUFn = [](const MachineInstr &MI) {
2402fe6060f1SDimitry Andric         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
24030b57cec5SDimitry Andric       };
24040b57cec5SDimitry Andric       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
24050b57cec5SDimitry Andric              std::numeric_limits<int>::max();
24060b57cec5SDimitry Andric     };
24070b57cec5SDimitry Andric 
2408e8d8bef9SDimitry Andric     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2409e8d8bef9SDimitry Andric       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
24100b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
24110b57cec5SDimitry Andric   }
24120b57cec5SDimitry Andric 
24130b57cec5SDimitry Andric   return WaitStatesNeeded;
24140b57cec5SDimitry Andric }
2415e8d8bef9SDimitry Andric 
/// GFX940 wait-state count for the case its name describes: an N-pass SMFMA
/// VGPR write followed by a VALU write (WAW).
///
/// \param NumPasses pass count of the MFMA.
/// \returns NumPasses + 2.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2423*0fca6ea1SDimitry Andric 
/// GFX940 wait-state count for the case its name describes: an N-pass XDL
/// MFMA VGPR write followed by a VALU write (WAW).
///
/// \param NumPasses pass count of the MFMA.
/// \returns NumPasses + 3.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}
2431*0fca6ea1SDimitry Andric 
/// GFX940 wait states used by checkMAIVALUHazards() when a non-DGEMM XDL MFMA
/// wrote a register that a later VALU, memory or export instruction reads.
///
/// \param NumPasses pass count of the MFMA (the caller passes its
///        scheduling-model latency).
/// \returns NumPasses + 3.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}
2439*0fca6ea1SDimitry Andric 
/// GFX940 wait states used by checkMAIVALUHazards() when a non-DGEMM, non-XDL
/// MFMA wrote a register that a later VALU, memory or export instruction
/// reads.
///
/// \param NumPasses pass count of the MFMA (the caller passes its
///        scheduling-model latency).
/// \returns NumPasses + 2.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2447*0fca6ea1SDimitry Andric 
2448fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2449fe6060f1SDimitry Andric   if (!ST.hasGFX90AInsts())
2450fe6060f1SDimitry Andric     return 0;
2451fe6060f1SDimitry Andric 
2452fe6060f1SDimitry Andric   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2453fe6060f1SDimitry Andric     return isDGEMM(MI.getOpcode());
2454fe6060f1SDimitry Andric   };
2455fe6060f1SDimitry Andric 
2456fe6060f1SDimitry Andric   // This is checked in checkMAIHazards90A()
245781ad6265SDimitry Andric   if (SIInstrInfo::isMFMA(*MI))
2458fe6060f1SDimitry Andric     return 0;
2459fe6060f1SDimitry Andric 
2460bdd1243dSDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
2461bdd1243dSDimitry Andric 
2462fe6060f1SDimitry Andric   int WaitStatesNeeded = 0;
2463fe6060f1SDimitry Andric 
2464bdd1243dSDimitry Andric   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2465fe6060f1SDimitry Andric                SIInstrInfo::isFLAT(*MI) ||
2466bdd1243dSDimitry Andric                SIInstrInfo::isDS(*MI);
2467bdd1243dSDimitry Andric   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2468fe6060f1SDimitry Andric   bool IsVALU = SIInstrInfo::isVALU(*MI);
2469fe6060f1SDimitry Andric 
2470fe6060f1SDimitry Andric   const MachineInstr *MFMA = nullptr;
2471fe6060f1SDimitry Andric   unsigned Reg;
247281ad6265SDimitry Andric   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
247381ad6265SDimitry Andric     if (!SIInstrInfo::isMFMA(MI) ||
247481ad6265SDimitry Andric         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2475fe6060f1SDimitry Andric       return false;
2476fe6060f1SDimitry Andric     MFMA = &MI;
2477fe6060f1SDimitry Andric     return true;
2478fe6060f1SDimitry Andric   };
2479fe6060f1SDimitry Andric 
2480fe6060f1SDimitry Andric   const MachineInstr *DOT = nullptr;
2481fe6060f1SDimitry Andric   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2482fe6060f1SDimitry Andric     if (!SIInstrInfo::isDOT(MI) ||
2483fe6060f1SDimitry Andric         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2484fe6060f1SDimitry Andric       return false;
2485fe6060f1SDimitry Andric     DOT = &MI;
2486fe6060f1SDimitry Andric     return true;
2487fe6060f1SDimitry Andric   };
2488fe6060f1SDimitry Andric 
2489bdd1243dSDimitry Andric   bool DGEMMAfterVALUWrite = false;
2490bdd1243dSDimitry Andric   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2491bdd1243dSDimitry Andric     // Found DGEMM on reverse traversal to def.
2492bdd1243dSDimitry Andric     if (isDGEMM(MI.getOpcode()))
2493bdd1243dSDimitry Andric       DGEMMAfterVALUWrite = true;
2494bdd1243dSDimitry Andric 
2495bdd1243dSDimitry Andric     // Only hazard if register is defined by a VALU and a DGEMM is found after
2496bdd1243dSDimitry Andric     // the def.
2497bdd1243dSDimitry Andric     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2498bdd1243dSDimitry Andric       return false;
2499bdd1243dSDimitry Andric 
2500bdd1243dSDimitry Andric     return true;
2501bdd1243dSDimitry Andric   };
2502bdd1243dSDimitry Andric 
2503fe6060f1SDimitry Andric   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2504fe6060f1SDimitry Andric                                            AMDGPU::OpName::src2);
2505fe6060f1SDimitry Andric 
2506fe6060f1SDimitry Andric   if (IsMemOrExport || IsVALU) {
2507fe6060f1SDimitry Andric     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2508fe6060f1SDimitry Andric     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2509fe6060f1SDimitry Andric     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2510fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2511fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2512fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2513fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2514fe6060f1SDimitry Andric     const int DotWriteSameDotReadSrcAB = 3;
2515fe6060f1SDimitry Andric     const int DotWriteDifferentVALURead = 3;
2516bdd1243dSDimitry Andric     const int DMFMABetweenVALUWriteVMEMRead = 2;
2517fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2518fe6060f1SDimitry Andric 
2519fe6060f1SDimitry Andric     for (const MachineOperand &Use : MI->explicit_uses()) {
2520fe6060f1SDimitry Andric       if (!Use.isReg())
2521fe6060f1SDimitry Andric         continue;
2522fe6060f1SDimitry Andric       Reg = Use.getReg();
2523fe6060f1SDimitry Andric 
2524fe6060f1SDimitry Andric       DOT = nullptr;
2525fe6060f1SDimitry Andric       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2526fe6060f1SDimitry Andric                                                      MaxWaitStates);
2527fe6060f1SDimitry Andric       if (DOT) {
2528fe6060f1SDimitry Andric         int NeedWaitStates = 0;
2529fe6060f1SDimitry Andric         if (DOT->getOpcode() == MI->getOpcode()) {
2530fe6060f1SDimitry Andric           if (&Use - &MI->getOperand(0) != SrcCIdx)
2531fe6060f1SDimitry Andric             NeedWaitStates = DotWriteSameDotReadSrcAB;
2532fe6060f1SDimitry Andric         } else {
2533fe6060f1SDimitry Andric           NeedWaitStates = DotWriteDifferentVALURead;
2534fe6060f1SDimitry Andric         }
2535fe6060f1SDimitry Andric 
2536fe6060f1SDimitry Andric         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2537fe6060f1SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2538fe6060f1SDimitry Andric       }
2539fe6060f1SDimitry Andric 
2540bdd1243dSDimitry Andric       // Workaround for HW data hazard bug observed only in GFX90A. When there
2541bdd1243dSDimitry Andric       // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2542bdd1243dSDimitry Andric       // causes the SQ to incorrectly not insert two wait states between the two
2543bdd1243dSDimitry Andric       // instructions needed to avoid data hazard.
2544bdd1243dSDimitry Andric       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2545bdd1243dSDimitry Andric         DGEMMAfterVALUWrite = false;
2546bdd1243dSDimitry Andric         if (TRI.isVectorRegister(MRI, Reg)) {
2547bdd1243dSDimitry Andric           int WaitStatesNeededForUse =
2548bdd1243dSDimitry Andric                 DMFMABetweenVALUWriteVMEMRead -
2549bdd1243dSDimitry Andric                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2550bdd1243dSDimitry Andric                                       DMFMABetweenVALUWriteVMEMRead);
2551bdd1243dSDimitry Andric 
2552bdd1243dSDimitry Andric           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553bdd1243dSDimitry Andric         }
2554bdd1243dSDimitry Andric       }
2555bdd1243dSDimitry Andric 
2556fe6060f1SDimitry Andric       MFMA = nullptr;
25574824e7fdSDimitry Andric       WaitStatesSinceDef =
25584824e7fdSDimitry Andric           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2559fe6060f1SDimitry Andric       if (!MFMA)
2560fe6060f1SDimitry Andric         continue;
2561fe6060f1SDimitry Andric 
2562fe6060f1SDimitry Andric       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2563*0fca6ea1SDimitry Andric       int NumPasses = HazardDefLatency;
2564fe6060f1SDimitry Andric       int NeedWaitStates = MaxWaitStates;
2565*0fca6ea1SDimitry Andric 
2566*0fca6ea1SDimitry Andric       if (isDGEMM(MFMA->getOpcode())) {
2567fe6060f1SDimitry Andric         switch (HazardDefLatency) {
2568fe6060f1SDimitry Andric         case 4:
2569*0fca6ea1SDimitry Andric           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2570*0fca6ea1SDimitry Andric                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2571fe6060f1SDimitry Andric           break;
2572fe6060f1SDimitry Andric         case 8:
2573*0fca6ea1SDimitry Andric         case 16:
2574*0fca6ea1SDimitry Andric           NeedWaitStates = IsMemOrExport
2575*0fca6ea1SDimitry Andric                                ? DMFMA16x16WriteVgprMemExpReadWaitStates
2576*0fca6ea1SDimitry Andric                                : DMFMA16x16WriteVgprVALUReadWaitStates;
2577fe6060f1SDimitry Andric           break;
2578fe6060f1SDimitry Andric         default:
2579*0fca6ea1SDimitry Andric           llvm_unreachable("unexpected dgemm");
2580*0fca6ea1SDimitry Andric         }
2581*0fca6ea1SDimitry Andric       } else if (ST.hasGFX940Insts()) {
2582fe6060f1SDimitry Andric         NeedWaitStates =
2583*0fca6ea1SDimitry Andric             isXDL(ST, *MFMA)
2584*0fca6ea1SDimitry Andric                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2585*0fca6ea1SDimitry Andric                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2586*0fca6ea1SDimitry Andric                       NumPasses);
2587*0fca6ea1SDimitry Andric       } else {
2588*0fca6ea1SDimitry Andric         switch (HazardDefLatency) {
2589*0fca6ea1SDimitry Andric         case 2:
2590*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2591fe6060f1SDimitry Andric           break;
2592*0fca6ea1SDimitry Andric         case 8:
2593*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2594*0fca6ea1SDimitry Andric           break;
2595*0fca6ea1SDimitry Andric         case 16:
2596*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2597*0fca6ea1SDimitry Andric           break;
2598*0fca6ea1SDimitry Andric         default:
2599*0fca6ea1SDimitry Andric           llvm_unreachable("unexpected number of passes for mfma");
2600*0fca6ea1SDimitry Andric         }
2601fe6060f1SDimitry Andric       }
2602fe6060f1SDimitry Andric 
2603fe6060f1SDimitry Andric       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2604fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2605fe6060f1SDimitry Andric 
2606fe6060f1SDimitry Andric       if (WaitStatesNeeded == MaxWaitStates)
2607fe6060f1SDimitry Andric         break;
2608fe6060f1SDimitry Andric     }
2609fe6060f1SDimitry Andric   }
2610fe6060f1SDimitry Andric 
2611fe6060f1SDimitry Andric   unsigned Opc = MI->getOpcode();
2612fe6060f1SDimitry Andric   const int DMFMAToFMA64WaitStates = 2;
2613fe6060f1SDimitry Andric   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2614fe6060f1SDimitry Andric        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2615fe6060f1SDimitry Andric        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2616fe6060f1SDimitry Andric       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2617fe6060f1SDimitry Andric     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2618fe6060f1SDimitry Andric       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2619fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2620fe6060f1SDimitry Andric   }
2621fe6060f1SDimitry Andric 
2622fe6060f1SDimitry Andric   if (!IsVALU && !IsMemOrExport)
2623fe6060f1SDimitry Andric     return WaitStatesNeeded;
2624fe6060f1SDimitry Andric 
2625fe6060f1SDimitry Andric   for (const MachineOperand &Def : MI->defs()) {
2626fe6060f1SDimitry Andric     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2627fe6060f1SDimitry Andric     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2628fe6060f1SDimitry Andric     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2629fe6060f1SDimitry Andric     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
263081ad6265SDimitry Andric     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2631fe6060f1SDimitry Andric     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2632fe6060f1SDimitry Andric     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2633fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2634fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2635fe6060f1SDimitry Andric     const int DotWriteDifferentVALUWrite = 3;
2636fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2637fe6060f1SDimitry Andric     const int MaxWarWaitStates = 15;
2638fe6060f1SDimitry Andric 
2639fe6060f1SDimitry Andric     Reg = Def.getReg();
2640fe6060f1SDimitry Andric 
2641fe6060f1SDimitry Andric     DOT = nullptr;
2642fe6060f1SDimitry Andric     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2643fe6060f1SDimitry Andric                                                    MaxWaitStates);
2644fe6060f1SDimitry Andric     if (DOT && DOT->getOpcode() != MI->getOpcode())
2645fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2646fe6060f1SDimitry Andric                                                     WaitStatesSinceDef);
2647fe6060f1SDimitry Andric 
2648fe6060f1SDimitry Andric     MFMA = nullptr;
26494824e7fdSDimitry Andric     WaitStatesSinceDef =
26504824e7fdSDimitry Andric         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2651fe6060f1SDimitry Andric     if (MFMA) {
2652fe6060f1SDimitry Andric       int NeedWaitStates = MaxWaitStates;
2653*0fca6ea1SDimitry Andric       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2654*0fca6ea1SDimitry Andric 
2655*0fca6ea1SDimitry Andric       if (isDGEMM(MFMA->getOpcode())) {
2656*0fca6ea1SDimitry Andric         switch (NumPasses) {
2657fe6060f1SDimitry Andric         case 4:
2658*0fca6ea1SDimitry Andric           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2659fe6060f1SDimitry Andric           break;
2660fe6060f1SDimitry Andric         case 8:
2661*0fca6ea1SDimitry Andric         case 16:
2662*0fca6ea1SDimitry Andric           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2663fe6060f1SDimitry Andric           break;
2664fe6060f1SDimitry Andric         default:
2665*0fca6ea1SDimitry Andric           llvm_unreachable("unexpected number of cycles for dgemm");
2666*0fca6ea1SDimitry Andric         }
2667*0fca6ea1SDimitry Andric       } else if (ST.hasGFX940Insts()) {
2668*0fca6ea1SDimitry Andric         NeedWaitStates =
2669*0fca6ea1SDimitry Andric             isXDL(ST, *MFMA)
2670*0fca6ea1SDimitry Andric                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2671*0fca6ea1SDimitry Andric                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2672*0fca6ea1SDimitry Andric       } else {
2673*0fca6ea1SDimitry Andric         switch (NumPasses) {
2674*0fca6ea1SDimitry Andric         case 2:
2675*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2676fe6060f1SDimitry Andric           break;
2677*0fca6ea1SDimitry Andric         case 8:
2678*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2679*0fca6ea1SDimitry Andric           break;
2680*0fca6ea1SDimitry Andric         case 16:
2681*0fca6ea1SDimitry Andric           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2682*0fca6ea1SDimitry Andric           break;
2683*0fca6ea1SDimitry Andric         default:
2684*0fca6ea1SDimitry Andric           llvm_unreachable("Unexpected number of passes for mfma");
2685*0fca6ea1SDimitry Andric         }
2686fe6060f1SDimitry Andric       }
2687fe6060f1SDimitry Andric 
2688fe6060f1SDimitry Andric       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2689fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2690fe6060f1SDimitry Andric 
2691fe6060f1SDimitry Andric       if (WaitStatesNeeded == MaxWaitStates)
2692fe6060f1SDimitry Andric         break;
2693fe6060f1SDimitry Andric     }
2694fe6060f1SDimitry Andric 
269581ad6265SDimitry Andric     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
269681ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2697fe6060f1SDimitry Andric           !MI.readsRegister(Reg, &TRI))
2698fe6060f1SDimitry Andric         return false;
2699fe6060f1SDimitry Andric 
270081ad6265SDimitry Andric       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
270181ad6265SDimitry Andric         return false;
270281ad6265SDimitry Andric 
2703fe6060f1SDimitry Andric       const MachineOperand *SrcC =
2704fe6060f1SDimitry Andric           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2705fe6060f1SDimitry Andric       assert(SrcC);
2706fe6060f1SDimitry Andric       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2707fe6060f1SDimitry Andric         return false;
2708fe6060f1SDimitry Andric 
2709fe6060f1SDimitry Andric       MFMA = &MI;
2710fe6060f1SDimitry Andric       return true;
2711fe6060f1SDimitry Andric     };
2712fe6060f1SDimitry Andric 
2713fe6060f1SDimitry Andric     MFMA = nullptr;
2714fe6060f1SDimitry Andric     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2715fe6060f1SDimitry Andric                                                 MaxWarWaitStates);
2716fe6060f1SDimitry Andric     if (!MFMA)
2717fe6060f1SDimitry Andric       continue;
2718fe6060f1SDimitry Andric 
2719fe6060f1SDimitry Andric     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2720fe6060f1SDimitry Andric     int NeedWaitStates = MaxWaitStates;
2721fe6060f1SDimitry Andric     switch (HazardDefLatency) {
2722fe6060f1SDimitry Andric     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2723fe6060f1SDimitry Andric              break;
272481ad6265SDimitry Andric     case 4:  assert(ST.hasGFX940Insts());
272581ad6265SDimitry Andric              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
272681ad6265SDimitry Andric              break;
2727fe6060f1SDimitry Andric     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2728fe6060f1SDimitry Andric              break;
2729bdd1243dSDimitry Andric     case 16: [[fallthrough]];
2730fe6060f1SDimitry Andric     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2731fe6060f1SDimitry Andric              break;
2732fe6060f1SDimitry Andric     }
2733fe6060f1SDimitry Andric 
2734fe6060f1SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2735fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2736fe6060f1SDimitry Andric   }
2737fe6060f1SDimitry Andric 
2738fe6060f1SDimitry Andric   return WaitStatesNeeded;
2739fe6060f1SDimitry Andric }
2740fe6060f1SDimitry Andric 
2741e8d8bef9SDimitry Andric bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2742e8d8bef9SDimitry Andric   if (!SU->isInstr())
2743e8d8bef9SDimitry Andric     return false;
2744e8d8bef9SDimitry Andric 
2745fe6060f1SDimitry Andric   const MachineInstr *MAI = nullptr;
274681ad6265SDimitry Andric 
2747fe6060f1SDimitry Andric   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2748e8d8bef9SDimitry Andric     MAI = nullptr;
274981ad6265SDimitry Andric     if (SIInstrInfo::isMFMA(MI))
2750fe6060f1SDimitry Andric       MAI = &MI;
2751e8d8bef9SDimitry Andric     return MAI != nullptr;
2752e8d8bef9SDimitry Andric   };
2753e8d8bef9SDimitry Andric 
2754e8d8bef9SDimitry Andric   MachineInstr *MI = SU->getInstr();
2755fe6060f1SDimitry Andric   if (IsMFMAFn(*MI)) {
2756e8d8bef9SDimitry Andric     int W = getWaitStatesSince(IsMFMAFn, 16);
2757e8d8bef9SDimitry Andric     if (MAI)
2758e8d8bef9SDimitry Andric       return W < (int)TSchedModel.computeInstrLatency(MAI);
2759e8d8bef9SDimitry Andric   }
2760e8d8bef9SDimitry Andric 
2761e8d8bef9SDimitry Andric   return false;
2762e8d8bef9SDimitry Andric }
2763bdd1243dSDimitry Andric 
2764bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2765bdd1243dSDimitry Andric   if (!ST.hasVALUMaskWriteHazard())
2766bdd1243dSDimitry Andric     return false;
27677a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
27687a6dacacSDimitry Andric 
27697a6dacacSDimitry Andric   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2770bdd1243dSDimitry Andric     return false;
2771bdd1243dSDimitry Andric 
2772bdd1243dSDimitry Andric   // The hazard sequence is three instructions:
2773bdd1243dSDimitry Andric   //   1. VALU reads SGPR as mask
2774bdd1243dSDimitry Andric   //   2. SALU writes SGPR
2775bdd1243dSDimitry Andric   //   3. SALU reads SGPR
2776bdd1243dSDimitry Andric   // The hazard can expire if the distance between 2 and 3 is sufficient.
2777bdd1243dSDimitry Andric   // In practice this happens <10% of the time, hence this always assumes
2778bdd1243dSDimitry Andric   // the hazard exists if 1 and 2 are present to avoid searching.
2779bdd1243dSDimitry Andric 
2780bdd1243dSDimitry Andric   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2781bdd1243dSDimitry Andric   if (!SDSTOp || !SDSTOp->isReg())
2782bdd1243dSDimitry Andric     return false;
2783bdd1243dSDimitry Andric 
2784bdd1243dSDimitry Andric   const Register HazardReg = SDSTOp->getReg();
2785bdd1243dSDimitry Andric   if (HazardReg == AMDGPU::EXEC ||
2786bdd1243dSDimitry Andric       HazardReg == AMDGPU::EXEC_LO ||
2787bdd1243dSDimitry Andric       HazardReg == AMDGPU::EXEC_HI ||
2788bdd1243dSDimitry Andric       HazardReg == AMDGPU::M0)
2789bdd1243dSDimitry Andric     return false;
2790bdd1243dSDimitry Andric 
2791bdd1243dSDimitry Andric   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2792bdd1243dSDimitry Andric     switch (I.getOpcode()) {
2793bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_e32:
2794bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_dpp:
2795bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_e32:
2796bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_dpp:
2797bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_e32:
2798bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_dpp:
2799bdd1243dSDimitry Andric     case AMDGPU::V_DIV_FMAS_F32_e64:
2800bdd1243dSDimitry Andric     case AMDGPU::V_DIV_FMAS_F64_e64:
2801bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_e32:
2802bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_dpp:
2803bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_e32:
2804bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_dpp:
2805bdd1243dSDimitry Andric       // These implicitly read VCC as mask source.
2806bdd1243dSDimitry Andric       return HazardReg == AMDGPU::VCC ||
2807bdd1243dSDimitry Andric              HazardReg == AMDGPU::VCC_LO ||
2808bdd1243dSDimitry Andric              HazardReg == AMDGPU::VCC_HI;
2809bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_e64:
2810bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_e64_dpp:
2811bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_e64:
2812bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2813bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_e64:
2814bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2815bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_e64:
2816bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_e64_dpp:
2817bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_e64:
2818bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2819bdd1243dSDimitry Andric       // Only check mask register overlaps.
2820bdd1243dSDimitry Andric       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2821bdd1243dSDimitry Andric       assert(SSRCOp);
2822bdd1243dSDimitry Andric       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2823bdd1243dSDimitry Andric     }
2824bdd1243dSDimitry Andric     default:
2825bdd1243dSDimitry Andric       return false;
2826bdd1243dSDimitry Andric     }
2827bdd1243dSDimitry Andric   };
2828bdd1243dSDimitry Andric 
2829bdd1243dSDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
2830bdd1243dSDimitry Andric   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2831bdd1243dSDimitry Andric     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2832bdd1243dSDimitry Andric     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
283306c3fb27SDimitry Andric         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2834bdd1243dSDimitry Andric       return true;
2835bdd1243dSDimitry Andric 
2836bdd1243dSDimitry Andric     // VALU access to any SGPR or literal constant other than HazardReg
2837bdd1243dSDimitry Andric     // mitigates hazard. No need to check HazardReg here as this will
2838bdd1243dSDimitry Andric     // only be called when !IsHazardFn.
2839bdd1243dSDimitry Andric     if (!SIInstrInfo::isVALU(I))
2840bdd1243dSDimitry Andric       return false;
2841bdd1243dSDimitry Andric     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2842bdd1243dSDimitry Andric       const MachineOperand &Op = I.getOperand(OpNo);
2843bdd1243dSDimitry Andric       if (Op.isReg()) {
2844bdd1243dSDimitry Andric         Register OpReg = Op.getReg();
2845bdd1243dSDimitry Andric         // Only consider uses
2846bdd1243dSDimitry Andric         if (!Op.isUse())
2847bdd1243dSDimitry Andric           continue;
2848bdd1243dSDimitry Andric         // Ignore EXEC
2849bdd1243dSDimitry Andric         if (OpReg == AMDGPU::EXEC ||
2850bdd1243dSDimitry Andric             OpReg == AMDGPU::EXEC_LO ||
2851bdd1243dSDimitry Andric             OpReg == AMDGPU::EXEC_HI)
2852bdd1243dSDimitry Andric           continue;
2853bdd1243dSDimitry Andric         // Ignore all implicit uses except VCC
2854bdd1243dSDimitry Andric         if (Op.isImplicit()) {
2855bdd1243dSDimitry Andric           if (OpReg == AMDGPU::VCC ||
2856bdd1243dSDimitry Andric               OpReg == AMDGPU::VCC_LO ||
2857bdd1243dSDimitry Andric               OpReg == AMDGPU::VCC_HI)
2858bdd1243dSDimitry Andric             return true;
2859bdd1243dSDimitry Andric           continue;
2860bdd1243dSDimitry Andric         }
2861bdd1243dSDimitry Andric         if (TRI.isSGPRReg(MRI, OpReg))
2862bdd1243dSDimitry Andric           return true;
2863bdd1243dSDimitry Andric       } else {
2864bdd1243dSDimitry Andric         const MCInstrDesc &InstDesc = I.getDesc();
2865bdd1243dSDimitry Andric         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2866bdd1243dSDimitry Andric         if (!TII.isInlineConstant(Op, OpInfo))
2867bdd1243dSDimitry Andric           return true;
2868bdd1243dSDimitry Andric       }
2869bdd1243dSDimitry Andric     }
2870bdd1243dSDimitry Andric     return false;
2871bdd1243dSDimitry Andric   };
2872bdd1243dSDimitry Andric 
2873bdd1243dSDimitry Andric   // Check for hazard
2874bdd1243dSDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2875bdd1243dSDimitry Andric       std::numeric_limits<int>::max())
2876bdd1243dSDimitry Andric     return false;
2877bdd1243dSDimitry Andric 
2878bdd1243dSDimitry Andric   auto NextMI = std::next(MI->getIterator());
2879bdd1243dSDimitry Andric 
2880bdd1243dSDimitry Andric   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2881bdd1243dSDimitry Andric   BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2882bdd1243dSDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
288306c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2884bdd1243dSDimitry Andric 
2885bdd1243dSDimitry Andric   // SALU write may be s_getpc in a bundle.
2886bdd1243dSDimitry Andric   if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2887bdd1243dSDimitry Andric     // Update offsets of any references in the bundle.
2888bdd1243dSDimitry Andric     while (NextMI != MI->getParent()->end() &&
2889bdd1243dSDimitry Andric            NextMI->isBundledWithPred()) {
2890bdd1243dSDimitry Andric       for (auto &Operand : NextMI->operands()) {
2891bdd1243dSDimitry Andric         if (Operand.isGlobal())
2892bdd1243dSDimitry Andric           Operand.setOffset(Operand.getOffset() + 4);
2893bdd1243dSDimitry Andric       }
2894bdd1243dSDimitry Andric       NextMI++;
2895bdd1243dSDimitry Andric     }
2896bdd1243dSDimitry Andric   }
2897bdd1243dSDimitry Andric 
2898bdd1243dSDimitry Andric   return true;
2899bdd1243dSDimitry Andric }
2900*0fca6ea1SDimitry Andric 
2901*0fca6ea1SDimitry Andric static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
2902*0fca6ea1SDimitry Andric                                const SIInstrInfo &TII) {
2903*0fca6ea1SDimitry Andric   MachineBasicBlock &EntryMBB = MF->front();
2904*0fca6ea1SDimitry Andric   if (EntryMBB.begin() != EntryMBB.end()) {
2905*0fca6ea1SDimitry Andric     auto &EntryMI = *EntryMBB.begin();
2906*0fca6ea1SDimitry Andric     if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
2907*0fca6ea1SDimitry Andric         EntryMI.getOperand(0).getImm() >= Priority)
2908*0fca6ea1SDimitry Andric       return false;
2909*0fca6ea1SDimitry Andric   }
2910*0fca6ea1SDimitry Andric 
2911*0fca6ea1SDimitry Andric   BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
2912*0fca6ea1SDimitry Andric       .addImm(Priority);
2913*0fca6ea1SDimitry Andric   return true;
2914*0fca6ea1SDimitry Andric }
2915*0fca6ea1SDimitry Andric 
2916*0fca6ea1SDimitry Andric bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
2917*0fca6ea1SDimitry Andric   if (!ST.hasRequiredExportPriority())
2918*0fca6ea1SDimitry Andric     return false;
2919*0fca6ea1SDimitry Andric 
2920*0fca6ea1SDimitry Andric   // Assume the following shader types will never have exports,
2921*0fca6ea1SDimitry Andric   // and avoid adding or adjusting S_SETPRIO.
2922*0fca6ea1SDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
2923*0fca6ea1SDimitry Andric   MachineFunction *MF = MBB->getParent();
2924*0fca6ea1SDimitry Andric   auto CC = MF->getFunction().getCallingConv();
2925*0fca6ea1SDimitry Andric   switch (CC) {
2926*0fca6ea1SDimitry Andric   case CallingConv::AMDGPU_CS:
2927*0fca6ea1SDimitry Andric   case CallingConv::AMDGPU_CS_Chain:
2928*0fca6ea1SDimitry Andric   case CallingConv::AMDGPU_CS_ChainPreserve:
2929*0fca6ea1SDimitry Andric   case CallingConv::AMDGPU_KERNEL:
2930*0fca6ea1SDimitry Andric     return false;
2931*0fca6ea1SDimitry Andric   default:
2932*0fca6ea1SDimitry Andric     break;
2933*0fca6ea1SDimitry Andric   }
2934*0fca6ea1SDimitry Andric 
2935*0fca6ea1SDimitry Andric   const int MaxPriority = 3;
2936*0fca6ea1SDimitry Andric   const int NormalPriority = 2;
2937*0fca6ea1SDimitry Andric   const int PostExportPriority = 0;
2938*0fca6ea1SDimitry Andric 
2939*0fca6ea1SDimitry Andric   auto It = MI->getIterator();
2940*0fca6ea1SDimitry Andric   switch (MI->getOpcode()) {
2941*0fca6ea1SDimitry Andric   case AMDGPU::S_ENDPGM:
2942*0fca6ea1SDimitry Andric   case AMDGPU::S_ENDPGM_SAVED:
2943*0fca6ea1SDimitry Andric   case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
2944*0fca6ea1SDimitry Andric   case AMDGPU::SI_RETURN_TO_EPILOG:
2945*0fca6ea1SDimitry Andric     // Ensure shader with calls raises priority at entry.
2946*0fca6ea1SDimitry Andric     // This ensures correct priority if exports exist in callee.
2947*0fca6ea1SDimitry Andric     if (MF->getFrameInfo().hasCalls())
2948*0fca6ea1SDimitry Andric       return ensureEntrySetPrio(MF, NormalPriority, TII);
2949*0fca6ea1SDimitry Andric     return false;
2950*0fca6ea1SDimitry Andric   case AMDGPU::S_SETPRIO: {
2951*0fca6ea1SDimitry Andric     // Raise minimum priority unless in workaround.
2952*0fca6ea1SDimitry Andric     auto &PrioOp = MI->getOperand(0);
2953*0fca6ea1SDimitry Andric     int Prio = PrioOp.getImm();
2954*0fca6ea1SDimitry Andric     bool InWA = (Prio == PostExportPriority) &&
2955*0fca6ea1SDimitry Andric                 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
2956*0fca6ea1SDimitry Andric     if (InWA || Prio >= NormalPriority)
2957*0fca6ea1SDimitry Andric       return false;
2958*0fca6ea1SDimitry Andric     PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
2959*0fca6ea1SDimitry Andric     return true;
2960*0fca6ea1SDimitry Andric   }
2961*0fca6ea1SDimitry Andric   default:
2962*0fca6ea1SDimitry Andric     if (!TII.isEXP(*MI))
2963*0fca6ea1SDimitry Andric       return false;
2964*0fca6ea1SDimitry Andric     break;
2965*0fca6ea1SDimitry Andric   }
2966*0fca6ea1SDimitry Andric 
2967*0fca6ea1SDimitry Andric   // Check entry priority at each export (as there will only be a few).
2968*0fca6ea1SDimitry Andric   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
2969*0fca6ea1SDimitry Andric   bool Changed = false;
2970*0fca6ea1SDimitry Andric   if (CC != CallingConv::AMDGPU_Gfx)
2971*0fca6ea1SDimitry Andric     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
2972*0fca6ea1SDimitry Andric 
2973*0fca6ea1SDimitry Andric   auto NextMI = std::next(It);
2974*0fca6ea1SDimitry Andric   bool EndOfShader = false;
2975*0fca6ea1SDimitry Andric   if (NextMI != MBB->end()) {
2976*0fca6ea1SDimitry Andric     // Only need WA at end of sequence of exports.
2977*0fca6ea1SDimitry Andric     if (TII.isEXP(*NextMI))
2978*0fca6ea1SDimitry Andric       return Changed;
2979*0fca6ea1SDimitry Andric     // Assume appropriate S_SETPRIO after export means WA already applied.
2980*0fca6ea1SDimitry Andric     if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
2981*0fca6ea1SDimitry Andric         NextMI->getOperand(0).getImm() == PostExportPriority)
2982*0fca6ea1SDimitry Andric       return Changed;
2983*0fca6ea1SDimitry Andric     EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
2984*0fca6ea1SDimitry Andric   }
2985*0fca6ea1SDimitry Andric 
2986*0fca6ea1SDimitry Andric   const DebugLoc &DL = MI->getDebugLoc();
2987*0fca6ea1SDimitry Andric 
2988*0fca6ea1SDimitry Andric   // Lower priority.
2989*0fca6ea1SDimitry Andric   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
2990*0fca6ea1SDimitry Andric       .addImm(PostExportPriority);
2991*0fca6ea1SDimitry Andric 
2992*0fca6ea1SDimitry Andric   if (!EndOfShader) {
2993*0fca6ea1SDimitry Andric     // Wait for exports to complete.
2994*0fca6ea1SDimitry Andric     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
2995*0fca6ea1SDimitry Andric         .addReg(AMDGPU::SGPR_NULL)
2996*0fca6ea1SDimitry Andric         .addImm(0);
2997*0fca6ea1SDimitry Andric   }
2998*0fca6ea1SDimitry Andric 
2999*0fca6ea1SDimitry Andric   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3000*0fca6ea1SDimitry Andric   BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3001*0fca6ea1SDimitry Andric 
3002*0fca6ea1SDimitry Andric   if (!EndOfShader) {
3003*0fca6ea1SDimitry Andric     // Return to normal (higher) priority.
3004*0fca6ea1SDimitry Andric     BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3005*0fca6ea1SDimitry Andric         .addImm(NormalPriority);
3006*0fca6ea1SDimitry Andric   }
3007*0fca6ea1SDimitry Andric 
3008*0fca6ea1SDimitry Andric   return true;
3009*0fca6ea1SDimitry Andric }
3010