17330f729Sjoerg //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
27330f729Sjoerg //
37330f729Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
47330f729Sjoerg // See https://llvm.org/LICENSE.txt for license information.
57330f729Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
67330f729Sjoerg //
77330f729Sjoerg //===----------------------------------------------------------------------===//
87330f729Sjoerg //
97330f729Sjoerg // This file implements hazard recognizers for scheduling on GCN processors.
107330f729Sjoerg //
117330f729Sjoerg //===----------------------------------------------------------------------===//
127330f729Sjoerg
137330f729Sjoerg #include "GCNHazardRecognizer.h"
14*82d56013Sjoerg #include "GCNSubtarget.h"
157330f729Sjoerg #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
167330f729Sjoerg #include "llvm/CodeGen/MachineFunction.h"
177330f729Sjoerg #include "llvm/CodeGen/ScheduleDAG.h"
18*82d56013Sjoerg #include "llvm/Support/TargetParser.h"
197330f729Sjoerg
207330f729Sjoerg using namespace llvm;
217330f729Sjoerg
227330f729Sjoerg //===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
247330f729Sjoerg //===----------------------------------------------------------------------===//
257330f729Sjoerg
// Construct the hazard recognizer for \p MF. Caches the subtarget,
// instruction info and register info, and sizes the clause-tracking bit
// vectors by the number of register units.
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  // Functions that use AGPRs need the larger MAI lookahead window (19);
  // otherwise 5 wait states of history suffice.
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
}
387330f729Sjoerg
// Drop all tracked instruction history (e.g. at a scheduling region
// boundary) so stale entries cannot produce false hazards.
void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
42*82d56013Sjoerg
// SUnit overload: forward to the MachineInstr-based handler.
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}
467330f729Sjoerg
// Record \p MI as the instruction issued in the current cycle; it is folded
// into EmittedInstrs when AdvanceCycle() runs.
void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}
507330f729Sjoerg
isDivFMas(unsigned Opcode)517330f729Sjoerg static bool isDivFMas(unsigned Opcode) {
52*82d56013Sjoerg return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
537330f729Sjoerg }
547330f729Sjoerg
isSGetReg(unsigned Opcode)557330f729Sjoerg static bool isSGetReg(unsigned Opcode) {
567330f729Sjoerg return Opcode == AMDGPU::S_GETREG_B32;
577330f729Sjoerg }
587330f729Sjoerg
isSSetReg(unsigned Opcode)597330f729Sjoerg static bool isSSetReg(unsigned Opcode) {
60*82d56013Sjoerg switch (Opcode) {
61*82d56013Sjoerg case AMDGPU::S_SETREG_B32:
62*82d56013Sjoerg case AMDGPU::S_SETREG_B32_mode:
63*82d56013Sjoerg case AMDGPU::S_SETREG_IMM32_B32:
64*82d56013Sjoerg case AMDGPU::S_SETREG_IMM32_B32_mode:
65*82d56013Sjoerg return true;
66*82d56013Sjoerg }
67*82d56013Sjoerg return false;
687330f729Sjoerg }
697330f729Sjoerg
isRWLane(unsigned Opcode)707330f729Sjoerg static bool isRWLane(unsigned Opcode) {
717330f729Sjoerg return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
727330f729Sjoerg }
737330f729Sjoerg
isRFE(unsigned Opcode)747330f729Sjoerg static bool isRFE(unsigned Opcode) {
757330f729Sjoerg return Opcode == AMDGPU::S_RFE_B64;
767330f729Sjoerg }
777330f729Sjoerg
isSMovRel(unsigned Opcode)787330f729Sjoerg static bool isSMovRel(unsigned Opcode) {
797330f729Sjoerg switch (Opcode) {
807330f729Sjoerg case AMDGPU::S_MOVRELS_B32:
817330f729Sjoerg case AMDGPU::S_MOVRELS_B64:
827330f729Sjoerg case AMDGPU::S_MOVRELD_B32:
837330f729Sjoerg case AMDGPU::S_MOVRELD_B64:
847330f729Sjoerg return true;
857330f729Sjoerg default:
867330f729Sjoerg return false;
877330f729Sjoerg }
887330f729Sjoerg }
897330f729Sjoerg
isDGEMM(unsigned Opcode)90*82d56013Sjoerg static bool isDGEMM(unsigned Opcode) {
91*82d56013Sjoerg return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
92*82d56013Sjoerg Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
93*82d56013Sjoerg Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
94*82d56013Sjoerg Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
95*82d56013Sjoerg }
96*82d56013Sjoerg
isXDL(const GCNSubtarget & ST,const MachineInstr & MI)97*82d56013Sjoerg static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
98*82d56013Sjoerg unsigned Opcode = MI.getOpcode();
99*82d56013Sjoerg
100*82d56013Sjoerg if (!SIInstrInfo::isMAI(MI) ||
101*82d56013Sjoerg isDGEMM(Opcode) ||
102*82d56013Sjoerg Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
103*82d56013Sjoerg Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
104*82d56013Sjoerg return false;
105*82d56013Sjoerg
106*82d56013Sjoerg return true;
107*82d56013Sjoerg }
108*82d56013Sjoerg
isSendMsgTraceDataOrGDS(const SIInstrInfo & TII,const MachineInstr & MI)1097330f729Sjoerg static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
1107330f729Sjoerg const MachineInstr &MI) {
1117330f729Sjoerg if (TII.isAlwaysGDS(MI.getOpcode()))
1127330f729Sjoerg return true;
1137330f729Sjoerg
1147330f729Sjoerg switch (MI.getOpcode()) {
1157330f729Sjoerg case AMDGPU::S_SENDMSG:
1167330f729Sjoerg case AMDGPU::S_SENDMSGHALT:
1177330f729Sjoerg case AMDGPU::S_TTRACEDATA:
1187330f729Sjoerg return true;
1197330f729Sjoerg // These DS opcodes don't support GDS.
1207330f729Sjoerg case AMDGPU::DS_NOP:
1217330f729Sjoerg case AMDGPU::DS_PERMUTE_B32:
1227330f729Sjoerg case AMDGPU::DS_BPERMUTE_B32:
1237330f729Sjoerg return false;
1247330f729Sjoerg default:
1257330f729Sjoerg if (TII.isDS(MI.getOpcode())) {
1267330f729Sjoerg int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1277330f729Sjoerg AMDGPU::OpName::gds);
1287330f729Sjoerg if (MI.getOperand(GDS).getImm())
1297330f729Sjoerg return true;
1307330f729Sjoerg }
1317330f729Sjoerg return false;
1327330f729Sjoerg }
1337330f729Sjoerg }
1347330f729Sjoerg
isPermlane(const MachineInstr & MI)1357330f729Sjoerg static bool isPermlane(const MachineInstr &MI) {
1367330f729Sjoerg unsigned Opcode = MI.getOpcode();
137*82d56013Sjoerg return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
138*82d56013Sjoerg Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
1397330f729Sjoerg }
1407330f729Sjoerg
getHWReg(const SIInstrInfo * TII,const MachineInstr & RegInstr)1417330f729Sjoerg static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
1427330f729Sjoerg const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
1437330f729Sjoerg AMDGPU::OpName::simm16);
1447330f729Sjoerg return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
1457330f729Sjoerg }
1467330f729Sjoerg
// Scheduler query: classify whether issuing \p SU now would hit a hazard.
// Runs every applicable per-class checker; a positive wait-state count from
// any of them means a hazard. The checks are ordered cheapest/most-common
// first and the function returns on the first hit.
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  // Bundles are expanded per-instruction elsewhere (processBundle).
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  // Subtargets without data-dependence hazards can stop here.
  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  // M0-reading instruction classes on subtargets with the corresponding
  // read-after-write M0 hazards.
  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}
2237330f729Sjoerg
insertNoopsInBundle(MachineInstr * MI,const SIInstrInfo & TII,unsigned Quantity)224*82d56013Sjoerg static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
225*82d56013Sjoerg unsigned Quantity) {
226*82d56013Sjoerg while (Quantity > 0) {
227*82d56013Sjoerg unsigned Arg = std::min(Quantity, 8u);
228*82d56013Sjoerg Quantity -= Arg;
2297330f729Sjoerg BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
230*82d56013Sjoerg .addImm(Arg - 1);
231*82d56013Sjoerg }
2327330f729Sjoerg }
2337330f729Sjoerg
// Walk every instruction inside the bundle headed by CurrCycleInstr,
// computing (and in hazard-recognizer mode, fixing) hazards per member, and
// fold each member plus its wait states into the EmittedInstrs history.
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      // Materialize the required wait states as s_nop instructions.
      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
2597330f729Sjoerg
PreEmitNoops(MachineInstr * MI)2607330f729Sjoerg unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
2617330f729Sjoerg IsHazardRecognizerMode = true;
2627330f729Sjoerg CurrCycleInstr = MI;
2637330f729Sjoerg unsigned W = PreEmitNoopsCommon(MI);
2647330f729Sjoerg fixHazards(MI);
2657330f729Sjoerg CurrCycleInstr = nullptr;
2667330f729Sjoerg return W;
2677330f729Sjoerg }
2687330f729Sjoerg
PreEmitNoopsCommon(MachineInstr * MI)2697330f729Sjoerg unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
2707330f729Sjoerg if (MI->isBundle())
2717330f729Sjoerg return 0;
2727330f729Sjoerg
273*82d56013Sjoerg int WaitStates = 0;
2747330f729Sjoerg
2757330f729Sjoerg if (SIInstrInfo::isSMRD(*MI))
2767330f729Sjoerg return std::max(WaitStates, checkSMRDHazards(MI));
2777330f729Sjoerg
2787330f729Sjoerg if (ST.hasNSAtoVMEMBug())
2797330f729Sjoerg WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
2807330f729Sjoerg
2817330f729Sjoerg WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
2827330f729Sjoerg
2837330f729Sjoerg if (ST.hasNoDataDepHazard())
2847330f729Sjoerg return WaitStates;
2857330f729Sjoerg
286*82d56013Sjoerg if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
287*82d56013Sjoerg WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
288*82d56013Sjoerg
2897330f729Sjoerg if (SIInstrInfo::isVALU(*MI))
2907330f729Sjoerg WaitStates = std::max(WaitStates, checkVALUHazards(MI));
2917330f729Sjoerg
2927330f729Sjoerg if (SIInstrInfo::isDPP(*MI))
2937330f729Sjoerg WaitStates = std::max(WaitStates, checkDPPHazards(MI));
2947330f729Sjoerg
2957330f729Sjoerg if (isDivFMas(MI->getOpcode()))
2967330f729Sjoerg WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
2977330f729Sjoerg
2987330f729Sjoerg if (isRWLane(MI->getOpcode()))
2997330f729Sjoerg WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
3007330f729Sjoerg
301*82d56013Sjoerg if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
302*82d56013Sjoerg SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
303*82d56013Sjoerg SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
304*82d56013Sjoerg WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
305*82d56013Sjoerg
3067330f729Sjoerg if (MI->isInlineAsm())
3077330f729Sjoerg return std::max(WaitStates, checkInlineAsmHazards(MI));
3087330f729Sjoerg
3097330f729Sjoerg if (isSGetReg(MI->getOpcode()))
3107330f729Sjoerg return std::max(WaitStates, checkGetRegHazards(MI));
3117330f729Sjoerg
3127330f729Sjoerg if (isSSetReg(MI->getOpcode()))
3137330f729Sjoerg return std::max(WaitStates, checkSetRegHazards(MI));
3147330f729Sjoerg
3157330f729Sjoerg if (isRFE(MI->getOpcode()))
3167330f729Sjoerg return std::max(WaitStates, checkRFEHazards(MI));
3177330f729Sjoerg
3187330f729Sjoerg if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
3197330f729Sjoerg isSMovRel(MI->getOpcode())))
3207330f729Sjoerg return std::max(WaitStates, checkReadM0Hazards(MI));
3217330f729Sjoerg
3227330f729Sjoerg if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
3237330f729Sjoerg return std::max(WaitStates, checkReadM0Hazards(MI));
3247330f729Sjoerg
3257330f729Sjoerg if (SIInstrInfo::isMAI(*MI))
3267330f729Sjoerg return std::max(WaitStates, checkMAIHazards(MI));
3277330f729Sjoerg
328*82d56013Sjoerg if (SIInstrInfo::isVMEM(*MI) ||
329*82d56013Sjoerg SIInstrInfo::isFLAT(*MI) ||
330*82d56013Sjoerg SIInstrInfo::isDS(*MI))
3317330f729Sjoerg return std::max(WaitStates, checkMAILdStHazards(MI));
3327330f729Sjoerg
3337330f729Sjoerg return WaitStates;
3347330f729Sjoerg }
3357330f729Sjoerg
// Record one emitted no-op cycle in the lookahead history.
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}
3397330f729Sjoerg
// Advance the model by one cycle: fold CurrCycleInstr (if any) and its extra
// wait states into EmittedInstrs, keeping the history capped at
// getMaxLookAhead() entries.
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isMetaInstruction()) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Bundles are expanded member-by-member.
  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
3817330f729Sjoerg
// Bottom-up scheduling is not supported by this recognizer; the history in
// EmittedInstrs is only meaningful when walked top-down.
void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
3857330f729Sjoerg
3867330f729Sjoerg //===----------------------------------------------------------------------===//
3877330f729Sjoerg // Helper Functions
3887330f729Sjoerg //===----------------------------------------------------------------------===//
3897330f729Sjoerg
390*82d56013Sjoerg typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
3917330f729Sjoerg
3927330f729Sjoerg // Returns a minimum wait states since \p I walking all predecessors.
3937330f729Sjoerg // Only scans until \p IsExpired does not return true.
3947330f729Sjoerg // Can only be run in a hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  // Scan backwards through this block first.
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    // Found the hazard source: report the distance accumulated so far.
    if (IsHazard(*I))
      return WaitStates;

    // Inline asm and meta instructions contribute no wait states.
    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    // Past the search window: treat as "no hazard on this path".
    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  // Reached the top of the block: continue into every unvisited predecessor
  // and keep the minimum (i.e. most conservative) distance found.
  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}
4307330f729Sjoerg
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,const MachineInstr * MI,IsExpiredFn IsExpired)4317330f729Sjoerg static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
432*82d56013Sjoerg const MachineInstr *MI, IsExpiredFn IsExpired) {
4337330f729Sjoerg DenseSet<const MachineBasicBlock *> Visited;
4347330f729Sjoerg return getWaitStatesSince(IsHazard, MI->getParent(),
4357330f729Sjoerg std::next(MI->getReverseIterator()),
4367330f729Sjoerg 0, IsExpired, Visited);
4377330f729Sjoerg }
4387330f729Sjoerg
// Return the number of wait states since the most recent instruction
// matching \p IsHazard, searching at most \p Limit wait states back.
// Returns INT_MAX when no match is found within the window.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    // Post-RA mode: walk the actual instruction stream (across blocks),
    // expiring once Limit wait states have been accumulated.
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  // Scheduler mode: consult the recorded lookahead buffer instead. A
  // nullptr entry represents one wait state with no instruction.
  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      // Inline asm contributes no wait states.
      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
4637330f729Sjoerg
getWaitStatesSinceDef(unsigned Reg,IsHazardFn IsHazardDef,int Limit)4647330f729Sjoerg int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
4657330f729Sjoerg IsHazardFn IsHazardDef,
4667330f729Sjoerg int Limit) {
4677330f729Sjoerg const SIRegisterInfo *TRI = ST.getRegisterInfo();
4687330f729Sjoerg
469*82d56013Sjoerg auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
470*82d56013Sjoerg return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
4717330f729Sjoerg };
4727330f729Sjoerg
4737330f729Sjoerg return getWaitStatesSince(IsHazardFn, Limit);
4747330f729Sjoerg }
4757330f729Sjoerg
getWaitStatesSinceSetReg(IsHazardFn IsHazard,int Limit)4767330f729Sjoerg int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
4777330f729Sjoerg int Limit) {
478*82d56013Sjoerg auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
479*82d56013Sjoerg return isSSetReg(MI.getOpcode()) && IsHazard(MI);
4807330f729Sjoerg };
4817330f729Sjoerg
4827330f729Sjoerg return getWaitStatesSince(IsHazardFn, Limit);
4837330f729Sjoerg }
4847330f729Sjoerg
4857330f729Sjoerg //===----------------------------------------------------------------------===//
4867330f729Sjoerg // No-op Hazard Detection
4877330f729Sjoerg //===----------------------------------------------------------------------===//
4887330f729Sjoerg
// Set the bit of every register unit covered by \p Reg in \p BV.
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}
4947330f729Sjoerg
addRegsToSet(const SIRegisterInfo & TRI,iterator_range<MachineInstr::const_mop_iterator> Ops,BitVector & Set)4957330f729Sjoerg static void addRegsToSet(const SIRegisterInfo &TRI,
4967330f729Sjoerg iterator_range<MachineInstr::const_mop_iterator> Ops,
4977330f729Sjoerg BitVector &Set) {
4987330f729Sjoerg for (const MachineOperand &Op : Ops) {
4997330f729Sjoerg if (Op.isReg())
500*82d56013Sjoerg addRegUnits(TRI, Set, Op.getReg().asMCReg());
5017330f729Sjoerg }
5027330f729Sjoerg }
5037330f729Sjoerg
// Fold \p MI's defs and uses into the running clause def/use unit sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}
5097330f729Sjoerg
breaksSMEMSoftClause(MachineInstr * MI)510*82d56013Sjoerg static bool breaksSMEMSoftClause(MachineInstr *MI) {
511*82d56013Sjoerg return !SIInstrInfo::isSMRD(*MI);
512*82d56013Sjoerg }
513*82d56013Sjoerg
breaksVMEMSoftClause(MachineInstr * MI)514*82d56013Sjoerg static bool breaksVMEMSoftClause(MachineInstr *MI) {
515*82d56013Sjoerg return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
516*82d56013Sjoerg }
517*82d56013Sjoerg
// Return 1 if issuing \p MEM now would create an invalid soft clause (a
// write-after-read/read-after-write overlap inside a run of consecutive
// memory instructions), else 0. Only relevant with XNACK enabled.
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  // No prior defs in the clause means no possible overlap.
  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
5657330f729Sjoerg
// Wait states required before issuing the SMRD instruction \p SMRD: soft
// clause breaks plus, on SI only, the SGPR-read-after-VALU-write hazard (and
// an undocumented buffer-descriptor variant after SALU writes).
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
6137330f729Sjoerg
checkVMEMHazards(MachineInstr * VMEM)6147330f729Sjoerg int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
6157330f729Sjoerg if (!ST.hasVMEMReadSGPRVALUDefHazard())
6167330f729Sjoerg return 0;
6177330f729Sjoerg
6187330f729Sjoerg int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
6197330f729Sjoerg
6207330f729Sjoerg // A read of an SGPR by a VMEM instruction requires 5 wait states when the
6217330f729Sjoerg // SGPR was written by a VALU Instruction.
6227330f729Sjoerg const int VmemSgprWaitStates = 5;
623*82d56013Sjoerg auto IsHazardDefFn = [this](const MachineInstr &MI) {
624*82d56013Sjoerg return TII.isVALU(MI);
625*82d56013Sjoerg };
6267330f729Sjoerg for (const MachineOperand &Use : VMEM->uses()) {
627*82d56013Sjoerg if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
6287330f729Sjoerg continue;
6297330f729Sjoerg
6307330f729Sjoerg int WaitStatesNeededForUse =
6317330f729Sjoerg VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
6327330f729Sjoerg VmemSgprWaitStates);
6337330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
6347330f729Sjoerg }
6357330f729Sjoerg return WaitStatesNeeded;
6367330f729Sjoerg }
6377330f729Sjoerg
// Wait states required before issuing the DPP instruction \p DPP: 2 after
// any write of a VGPR it reads, and 5 after a VALU write of EXEC.
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    // Any defining instruction counts for the VGPR hazard, hence the
    // always-true predicate.
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
6687330f729Sjoerg
checkDivFMasHazards(MachineInstr * DivFMas)6697330f729Sjoerg int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
6707330f729Sjoerg const SIInstrInfo *TII = ST.getInstrInfo();
6717330f729Sjoerg
6727330f729Sjoerg // v_div_fmas requires 4 wait states after a write to vcc from a VALU
6737330f729Sjoerg // instruction.
6747330f729Sjoerg const int DivFMasWaitStates = 4;
675*82d56013Sjoerg auto IsHazardDefFn = [TII](const MachineInstr &MI) {
676*82d56013Sjoerg return TII->isVALU(MI);
677*82d56013Sjoerg };
6787330f729Sjoerg int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
6797330f729Sjoerg DivFMasWaitStates);
6807330f729Sjoerg
6817330f729Sjoerg return DivFMasWaitStates - WaitStatesNeeded;
6827330f729Sjoerg }
6837330f729Sjoerg
checkGetRegHazards(MachineInstr * GetRegInstr)6847330f729Sjoerg int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
6857330f729Sjoerg const SIInstrInfo *TII = ST.getInstrInfo();
6867330f729Sjoerg unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
6877330f729Sjoerg
6887330f729Sjoerg const int GetRegWaitStates = 2;
689*82d56013Sjoerg auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
690*82d56013Sjoerg return GetRegHWReg == getHWReg(TII, MI);
6917330f729Sjoerg };
6927330f729Sjoerg int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
6937330f729Sjoerg
6947330f729Sjoerg return GetRegWaitStates - WaitStatesNeeded;
6957330f729Sjoerg }
6967330f729Sjoerg
checkSetRegHazards(MachineInstr * SetRegInstr)6977330f729Sjoerg int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
6987330f729Sjoerg const SIInstrInfo *TII = ST.getInstrInfo();
6997330f729Sjoerg unsigned HWReg = getHWReg(TII, *SetRegInstr);
7007330f729Sjoerg
7017330f729Sjoerg const int SetRegWaitStates = ST.getSetRegWaitStates();
702*82d56013Sjoerg auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
703*82d56013Sjoerg return HWReg == getHWReg(TII, MI);
7047330f729Sjoerg };
7057330f729Sjoerg int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
7067330f729Sjoerg return SetRegWaitStates - WaitStatesNeeded;
7077330f729Sjoerg }
7087330f729Sjoerg
// If MI is a vector-memory store whose store data is wide enough to create
// the VALU write hazard (store data wider than 64 bits that a following
// instruction could overwrite), return the operand index of the store-data
// operand; otherwise return -1.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  // Only stores can leave in-flight store data at risk.
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}
7577330f729Sjoerg
int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  // Only a def of a vector register can clobber in-flight store data.
  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  // Hazard: a recent store's data operand (per createsVALUHazard) overlaps
  // the register being defined here.
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
7827330f729Sjoerg
checkVALUHazards(MachineInstr * VALU)7837330f729Sjoerg int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
7847330f729Sjoerg // This checks for the hazard where VMEM instructions that store more than
7857330f729Sjoerg // 8 bytes can have there store data over written by the next instruction.
7867330f729Sjoerg if (!ST.has12DWordStoreHazard())
7877330f729Sjoerg return 0;
7887330f729Sjoerg
7897330f729Sjoerg const MachineRegisterInfo &MRI = MF.getRegInfo();
7907330f729Sjoerg int WaitStatesNeeded = 0;
7917330f729Sjoerg
7927330f729Sjoerg for (const MachineOperand &Def : VALU->defs()) {
7937330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
7947330f729Sjoerg }
7957330f729Sjoerg
7967330f729Sjoerg return WaitStatesNeeded;
7977330f729Sjoerg }
7987330f729Sjoerg
checkInlineAsmHazards(MachineInstr * IA)7997330f729Sjoerg int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
8007330f729Sjoerg // This checks for hazards associated with inline asm statements.
8017330f729Sjoerg // Since inline asms can contain just about anything, we use this
8027330f729Sjoerg // to call/leverage other check*Hazard routines. Note that
8037330f729Sjoerg // this function doesn't attempt to address all possible inline asm
8047330f729Sjoerg // hazards (good luck), but is a collection of what has been
8057330f729Sjoerg // problematic thus far.
8067330f729Sjoerg
8077330f729Sjoerg // see checkVALUHazards()
8087330f729Sjoerg if (!ST.has12DWordStoreHazard())
8097330f729Sjoerg return 0;
8107330f729Sjoerg
8117330f729Sjoerg const MachineRegisterInfo &MRI = MF.getRegInfo();
8127330f729Sjoerg int WaitStatesNeeded = 0;
8137330f729Sjoerg
8147330f729Sjoerg for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
8157330f729Sjoerg I != E; ++I) {
8167330f729Sjoerg const MachineOperand &Op = IA->getOperand(I);
8177330f729Sjoerg if (Op.isReg() && Op.isDef()) {
8187330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
8197330f729Sjoerg }
8207330f729Sjoerg }
8217330f729Sjoerg
8227330f729Sjoerg return WaitStatesNeeded;
8237330f729Sjoerg }
8247330f729Sjoerg
checkRWLaneHazards(MachineInstr * RWLane)8257330f729Sjoerg int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
8267330f729Sjoerg const SIInstrInfo *TII = ST.getInstrInfo();
8277330f729Sjoerg const SIRegisterInfo *TRI = ST.getRegisterInfo();
8287330f729Sjoerg const MachineRegisterInfo &MRI = MF.getRegInfo();
8297330f729Sjoerg
8307330f729Sjoerg const MachineOperand *LaneSelectOp =
8317330f729Sjoerg TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
8327330f729Sjoerg
8337330f729Sjoerg if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
8347330f729Sjoerg return 0;
8357330f729Sjoerg
8367330f729Sjoerg Register LaneSelectReg = LaneSelectOp->getReg();
837*82d56013Sjoerg auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
8387330f729Sjoerg
8397330f729Sjoerg const int RWLaneWaitStates = 4;
8407330f729Sjoerg int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
8417330f729Sjoerg RWLaneWaitStates);
8427330f729Sjoerg return RWLaneWaitStates - WaitStatesSince;
8437330f729Sjoerg }
8447330f729Sjoerg
checkRFEHazards(MachineInstr * RFE)8457330f729Sjoerg int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
8467330f729Sjoerg if (!ST.hasRFEHazards())
8477330f729Sjoerg return 0;
8487330f729Sjoerg
8497330f729Sjoerg const SIInstrInfo *TII = ST.getInstrInfo();
8507330f729Sjoerg
8517330f729Sjoerg const int RFEWaitStates = 1;
8527330f729Sjoerg
853*82d56013Sjoerg auto IsHazardFn = [TII](const MachineInstr &MI) {
854*82d56013Sjoerg return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
8557330f729Sjoerg };
8567330f729Sjoerg int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
8577330f729Sjoerg return RFEWaitStates - WaitStatesNeeded;
8587330f729Sjoerg }
8597330f729Sjoerg
checkReadM0Hazards(MachineInstr * MI)8607330f729Sjoerg int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
8617330f729Sjoerg const SIInstrInfo *TII = ST.getInstrInfo();
8627330f729Sjoerg const int SMovRelWaitStates = 1;
863*82d56013Sjoerg auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
8647330f729Sjoerg return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
8657330f729Sjoerg SMovRelWaitStates);
8667330f729Sjoerg }
8677330f729Sjoerg
// Attempt each known hazard fixup for MI in turn. Each helper checks for
// its specific hazard and, when present, inserts mitigating instructions
// before MI; their boolean results are intentionally ignored.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}
8757330f729Sjoerg
// Break the v_cmpx -> v_permlane* hazard by inserting a dummy VALU
// instruction (v_mov_b32 of a live VGPR onto itself) before the permlane.
// Returns true if a fix was emitted.
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // The hazard source is any preceding VOPC (v_cmpx-class) instruction.
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };

  // Any intervening VALU other than a V_NOP already mitigates the hazard
  // (V_NOP does not count; see below).
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
9067330f729Sjoerg
// Mitigate the hazard where a scalar instruction (SALU/SMEM) writes a
// register that an earlier, still-outstanding VMEM/DS/FLAT access reads.
// Inserts an s_waitcnt_depctr with immediate 0xffe3 before MI. Returns true
// if a fix was emitted.
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  // Only scalar writers can trigger this hazard.
  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Hazard: an earlier VMEM/DS/FLAT instruction reads any register MI
  // defines.
  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // The hazard expires at any VALU instruction, an s_waitcnt 0, or an
  // s_waitcnt_depctr with immediate 0xffe3 (the same fix emitted below).
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            MI.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}
9527330f729Sjoerg
// Mitigate the hazard where a VALU instruction writes an SGPR that an
// earlier, still-outstanding SMEM load reads. Inserts `s_mov_b32 null, 0`
// before MI. Returns true if a fix was emitted.
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // Locate the scalar destination operand: readlane/readfirstlane write
  // their scalar result through vdst; other VALU instructions use sdst.
  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    // No named scalar dest: fall back to the first implicit SGPR def.
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  // Hazard: an earlier SMEM load reads the SGPR this VALU is about to write.
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        return (Decoded.LgkmCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
10377330f729Sjoerg
// Mitigate the write-after-read hazard where a VALU instruction writes
// EXEC while an earlier non-VALU instruction still reads it. Inserts an
// s_waitcnt_depctr with immediate 0xfffe before MI. Returns true if a fix
// was emitted.
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  // Only an EXEC write can trigger the hazard.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  // Hazard: a non-VALU instruction that reads EXEC.
  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  // The hazard expires at a VALU instruction that writes an SGPR (explicit
  // sdst or an implicit SGPR def), or at an s_waitcnt_depctr whose
  // immediate has all bits of 0xfffe set.
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}
10767330f729Sjoerg
// Mitigate the hazard where a DS (LDS) access and a VMEM access sit on
// opposite sides of a branch. IsHazardInst classifies instructions as
// 1 (DS), 2 (VMEM / segment-specific FLAT), or 0 (neither); the hazard is a
// branch behind MI that leads to an access of the *other* kind without an
// intervening `s_waitcnt_vscnt null, 0`. Inserts that waitcnt before MI.
// Returns true if a fix was emitted.
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  // Outer search: stop at any DS/VMEM access or a zero s_waitcnt_vscnt on
  // the null register.
  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                               I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                               !I.getOperand(1).getImm());
  };

  // Outer hazard: a branch from which a second (inner) search finds an
  // access of the other kind before the hazard is expired.
  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    // Inner hazard: a DS/VMEM access of the opposite kind to MI.
    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    // Inner expiry: an access of the same kind as MI, or the waitcnt fix.
    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I.getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}
11347330f729Sjoerg
// A MUBUF/MTBUF access with certain offset bits set needs 1 wait state
// after a large (>= 16 byte encoding) NSA-encoded MIMG instruction.
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  // Only offsets with bit 1 or bit 2 set trigger the bug.
  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  // Hazard source: an NSA-encoded MIMG instruction with a >= 16 byte
  // encoding.
  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}
11597330f729Sjoerg
// s_denorm_mode needs 3 wait states after a VMEM/FLAT floating-point
// atomic.
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  // Hazard source: a preceding VMEM or FLAT FP atomic.
  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  // The hazard expires after 3 wait states, or at any VALU or wait-count
  // instruction.
  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
11947330f729Sjoerg
checkMAIHazards(MachineInstr * MI)11957330f729Sjoerg int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
11967330f729Sjoerg assert(SIInstrInfo::isMAI(*MI));
11977330f729Sjoerg
1198*82d56013Sjoerg return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1199*82d56013Sjoerg }
1200*82d56013Sjoerg
checkMAIHazards908(MachineInstr * MI)1201*82d56013Sjoerg int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
12027330f729Sjoerg int WaitStatesNeeded = 0;
12037330f729Sjoerg unsigned Opc = MI->getOpcode();
12047330f729Sjoerg
1205*82d56013Sjoerg auto IsVALUFn = [](const MachineInstr &MI) {
1206*82d56013Sjoerg return SIInstrInfo::isVALU(MI);
12077330f729Sjoerg };
12087330f729Sjoerg
1209*82d56013Sjoerg if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
12107330f729Sjoerg const int LegacyVALUWritesVGPRWaitStates = 2;
12117330f729Sjoerg const int VALUWritesExecWaitStates = 4;
12127330f729Sjoerg const int MaxWaitStates = 4;
12137330f729Sjoerg
12147330f729Sjoerg int WaitStatesNeededForUse = VALUWritesExecWaitStates -
12157330f729Sjoerg getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
12167330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
12177330f729Sjoerg
12187330f729Sjoerg if (WaitStatesNeeded < MaxWaitStates) {
12197330f729Sjoerg for (const MachineOperand &Use : MI->explicit_uses()) {
12207330f729Sjoerg const int MaxWaitStates = 2;
12217330f729Sjoerg
12227330f729Sjoerg if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
12237330f729Sjoerg continue;
12247330f729Sjoerg
12257330f729Sjoerg int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
12267330f729Sjoerg getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
12277330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
12287330f729Sjoerg
12297330f729Sjoerg if (WaitStatesNeeded == MaxWaitStates)
12307330f729Sjoerg break;
12317330f729Sjoerg }
12327330f729Sjoerg }
12337330f729Sjoerg }
12347330f729Sjoerg
1235*82d56013Sjoerg auto IsMFMAFn = [](const MachineInstr &MI) {
1236*82d56013Sjoerg return SIInstrInfo::isMAI(MI) &&
1237*82d56013Sjoerg MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1238*82d56013Sjoerg MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
12397330f729Sjoerg };
12407330f729Sjoerg
12417330f729Sjoerg for (const MachineOperand &Op : MI->explicit_operands()) {
12427330f729Sjoerg if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
12437330f729Sjoerg continue;
12447330f729Sjoerg
1245*82d56013Sjoerg if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
12467330f729Sjoerg continue;
12477330f729Sjoerg
12487330f729Sjoerg const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
12497330f729Sjoerg const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
12507330f729Sjoerg const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
12517330f729Sjoerg const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
12527330f729Sjoerg const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
12537330f729Sjoerg const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
12547330f729Sjoerg const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
12557330f729Sjoerg const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
12567330f729Sjoerg const int MaxWaitStates = 18;
12577330f729Sjoerg Register Reg = Op.getReg();
12587330f729Sjoerg unsigned HazardDefLatency = 0;
12597330f729Sjoerg
1260*82d56013Sjoerg auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
1261*82d56013Sjoerg this](const MachineInstr &MI) {
12627330f729Sjoerg if (!IsMFMAFn(MI))
12637330f729Sjoerg return false;
1264*82d56013Sjoerg Register DstReg = MI.getOperand(0).getReg();
12657330f729Sjoerg if (DstReg == Reg)
12667330f729Sjoerg return false;
1267*82d56013Sjoerg HazardDefLatency =
1268*82d56013Sjoerg std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
12697330f729Sjoerg return TRI.regsOverlap(DstReg, Reg);
12707330f729Sjoerg };
12717330f729Sjoerg
12727330f729Sjoerg int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
12737330f729Sjoerg MaxWaitStates);
12747330f729Sjoerg int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
12757330f729Sjoerg int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
12767330f729Sjoerg int OpNo = MI->getOperandNo(&Op);
12777330f729Sjoerg if (OpNo == SrcCIdx) {
12787330f729Sjoerg NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1279*82d56013Sjoerg } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
12807330f729Sjoerg switch (HazardDefLatency) {
12817330f729Sjoerg case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
12827330f729Sjoerg break;
12837330f729Sjoerg case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
12847330f729Sjoerg break;
12857330f729Sjoerg case 16: LLVM_FALLTHROUGH;
12867330f729Sjoerg default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
12877330f729Sjoerg break;
12887330f729Sjoerg }
1289*82d56013Sjoerg } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
12907330f729Sjoerg switch (HazardDefLatency) {
12917330f729Sjoerg case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
12927330f729Sjoerg break;
12937330f729Sjoerg case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
12947330f729Sjoerg break;
12957330f729Sjoerg case 16: LLVM_FALLTHROUGH;
12967330f729Sjoerg default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
12977330f729Sjoerg break;
12987330f729Sjoerg }
12997330f729Sjoerg }
13007330f729Sjoerg
13017330f729Sjoerg int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
13027330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
13037330f729Sjoerg
13047330f729Sjoerg if (WaitStatesNeeded == MaxWaitStates)
13057330f729Sjoerg return WaitStatesNeeded; // Early exit.
13067330f729Sjoerg
1307*82d56013Sjoerg auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1308*82d56013Sjoerg if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
13097330f729Sjoerg return false;
1310*82d56013Sjoerg Register DstReg = MI.getOperand(0).getReg();
13117330f729Sjoerg return TRI.regsOverlap(Reg, DstReg);
13127330f729Sjoerg };
13137330f729Sjoerg
13147330f729Sjoerg const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
13157330f729Sjoerg const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
13167330f729Sjoerg const int AccVGPRWriteAccVgprReadWaitStates = 3;
13177330f729Sjoerg NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
13187330f729Sjoerg if (OpNo == SrcCIdx)
13197330f729Sjoerg NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1320*82d56013Sjoerg else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
13217330f729Sjoerg NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
13227330f729Sjoerg
13237330f729Sjoerg WaitStatesNeededForUse = NeedWaitStates -
13247330f729Sjoerg getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
13257330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
13267330f729Sjoerg
13277330f729Sjoerg if (WaitStatesNeeded == MaxWaitStates)
13287330f729Sjoerg return WaitStatesNeeded; // Early exit.
13297330f729Sjoerg }
13307330f729Sjoerg
1331*82d56013Sjoerg if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
13327330f729Sjoerg const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
13337330f729Sjoerg const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
13347330f729Sjoerg const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
13357330f729Sjoerg const int MaxWaitStates = 13;
13367330f729Sjoerg Register DstReg = MI->getOperand(0).getReg();
13377330f729Sjoerg unsigned HazardDefLatency = 0;
13387330f729Sjoerg
1339*82d56013Sjoerg auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
1340*82d56013Sjoerg this](const MachineInstr &MI) {
13417330f729Sjoerg if (!IsMFMAFn(MI))
13427330f729Sjoerg return false;
1343*82d56013Sjoerg Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1344*82d56013Sjoerg HazardDefLatency =
1345*82d56013Sjoerg std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
13467330f729Sjoerg return TRI.regsOverlap(Reg, DstReg);
13477330f729Sjoerg };
13487330f729Sjoerg
13497330f729Sjoerg int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
13507330f729Sjoerg int NeedWaitStates;
13517330f729Sjoerg switch (HazardDefLatency) {
13527330f729Sjoerg case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
13537330f729Sjoerg break;
13547330f729Sjoerg case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
13557330f729Sjoerg break;
13567330f729Sjoerg case 16: LLVM_FALLTHROUGH;
13577330f729Sjoerg default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
13587330f729Sjoerg break;
13597330f729Sjoerg }
13607330f729Sjoerg
13617330f729Sjoerg int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
13627330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
13637330f729Sjoerg }
13647330f729Sjoerg
13657330f729Sjoerg return WaitStatesNeeded;
13667330f729Sjoerg }
13677330f729Sjoerg
// Compute the number of wait states that must precede the gfx90a MAI (MFMA)
// instruction *MI because of hazards with earlier instructions: a legacy VALU
// writing EXEC or a source VGPR, or another MFMA whose destination overlaps
// one of MI's sources. Returns 0 when MI is not an MFMA or no hazard is
// outstanding.
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // "MFMA" here means a real matrix op: isMAI() minus the ACCVGPR copy
  // pseudos, which are also marked MAI but have different hazard rules.
  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  // Any VALU that is not an MFMA.
  auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
  };

  // Any VALU that is neither an MFMA nor a DOT instruction.
  auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
  };

  if (!IsMFMAFn(*MI))
    return WaitStatesNeeded;

  // An MFMA issued after a legacy VALU wrote EXEC needs a gap.
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    // Wait-state tables, keyed by producer kind (SMFMA/DMFMA, by latency
    // class 4x4/16x16/32x32) and by which source of MI the overlap hits
    // (SrcC = src2 accumulator vs SrcA/SrcB).
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    unsigned Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    // Matches an earlier DGEMM or XDL MFMA whose dst overlaps Reg. Side
    // effects: records whether the overlap is the full register (FullReg)
    // and which instruction produced it (MI1) for the tables below.
    auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
                                     this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    // Legacy (non-MFMA, non-DOT) VALU write of a source VGPR.
    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
                                              MaxWaitStates);
    // max() means no overlapping DGEMM/XDL producer within range.
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = MI->getOperandNo(&Use);
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      // Overlap hits the accumulator (src2) operand.
      if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
        // SGEMM/HGEMM consuming a DGEMM result as SrcC: no extra wait.
        NeedWaitStates = 0;
      } else if (FullReg) {
        // Exact same register: only the DGEMM 4x4 -> DGEMM 4x4 pair needs
        // a gap; all other full-register forwardings are free.
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          // SMFMA producer: pick table entry by its latency class; the
          // required gap is larger when the consumer is a DGEMM.
          switch (TSchedModel.computeInstrLatency(MI1)) {
          case 2:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16: LLVM_FALLTHROUGH;
          default:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          }
        }
      }
    } else {
      // Overlap hits SrcA or SrcB.
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        switch (TSchedModel.computeInstrLatency(MI1)) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16: LLVM_FALLTHROUGH;
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    // Skip the max() bookkeeping when this use cannot raise the answer.
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // Already at the table maximum; no later use can need more.
    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  return WaitStatesNeeded;
}
1523*82d56013Sjoerg
checkMAILdStHazards(MachineInstr * MI)15247330f729Sjoerg int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1525*82d56013Sjoerg // On gfx90a+ releveant hazards are checked in checkMAIVALUHazards()
1526*82d56013Sjoerg if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
15277330f729Sjoerg return 0;
15287330f729Sjoerg
15297330f729Sjoerg int WaitStatesNeeded = 0;
15307330f729Sjoerg
1531*82d56013Sjoerg auto IsAccVgprReadFn = [](const MachineInstr &MI) {
1532*82d56013Sjoerg return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
15337330f729Sjoerg };
15347330f729Sjoerg
15357330f729Sjoerg for (const MachineOperand &Op : MI->explicit_uses()) {
15367330f729Sjoerg if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
15377330f729Sjoerg continue;
15387330f729Sjoerg
15397330f729Sjoerg Register Reg = Op.getReg();
15407330f729Sjoerg
15417330f729Sjoerg const int AccVgprReadLdStWaitStates = 2;
1542*82d56013Sjoerg const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
15437330f729Sjoerg const int MaxWaitStates = 2;
15447330f729Sjoerg
15457330f729Sjoerg int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
15467330f729Sjoerg getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
15477330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
15487330f729Sjoerg
15497330f729Sjoerg if (WaitStatesNeeded == MaxWaitStates)
15507330f729Sjoerg return WaitStatesNeeded; // Early exit.
15517330f729Sjoerg
1552*82d56013Sjoerg auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
1553*82d56013Sjoerg if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1554*82d56013Sjoerg MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
15557330f729Sjoerg return false;
1556*82d56013Sjoerg auto IsVALUFn = [](const MachineInstr &MI) {
1557*82d56013Sjoerg return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
15587330f729Sjoerg };
15597330f729Sjoerg return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
15607330f729Sjoerg std::numeric_limits<int>::max();
15617330f729Sjoerg };
15627330f729Sjoerg
1563*82d56013Sjoerg WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1564*82d56013Sjoerg getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
15657330f729Sjoerg WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
15667330f729Sjoerg }
15677330f729Sjoerg
15687330f729Sjoerg return WaitStatesNeeded;
15697330f729Sjoerg }
1570*82d56013Sjoerg
// Compute wait states needed on gfx90a+ before issuing *MI — a VALU,
// VMEM/FLAT/DS, or export instruction — because of earlier MFMA/DOT
// instructions whose destinations or src2 operands overlap MI's registers
// (RAW, WAW and WAR hazards). MFMAs themselves are handled in
// checkMAIHazards90A().
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  // Real MFMA: isMAI() minus the ACCVGPR copy pseudos.
  auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (IsMFMAFn(*MI))
    return 0;

  int WaitStatesNeeded = 0;

  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
                       SIInstrInfo::isFLAT(*MI) ||
                       SIInstrInfo::isDS(*MI) ||
                       SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  // The predicates below capture Reg/MFMA/DOT by reference; the loops set
  // Reg and reset MFMA/DOT before each backward scan, and the predicate
  // records the matching producer instruction as a side effect.
  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  // Matches a DGEMM or XDL MFMA whose dst overlaps Reg; records it in MFMA.
  auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
                              this](const MachineInstr &MI) {
    if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  // Matches a DOT instruction whose dst overlaps Reg; records it in DOT.
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    // RAW hazards: MI reads a register an earlier MFMA/DOT wrote.
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          // Same DOT opcode back-to-back: only SrcA/SrcB reads hazard;
          // operand index computed by pointer arithmetic into MI's
          // operand array.
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      MFMA = nullptr;
      WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
                                                 MaxWaitStates);
      if (!MFMA)
        continue;

      // Required gap depends on the producer's latency class (2/4/8/16)
      // and on whether the consumer is a mem/export op or a plain VALU.
      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
        break;
      case 4:
        // Latency 4 only occurs for the DGEMM 4x4 variants.
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates =
            IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                          : DMFMA4x4WriteVgprVALUReadWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates =
          isDGEMM(MFMA->getOpcode())
            ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                            : DMFMA16x16WriteVgprVALUReadWaitStates
            : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  // DGEMM followed closely by a double-precision FMA needs a fixed gap,
  // independent of register overlap.
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  // WAW / WAR hazards on MI's definitions.
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    // WAW with a different DOT opcode.
    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    // WAW with a DGEMM/XDL MFMA write of the same register.
    MFMA = nullptr;
    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
                                               MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      switch (TSchedModel.computeInstrLatency(MFMA)) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
        break;
      case 4:
        // Latency 4 only occurs for the DGEMM 4x4 variants.
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = isDGEMM(MFMA->getOpcode())
                   ? DMFMA16x16WriteVgprVALUWriteWaitStates
                   : SMFMA32x32WriteVgprVALUWawWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    // WAR: a non-DGEMM MFMA read Reg as its src2 (accumulator); writing
    // over it too soon would clobber an in-flight read.
    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
                             this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
1800*82d56013Sjoerg
ShouldPreferAnother(SUnit * SU)1801*82d56013Sjoerg bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1802*82d56013Sjoerg if (!SU->isInstr())
1803*82d56013Sjoerg return false;
1804*82d56013Sjoerg
1805*82d56013Sjoerg const MachineInstr *MAI = nullptr;
1806*82d56013Sjoerg auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
1807*82d56013Sjoerg MAI = nullptr;
1808*82d56013Sjoerg if (SIInstrInfo::isMAI(MI) &&
1809*82d56013Sjoerg MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1810*82d56013Sjoerg MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1811*82d56013Sjoerg MAI = &MI;
1812*82d56013Sjoerg return MAI != nullptr;
1813*82d56013Sjoerg };
1814*82d56013Sjoerg
1815*82d56013Sjoerg MachineInstr *MI = SU->getInstr();
1816*82d56013Sjoerg if (IsMFMAFn(*MI)) {
1817*82d56013Sjoerg int W = getWaitStatesSince(IsMFMAFn, 16);
1818*82d56013Sjoerg if (MAI)
1819*82d56013Sjoerg return W < (int)TSchedModel.computeInstrLatency(MAI);
1820*82d56013Sjoerg }
1821*82d56013Sjoerg
1822*82d56013Sjoerg return false;
1823*82d56013Sjoerg }
1824