10b57cec5SDimitry Andric //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file implements hazard recognizers for scheduling on GCN processors. 100b57cec5SDimitry Andric // 110b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 120b57cec5SDimitry Andric 130b57cec5SDimitry Andric #include "GCNHazardRecognizer.h" 14e8d8bef9SDimitry Andric #include "GCNSubtarget.h" 150b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 1681ad6265SDimitry Andric #include "SIMachineFunctionInfo.h" 17*0fca6ea1SDimitry Andric #include "llvm/CodeGen/MachineFrameInfo.h" 180b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h" 190b57cec5SDimitry Andric #include "llvm/CodeGen/ScheduleDAG.h" 2006c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h" 210b57cec5SDimitry Andric 220b57cec5SDimitry Andric using namespace llvm; 230b57cec5SDimitry Andric 2481ad6265SDimitry Andric namespace { 2581ad6265SDimitry Andric 2681ad6265SDimitry Andric struct MFMAPaddingRatioParser : public cl::parser<unsigned> { 2781ad6265SDimitry Andric MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {} 2881ad6265SDimitry Andric 2981ad6265SDimitry Andric bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) { 3081ad6265SDimitry Andric if (Arg.getAsInteger(0, Value)) 3181ad6265SDimitry Andric return O.error("'" + Arg + "' value invalid for uint argument!"); 3281ad6265SDimitry 
Andric 3381ad6265SDimitry Andric if (Value > 100) 3481ad6265SDimitry Andric return O.error("'" + Arg + "' value must be in the range [0, 100]!"); 3581ad6265SDimitry Andric 3681ad6265SDimitry Andric return false; 3781ad6265SDimitry Andric } 3881ad6265SDimitry Andric }; 3981ad6265SDimitry Andric 4081ad6265SDimitry Andric } // end anonymous namespace 4181ad6265SDimitry Andric 4281ad6265SDimitry Andric static cl::opt<unsigned, false, MFMAPaddingRatioParser> 4381ad6265SDimitry Andric MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, 4481ad6265SDimitry Andric cl::desc("Fill a percentage of the latency between " 4581ad6265SDimitry Andric "neighboring MFMA with s_nops.")); 4681ad6265SDimitry Andric 470b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 4881ad6265SDimitry Andric // Hazard Recognizer Implementation 490b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 500b57cec5SDimitry Andric 51fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 52fe6060f1SDimitry Andric const GCNSubtarget &ST); 53fe6060f1SDimitry Andric 540b57cec5SDimitry Andric GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : 550b57cec5SDimitry Andric IsHazardRecognizerMode(false), 560b57cec5SDimitry Andric CurrCycleInstr(nullptr), 570b57cec5SDimitry Andric MF(MF), 580b57cec5SDimitry Andric ST(MF.getSubtarget<GCNSubtarget>()), 590b57cec5SDimitry Andric TII(*ST.getInstrInfo()), 600b57cec5SDimitry Andric TRI(TII.getRegisterInfo()), 610b57cec5SDimitry Andric ClauseUses(TRI.getNumRegUnits()), 620b57cec5SDimitry Andric ClauseDefs(TRI.getNumRegUnits()) { 63fe6060f1SDimitry Andric MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 
19 : 5; 640b57cec5SDimitry Andric TSchedModel.init(&ST); 65fe6060f1SDimitry Andric RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); 660b57cec5SDimitry Andric } 670b57cec5SDimitry Andric 68e8d8bef9SDimitry Andric void GCNHazardRecognizer::Reset() { 69e8d8bef9SDimitry Andric EmittedInstrs.clear(); 70e8d8bef9SDimitry Andric } 71e8d8bef9SDimitry Andric 720b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 730b57cec5SDimitry Andric EmitInstruction(SU->getInstr()); 740b57cec5SDimitry Andric } 750b57cec5SDimitry Andric 760b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 770b57cec5SDimitry Andric CurrCycleInstr = MI; 780b57cec5SDimitry Andric } 790b57cec5SDimitry Andric 800b57cec5SDimitry Andric static bool isDivFMas(unsigned Opcode) { 81e8d8bef9SDimitry Andric return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; 820b57cec5SDimitry Andric } 830b57cec5SDimitry Andric 840b57cec5SDimitry Andric static bool isSGetReg(unsigned Opcode) { 850b57cec5SDimitry Andric return Opcode == AMDGPU::S_GETREG_B32; 860b57cec5SDimitry Andric } 870b57cec5SDimitry Andric 880b57cec5SDimitry Andric static bool isSSetReg(unsigned Opcode) { 89e8d8bef9SDimitry Andric switch (Opcode) { 90e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_B32: 91e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_B32_mode: 92e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_IMM32_B32: 93e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_IMM32_B32_mode: 94e8d8bef9SDimitry Andric return true; 95e8d8bef9SDimitry Andric } 96e8d8bef9SDimitry Andric return false; 970b57cec5SDimitry Andric } 980b57cec5SDimitry Andric 990b57cec5SDimitry Andric static bool isRWLane(unsigned Opcode) { 1000b57cec5SDimitry Andric return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 1010b57cec5SDimitry Andric } 1020b57cec5SDimitry Andric 1030b57cec5SDimitry Andric static bool isRFE(unsigned Opcode) { 1040b57cec5SDimitry 
Andric return Opcode == AMDGPU::S_RFE_B64; 1050b57cec5SDimitry Andric } 1060b57cec5SDimitry Andric 1070b57cec5SDimitry Andric static bool isSMovRel(unsigned Opcode) { 1080b57cec5SDimitry Andric switch (Opcode) { 1090b57cec5SDimitry Andric case AMDGPU::S_MOVRELS_B32: 1100b57cec5SDimitry Andric case AMDGPU::S_MOVRELS_B64: 1110b57cec5SDimitry Andric case AMDGPU::S_MOVRELD_B32: 1120b57cec5SDimitry Andric case AMDGPU::S_MOVRELD_B64: 1130b57cec5SDimitry Andric return true; 1140b57cec5SDimitry Andric default: 1150b57cec5SDimitry Andric return false; 1160b57cec5SDimitry Andric } 1170b57cec5SDimitry Andric } 1180b57cec5SDimitry Andric 119fe6060f1SDimitry Andric static bool isDGEMM(unsigned Opcode) { 12081ad6265SDimitry Andric return AMDGPU::getMAIIsDGEMM(Opcode); 121fe6060f1SDimitry Andric } 122fe6060f1SDimitry Andric 123fe6060f1SDimitry Andric static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { 124fe6060f1SDimitry Andric unsigned Opcode = MI.getOpcode(); 125fe6060f1SDimitry Andric 126fe6060f1SDimitry Andric if (!SIInstrInfo::isMAI(MI) || 127fe6060f1SDimitry Andric isDGEMM(Opcode) || 128fe6060f1SDimitry Andric Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 129fe6060f1SDimitry Andric Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) 130fe6060f1SDimitry Andric return false; 131fe6060f1SDimitry Andric 13281ad6265SDimitry Andric if (!ST.hasGFX940Insts()) 133fe6060f1SDimitry Andric return true; 13481ad6265SDimitry Andric 13581ad6265SDimitry Andric return AMDGPU::getMAIIsGFX940XDL(Opcode); 136fe6060f1SDimitry Andric } 137fe6060f1SDimitry Andric 1380b57cec5SDimitry Andric static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 1390b57cec5SDimitry Andric const MachineInstr &MI) { 1400b57cec5SDimitry Andric if (TII.isAlwaysGDS(MI.getOpcode())) 1410b57cec5SDimitry Andric return true; 1420b57cec5SDimitry Andric 1430b57cec5SDimitry Andric switch (MI.getOpcode()) { 1440b57cec5SDimitry Andric case AMDGPU::S_SENDMSG: 1450b57cec5SDimitry Andric case AMDGPU::S_SENDMSGHALT: 
1460b57cec5SDimitry Andric case AMDGPU::S_TTRACEDATA: 1470b57cec5SDimitry Andric return true; 1480b57cec5SDimitry Andric // These DS opcodes don't support GDS. 1490b57cec5SDimitry Andric case AMDGPU::DS_NOP: 1500b57cec5SDimitry Andric case AMDGPU::DS_PERMUTE_B32: 1510b57cec5SDimitry Andric case AMDGPU::DS_BPERMUTE_B32: 1520b57cec5SDimitry Andric return false; 1530b57cec5SDimitry Andric default: 1540b57cec5SDimitry Andric if (TII.isDS(MI.getOpcode())) { 1550b57cec5SDimitry Andric int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 1560b57cec5SDimitry Andric AMDGPU::OpName::gds); 1570b57cec5SDimitry Andric if (MI.getOperand(GDS).getImm()) 1580b57cec5SDimitry Andric return true; 1590b57cec5SDimitry Andric } 1600b57cec5SDimitry Andric return false; 1610b57cec5SDimitry Andric } 1620b57cec5SDimitry Andric } 1630b57cec5SDimitry Andric 1640b57cec5SDimitry Andric static bool isPermlane(const MachineInstr &MI) { 1650b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode(); 166e8d8bef9SDimitry Andric return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || 1677a6dacacSDimitry Andric Opcode == AMDGPU::V_PERMLANE64_B32 || 1685f757f3fSDimitry Andric Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || 1695f757f3fSDimitry Andric Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || 1705f757f3fSDimitry Andric Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; 1710b57cec5SDimitry Andric } 1720b57cec5SDimitry Andric 17381ad6265SDimitry Andric static bool isLdsDma(const MachineInstr &MI) { 17481ad6265SDimitry Andric return SIInstrInfo::isVALU(MI) && 17581ad6265SDimitry Andric (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); 17681ad6265SDimitry Andric } 17781ad6265SDimitry Andric 1780b57cec5SDimitry Andric static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 1790b57cec5SDimitry Andric const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 1800b57cec5SDimitry Andric AMDGPU::OpName::simm16); 181*0fca6ea1SDimitry Andric return 
std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); 1820b57cec5SDimitry Andric } 1830b57cec5SDimitry Andric 1840b57cec5SDimitry Andric ScheduleHazardRecognizer::HazardType 1850b57cec5SDimitry Andric GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 1860b57cec5SDimitry Andric MachineInstr *MI = SU->getInstr(); 187e8d8bef9SDimitry Andric // If we are not in "HazardRecognizerMode" and therefore not being run from 188e8d8bef9SDimitry Andric // the scheduler, track possible stalls from hazards but don't insert noops. 189e8d8bef9SDimitry Andric auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; 190e8d8bef9SDimitry Andric 1910b57cec5SDimitry Andric if (MI->isBundle()) 1920b57cec5SDimitry Andric return NoHazard; 1930b57cec5SDimitry Andric 1940b57cec5SDimitry Andric if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 195e8d8bef9SDimitry Andric return HazardType; 1960b57cec5SDimitry Andric 1970b57cec5SDimitry Andric if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 198e8d8bef9SDimitry Andric return HazardType; 1990b57cec5SDimitry Andric 2000b57cec5SDimitry Andric if (checkFPAtomicToDenormModeHazard(MI) > 0) 201e8d8bef9SDimitry Andric return HazardType; 2020b57cec5SDimitry Andric 2030b57cec5SDimitry Andric if (ST.hasNoDataDepHazard()) 2040b57cec5SDimitry Andric return NoHazard; 2050b57cec5SDimitry Andric 206fe6060f1SDimitry Andric // FIXME: Should flat be considered vmem? 
207fe6060f1SDimitry Andric if ((SIInstrInfo::isVMEM(*MI) || 208fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI)) 209fe6060f1SDimitry Andric && checkVMEMHazards(MI) > 0) 210fe6060f1SDimitry Andric return HazardType; 211fe6060f1SDimitry Andric 2120b57cec5SDimitry Andric if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 213e8d8bef9SDimitry Andric return HazardType; 2140b57cec5SDimitry Andric 2150b57cec5SDimitry Andric if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 216e8d8bef9SDimitry Andric return HazardType; 2170b57cec5SDimitry Andric 2180b57cec5SDimitry Andric if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 219e8d8bef9SDimitry Andric return HazardType; 2200b57cec5SDimitry Andric 2210b57cec5SDimitry Andric if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 222e8d8bef9SDimitry Andric return HazardType; 2230b57cec5SDimitry Andric 224fe6060f1SDimitry Andric if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 225fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 226fe6060f1SDimitry Andric SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 227fe6060f1SDimitry Andric return HazardType; 228fe6060f1SDimitry Andric 2290b57cec5SDimitry Andric if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 230e8d8bef9SDimitry Andric return HazardType; 2310b57cec5SDimitry Andric 2320b57cec5SDimitry Andric if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 233e8d8bef9SDimitry Andric return HazardType; 2340b57cec5SDimitry Andric 2350b57cec5SDimitry Andric if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 236e8d8bef9SDimitry Andric return HazardType; 2370b57cec5SDimitry Andric 23881ad6265SDimitry Andric if (((ST.hasReadM0MovRelInterpHazard() && 239bdd1243dSDimitry Andric (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 240bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 241bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 
24281ad6265SDimitry Andric (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 24381ad6265SDimitry Andric (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 24481ad6265SDimitry Andric (ST.hasReadM0LdsDirectHazard() && 245*0fca6ea1SDimitry Andric MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && 2460b57cec5SDimitry Andric checkReadM0Hazards(MI) > 0) 247e8d8bef9SDimitry Andric return HazardType; 2480b57cec5SDimitry Andric 2490b57cec5SDimitry Andric if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 250e8d8bef9SDimitry Andric return HazardType; 2510b57cec5SDimitry Andric 252e8d8bef9SDimitry Andric if ((SIInstrInfo::isVMEM(*MI) || 253e8d8bef9SDimitry Andric SIInstrInfo::isFLAT(*MI) || 254e8d8bef9SDimitry Andric SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) 255e8d8bef9SDimitry Andric return HazardType; 2560b57cec5SDimitry Andric 2570b57cec5SDimitry Andric if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) 258e8d8bef9SDimitry Andric return HazardType; 2590b57cec5SDimitry Andric 2600b57cec5SDimitry Andric return NoHazard; 2610b57cec5SDimitry Andric } 2620b57cec5SDimitry Andric 263e8d8bef9SDimitry Andric static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, 264e8d8bef9SDimitry Andric unsigned Quantity) { 265e8d8bef9SDimitry Andric while (Quantity > 0) { 266e8d8bef9SDimitry Andric unsigned Arg = std::min(Quantity, 8u); 267e8d8bef9SDimitry Andric Quantity -= Arg; 2680b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 269e8d8bef9SDimitry Andric .addImm(Arg - 1); 270e8d8bef9SDimitry Andric } 2710b57cec5SDimitry Andric } 2720b57cec5SDimitry Andric 27381ad6265SDimitry Andric unsigned 27481ad6265SDimitry Andric GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { 27581ad6265SDimitry Andric const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); 27681ad6265SDimitry Andric assert(TSchedModel.getWriteProcResBegin(SC) != 27781ad6265SDimitry 
Andric TSchedModel.getWriteProcResEnd(SC)); 2785f757f3fSDimitry Andric return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; 27981ad6265SDimitry Andric } 28081ad6265SDimitry Andric 2810b57cec5SDimitry Andric void GCNHazardRecognizer::processBundle() { 2820b57cec5SDimitry Andric MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 2830b57cec5SDimitry Andric MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 2840b57cec5SDimitry Andric // Check bundled MachineInstr's for hazards. 2850b57cec5SDimitry Andric for (; MI != E && MI->isInsideBundle(); ++MI) { 2860b57cec5SDimitry Andric CurrCycleInstr = &*MI; 2870b57cec5SDimitry Andric unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 2880b57cec5SDimitry Andric 289e8d8bef9SDimitry Andric if (IsHazardRecognizerMode) { 2900b57cec5SDimitry Andric fixHazards(CurrCycleInstr); 2910b57cec5SDimitry Andric 292e8d8bef9SDimitry Andric insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); 293e8d8bef9SDimitry Andric } 2940b57cec5SDimitry Andric 2950b57cec5SDimitry Andric // It’s unnecessary to track more than MaxLookAhead instructions. Since we 2960b57cec5SDimitry Andric // include the bundled MI directly after, only add a maximum of 2970b57cec5SDimitry Andric // (MaxLookAhead - 1) noops to EmittedInstrs. 
2980b57cec5SDimitry Andric for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 2990b57cec5SDimitry Andric EmittedInstrs.push_front(nullptr); 3000b57cec5SDimitry Andric 3010b57cec5SDimitry Andric EmittedInstrs.push_front(CurrCycleInstr); 3020b57cec5SDimitry Andric EmittedInstrs.resize(MaxLookAhead); 3030b57cec5SDimitry Andric } 3040b57cec5SDimitry Andric CurrCycleInstr = nullptr; 3050b57cec5SDimitry Andric } 3060b57cec5SDimitry Andric 307bdd1243dSDimitry Andric void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { 308bdd1243dSDimitry Andric assert(IsHazardRecognizerMode); 309bdd1243dSDimitry Andric 310bdd1243dSDimitry Andric unsigned NumPreNoops = PreEmitNoops(MI); 311bdd1243dSDimitry Andric EmitNoops(NumPreNoops); 312bdd1243dSDimitry Andric if (MI->isInsideBundle()) 313bdd1243dSDimitry Andric insertNoopsInBundle(MI, TII, NumPreNoops); 314bdd1243dSDimitry Andric else 315bdd1243dSDimitry Andric TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), 316bdd1243dSDimitry Andric NumPreNoops); 317bdd1243dSDimitry Andric EmitInstruction(MI); 318bdd1243dSDimitry Andric AdvanceCycle(); 319bdd1243dSDimitry Andric } 320bdd1243dSDimitry Andric 3210b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 3220b57cec5SDimitry Andric IsHazardRecognizerMode = true; 3230b57cec5SDimitry Andric CurrCycleInstr = MI; 3240b57cec5SDimitry Andric unsigned W = PreEmitNoopsCommon(MI); 3250b57cec5SDimitry Andric fixHazards(MI); 3260b57cec5SDimitry Andric CurrCycleInstr = nullptr; 3270b57cec5SDimitry Andric return W; 3280b57cec5SDimitry Andric } 3290b57cec5SDimitry Andric 3300b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 3310b57cec5SDimitry Andric if (MI->isBundle()) 3320b57cec5SDimitry Andric return 0; 3330b57cec5SDimitry Andric 334e8d8bef9SDimitry Andric int WaitStates = 0; 3350b57cec5SDimitry Andric 3360b57cec5SDimitry Andric if (SIInstrInfo::isSMRD(*MI)) 
3370b57cec5SDimitry Andric return std::max(WaitStates, checkSMRDHazards(MI)); 3380b57cec5SDimitry Andric 3390b57cec5SDimitry Andric if (ST.hasNSAtoVMEMBug()) 3400b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 3410b57cec5SDimitry Andric 3420b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 3430b57cec5SDimitry Andric 3440b57cec5SDimitry Andric if (ST.hasNoDataDepHazard()) 3450b57cec5SDimitry Andric return WaitStates; 3460b57cec5SDimitry Andric 347fe6060f1SDimitry Andric if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 348fe6060f1SDimitry Andric WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 349fe6060f1SDimitry Andric 3500b57cec5SDimitry Andric if (SIInstrInfo::isVALU(*MI)) 3510b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 3520b57cec5SDimitry Andric 3530b57cec5SDimitry Andric if (SIInstrInfo::isDPP(*MI)) 3540b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 3550b57cec5SDimitry Andric 3560b57cec5SDimitry Andric if (isDivFMas(MI->getOpcode())) 3570b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 3580b57cec5SDimitry Andric 3590b57cec5SDimitry Andric if (isRWLane(MI->getOpcode())) 3600b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 3610b57cec5SDimitry Andric 362fe6060f1SDimitry Andric if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || 363fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || 364fe6060f1SDimitry Andric SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) 365fe6060f1SDimitry Andric WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); 366fe6060f1SDimitry Andric 3670b57cec5SDimitry Andric if (MI->isInlineAsm()) 3680b57cec5SDimitry Andric return std::max(WaitStates, checkInlineAsmHazards(MI)); 3690b57cec5SDimitry Andric 3700b57cec5SDimitry Andric if (isSGetReg(MI->getOpcode())) 
3710b57cec5SDimitry Andric return std::max(WaitStates, checkGetRegHazards(MI)); 3720b57cec5SDimitry Andric 3730b57cec5SDimitry Andric if (isSSetReg(MI->getOpcode())) 3740b57cec5SDimitry Andric return std::max(WaitStates, checkSetRegHazards(MI)); 3750b57cec5SDimitry Andric 3760b57cec5SDimitry Andric if (isRFE(MI->getOpcode())) 3770b57cec5SDimitry Andric return std::max(WaitStates, checkRFEHazards(MI)); 3780b57cec5SDimitry Andric 37981ad6265SDimitry Andric if ((ST.hasReadM0MovRelInterpHazard() && 380bdd1243dSDimitry Andric (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) || 381bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || 382bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || 38381ad6265SDimitry Andric (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || 38481ad6265SDimitry Andric (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || 385*0fca6ea1SDimitry Andric (ST.hasReadM0LdsDirectHazard() && 386*0fca6ea1SDimitry Andric MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) 3870b57cec5SDimitry Andric return std::max(WaitStates, checkReadM0Hazards(MI)); 3880b57cec5SDimitry Andric 3890b57cec5SDimitry Andric if (SIInstrInfo::isMAI(*MI)) 3900b57cec5SDimitry Andric return std::max(WaitStates, checkMAIHazards(MI)); 3910b57cec5SDimitry Andric 392e8d8bef9SDimitry Andric if (SIInstrInfo::isVMEM(*MI) || 393e8d8bef9SDimitry Andric SIInstrInfo::isFLAT(*MI) || 394e8d8bef9SDimitry Andric SIInstrInfo::isDS(*MI)) 3950b57cec5SDimitry Andric return std::max(WaitStates, checkMAILdStHazards(MI)); 3960b57cec5SDimitry Andric 3970b57cec5SDimitry Andric return WaitStates; 3980b57cec5SDimitry Andric } 3990b57cec5SDimitry Andric 4000b57cec5SDimitry Andric void GCNHazardRecognizer::EmitNoop() { 4010b57cec5SDimitry Andric EmittedInstrs.push_front(nullptr); 4020b57cec5SDimitry Andric } 4030b57cec5SDimitry Andric 4040b57cec5SDimitry Andric void GCNHazardRecognizer::AdvanceCycle() { 4050b57cec5SDimitry Andric // When the scheduler 
detects a stall, it will call AdvanceCycle() without 4060b57cec5SDimitry Andric // emitting any instructions. 407e8d8bef9SDimitry Andric if (!CurrCycleInstr) { 408e8d8bef9SDimitry Andric EmittedInstrs.push_front(nullptr); 4090b57cec5SDimitry Andric return; 410e8d8bef9SDimitry Andric } 4110b57cec5SDimitry Andric 4120b57cec5SDimitry Andric if (CurrCycleInstr->isBundle()) { 4130b57cec5SDimitry Andric processBundle(); 4140b57cec5SDimitry Andric return; 4150b57cec5SDimitry Andric } 4160b57cec5SDimitry Andric 4170b57cec5SDimitry Andric unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 418349cc55cSDimitry Andric if (!NumWaitStates) { 419349cc55cSDimitry Andric CurrCycleInstr = nullptr; 420349cc55cSDimitry Andric return; 421349cc55cSDimitry Andric } 4220b57cec5SDimitry Andric 4230b57cec5SDimitry Andric // Keep track of emitted instructions 4240b57cec5SDimitry Andric EmittedInstrs.push_front(CurrCycleInstr); 4250b57cec5SDimitry Andric 4260b57cec5SDimitry Andric // Add a nullptr for each additional wait state after the first. Make sure 4270b57cec5SDimitry Andric // not to add more than getMaxLookAhead() items to the list, since we 4280b57cec5SDimitry Andric // truncate the list to that size right after this loop. 4290b57cec5SDimitry Andric for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 4300b57cec5SDimitry Andric i < e; ++i) { 4310b57cec5SDimitry Andric EmittedInstrs.push_front(nullptr); 4320b57cec5SDimitry Andric } 4330b57cec5SDimitry Andric 4340b57cec5SDimitry Andric // getMaxLookahead() is the largest number of wait states we will ever need 4350b57cec5SDimitry Andric // to insert, so there is no point in keeping track of more than that many 4360b57cec5SDimitry Andric // wait states. 
4370b57cec5SDimitry Andric EmittedInstrs.resize(getMaxLookAhead()); 4380b57cec5SDimitry Andric 4390b57cec5SDimitry Andric CurrCycleInstr = nullptr; 4400b57cec5SDimitry Andric } 4410b57cec5SDimitry Andric 4420b57cec5SDimitry Andric void GCNHazardRecognizer::RecedeCycle() { 4430b57cec5SDimitry Andric llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 4440b57cec5SDimitry Andric } 4450b57cec5SDimitry Andric 4460b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 4470b57cec5SDimitry Andric // Helper Functions 4480b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 4490b57cec5SDimitry Andric 450*0fca6ea1SDimitry Andric using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; 45181ad6265SDimitry Andric 452*0fca6ea1SDimitry Andric using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; 453*0fca6ea1SDimitry Andric using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; 45481ad6265SDimitry Andric 45581ad6265SDimitry Andric // Search for a hazard in a block and its predecessors. 45681ad6265SDimitry Andric template <typename StateT> 45781ad6265SDimitry Andric static bool 45881ad6265SDimitry Andric hasHazard(StateT State, 45981ad6265SDimitry Andric function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, 46081ad6265SDimitry Andric function_ref<void(StateT &, const MachineInstr &)> UpdateState, 46181ad6265SDimitry Andric const MachineBasicBlock *MBB, 46281ad6265SDimitry Andric MachineBasicBlock::const_reverse_instr_iterator I, 46381ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> &Visited) { 46481ad6265SDimitry Andric for (auto E = MBB->instr_rend(); I != E; ++I) { 46581ad6265SDimitry Andric // No need to look at parent BUNDLE instructions. 
46681ad6265SDimitry Andric if (I->isBundle()) 46781ad6265SDimitry Andric continue; 46881ad6265SDimitry Andric 46981ad6265SDimitry Andric switch (IsHazard(State, *I)) { 47081ad6265SDimitry Andric case HazardFound: 47181ad6265SDimitry Andric return true; 47281ad6265SDimitry Andric case HazardExpired: 47381ad6265SDimitry Andric return false; 47481ad6265SDimitry Andric default: 47581ad6265SDimitry Andric // Continue search 47681ad6265SDimitry Andric break; 47781ad6265SDimitry Andric } 47881ad6265SDimitry Andric 47981ad6265SDimitry Andric if (I->isInlineAsm() || I->isMetaInstruction()) 48081ad6265SDimitry Andric continue; 48181ad6265SDimitry Andric 48281ad6265SDimitry Andric UpdateState(State, *I); 48381ad6265SDimitry Andric } 48481ad6265SDimitry Andric 48581ad6265SDimitry Andric for (MachineBasicBlock *Pred : MBB->predecessors()) { 48681ad6265SDimitry Andric if (!Visited.insert(Pred).second) 48781ad6265SDimitry Andric continue; 48881ad6265SDimitry Andric 48981ad6265SDimitry Andric if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), 49081ad6265SDimitry Andric Visited)) 49181ad6265SDimitry Andric return true; 49281ad6265SDimitry Andric } 49381ad6265SDimitry Andric 49481ad6265SDimitry Andric return false; 49581ad6265SDimitry Andric } 4960b57cec5SDimitry Andric 4970b57cec5SDimitry Andric // Returns a minimum wait states since \p I walking all predecessors. 4980b57cec5SDimitry Andric // Only scans until \p IsExpired does not return true. 4990b57cec5SDimitry Andric // Can only be run in a hazard recognizer mode. 
50081ad6265SDimitry Andric static int getWaitStatesSince( 50181ad6265SDimitry Andric GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, 50281ad6265SDimitry Andric MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, 50381ad6265SDimitry Andric IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, 50481ad6265SDimitry Andric GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { 5050b57cec5SDimitry Andric for (auto E = MBB->instr_rend(); I != E; ++I) { 5060b57cec5SDimitry Andric // Don't add WaitStates for parent BUNDLE instructions. 5070b57cec5SDimitry Andric if (I->isBundle()) 5080b57cec5SDimitry Andric continue; 5090b57cec5SDimitry Andric 510fe6060f1SDimitry Andric if (IsHazard(*I)) 5110b57cec5SDimitry Andric return WaitStates; 5120b57cec5SDimitry Andric 513349cc55cSDimitry Andric if (I->isInlineAsm()) 5140b57cec5SDimitry Andric continue; 5150b57cec5SDimitry Andric 51681ad6265SDimitry Andric WaitStates += GetNumWaitStates(*I); 5170b57cec5SDimitry Andric 518fe6060f1SDimitry Andric if (IsExpired(*I, WaitStates)) 5190b57cec5SDimitry Andric return std::numeric_limits<int>::max(); 5200b57cec5SDimitry Andric } 5210b57cec5SDimitry Andric 522fe6060f1SDimitry Andric int MinWaitStates = std::numeric_limits<int>::max(); 5230b57cec5SDimitry Andric for (MachineBasicBlock *Pred : MBB->predecessors()) { 5240b57cec5SDimitry Andric if (!Visited.insert(Pred).second) 5250b57cec5SDimitry Andric continue; 5260b57cec5SDimitry Andric 52781ad6265SDimitry Andric int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates, 52881ad6265SDimitry Andric IsExpired, Visited, GetNumWaitStates); 5290b57cec5SDimitry Andric 530fe6060f1SDimitry Andric MinWaitStates = std::min(MinWaitStates, W); 5310b57cec5SDimitry Andric } 5320b57cec5SDimitry Andric 5330b57cec5SDimitry Andric return MinWaitStates; 5340b57cec5SDimitry Andric } 5350b57cec5SDimitry Andric 5360b57cec5SDimitry Andric static int 
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 537fe6060f1SDimitry Andric const MachineInstr *MI, IsExpiredFn IsExpired) { 5380b57cec5SDimitry Andric DenseSet<const MachineBasicBlock *> Visited; 5390b57cec5SDimitry Andric return getWaitStatesSince(IsHazard, MI->getParent(), 5400b57cec5SDimitry Andric std::next(MI->getReverseIterator()), 5410b57cec5SDimitry Andric 0, IsExpired, Visited); 5420b57cec5SDimitry Andric } 5430b57cec5SDimitry Andric 5440b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { 5450b57cec5SDimitry Andric if (IsHazardRecognizerMode) { 546fe6060f1SDimitry Andric auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { 5470b57cec5SDimitry Andric return WaitStates >= Limit; 5480b57cec5SDimitry Andric }; 5490b57cec5SDimitry Andric return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); 5500b57cec5SDimitry Andric } 5510b57cec5SDimitry Andric 5520b57cec5SDimitry Andric int WaitStates = 0; 5530b57cec5SDimitry Andric for (MachineInstr *MI : EmittedInstrs) { 5540b57cec5SDimitry Andric if (MI) { 555fe6060f1SDimitry Andric if (IsHazard(*MI)) 5560b57cec5SDimitry Andric return WaitStates; 5570b57cec5SDimitry Andric 5580b57cec5SDimitry Andric if (MI->isInlineAsm()) 5590b57cec5SDimitry Andric continue; 5600b57cec5SDimitry Andric } 5610b57cec5SDimitry Andric ++WaitStates; 5620b57cec5SDimitry Andric 5630b57cec5SDimitry Andric if (WaitStates >= Limit) 5640b57cec5SDimitry Andric break; 5650b57cec5SDimitry Andric } 5660b57cec5SDimitry Andric return std::numeric_limits<int>::max(); 5670b57cec5SDimitry Andric } 5680b57cec5SDimitry Andric 5690b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, 5700b57cec5SDimitry Andric IsHazardFn IsHazardDef, 5710b57cec5SDimitry Andric int Limit) { 5720b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5730b57cec5SDimitry Andric 574fe6060f1SDimitry Andric auto IsHazardFn = [IsHazardDef, TRI, 
Reg](const MachineInstr &MI) { 575fe6060f1SDimitry Andric return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); 5760b57cec5SDimitry Andric }; 5770b57cec5SDimitry Andric 5780b57cec5SDimitry Andric return getWaitStatesSince(IsHazardFn, Limit); 5790b57cec5SDimitry Andric } 5800b57cec5SDimitry Andric 5810b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, 5820b57cec5SDimitry Andric int Limit) { 583fe6060f1SDimitry Andric auto IsHazardFn = [IsHazard](const MachineInstr &MI) { 584fe6060f1SDimitry Andric return isSSetReg(MI.getOpcode()) && IsHazard(MI); 5850b57cec5SDimitry Andric }; 5860b57cec5SDimitry Andric 5870b57cec5SDimitry Andric return getWaitStatesSince(IsHazardFn, Limit); 5880b57cec5SDimitry Andric } 5890b57cec5SDimitry Andric 5900b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 5910b57cec5SDimitry Andric // No-op Hazard Detection 5920b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 5930b57cec5SDimitry Andric 594e8d8bef9SDimitry Andric static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, 595e8d8bef9SDimitry Andric MCRegister Reg) { 59606c3fb27SDimitry Andric for (MCRegUnit Unit : TRI.regunits(Reg)) 59706c3fb27SDimitry Andric BV.set(Unit); 5980b57cec5SDimitry Andric } 5990b57cec5SDimitry Andric 6000b57cec5SDimitry Andric static void addRegsToSet(const SIRegisterInfo &TRI, 6010b57cec5SDimitry Andric iterator_range<MachineInstr::const_mop_iterator> Ops, 60206c3fb27SDimitry Andric BitVector &DefSet, BitVector &UseSet) { 6030b57cec5SDimitry Andric for (const MachineOperand &Op : Ops) { 6040b57cec5SDimitry Andric if (Op.isReg()) 60506c3fb27SDimitry Andric addRegUnits(TRI, Op.isDef() ? 
DefSet : UseSet, Op.getReg().asMCReg()); 6060b57cec5SDimitry Andric } 6070b57cec5SDimitry Andric } 6080b57cec5SDimitry Andric 6090b57cec5SDimitry Andric void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { 61006c3fb27SDimitry Andric addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses); 6110b57cec5SDimitry Andric } 6120b57cec5SDimitry Andric 6135ffd83dbSDimitry Andric static bool breaksSMEMSoftClause(MachineInstr *MI) { 6145ffd83dbSDimitry Andric return !SIInstrInfo::isSMRD(*MI); 6155ffd83dbSDimitry Andric } 6165ffd83dbSDimitry Andric 6175ffd83dbSDimitry Andric static bool breaksVMEMSoftClause(MachineInstr *MI) { 6185ffd83dbSDimitry Andric return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI); 6195ffd83dbSDimitry Andric } 6205ffd83dbSDimitry Andric 6210b57cec5SDimitry Andric int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { 6220b57cec5SDimitry Andric // SMEM soft clause are only present on VI+, and only matter if xnack is 6230b57cec5SDimitry Andric // enabled. 6240b57cec5SDimitry Andric if (!ST.isXNACKEnabled()) 6250b57cec5SDimitry Andric return 0; 6260b57cec5SDimitry Andric 6270b57cec5SDimitry Andric bool IsSMRD = TII.isSMRD(*MEM); 6280b57cec5SDimitry Andric 6290b57cec5SDimitry Andric resetClause(); 6300b57cec5SDimitry Andric 6310b57cec5SDimitry Andric // A soft-clause is any group of consecutive SMEM instructions. The 6320b57cec5SDimitry Andric // instructions in this group may return out of order and/or may be 6330b57cec5SDimitry Andric // replayed (i.e. the same instruction issued more than once). 6340b57cec5SDimitry Andric // 6350b57cec5SDimitry Andric // In order to handle these situations correctly we need to make sure that 6360b57cec5SDimitry Andric // when a clause has more than one instruction, no instruction in the clause 6370b57cec5SDimitry Andric // writes to a register that is read by another instruction in the clause 63881ad6265SDimitry Andric // (including itself). 
If we encounter this situation, we need to break the 6390b57cec5SDimitry Andric // clause by inserting a non SMEM instruction. 6400b57cec5SDimitry Andric 6410b57cec5SDimitry Andric for (MachineInstr *MI : EmittedInstrs) { 6420b57cec5SDimitry Andric // When we hit a non-SMEM instruction then we have passed the start of the 6430b57cec5SDimitry Andric // clause and we can stop. 6440b57cec5SDimitry Andric if (!MI) 6450b57cec5SDimitry Andric break; 6460b57cec5SDimitry Andric 6475ffd83dbSDimitry Andric if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) 6480b57cec5SDimitry Andric break; 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric addClauseInst(*MI); 6510b57cec5SDimitry Andric } 6520b57cec5SDimitry Andric 6530b57cec5SDimitry Andric if (ClauseDefs.none()) 6540b57cec5SDimitry Andric return 0; 6550b57cec5SDimitry Andric 6560b57cec5SDimitry Andric // We need to make sure not to put loads and stores in the same clause if they 6570b57cec5SDimitry Andric // use the same address. For now, just start a new clause whenever we see a 6580b57cec5SDimitry Andric // store. 6590b57cec5SDimitry Andric if (MEM->mayStore()) 6600b57cec5SDimitry Andric return 1; 6610b57cec5SDimitry Andric 6620b57cec5SDimitry Andric addClauseInst(*MEM); 6630b57cec5SDimitry Andric 6640b57cec5SDimitry Andric // If the set of defs and uses intersect then we cannot add this instruction 6650b57cec5SDimitry Andric // to the clause, so we have a hazard. 6660b57cec5SDimitry Andric return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 6670b57cec5SDimitry Andric } 6680b57cec5SDimitry Andric 6690b57cec5SDimitry Andric int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 6700b57cec5SDimitry Andric int WaitStatesNeeded = 0; 6710b57cec5SDimitry Andric 6720b57cec5SDimitry Andric WaitStatesNeeded = checkSoftClauseHazards(SMRD); 6730b57cec5SDimitry Andric 6740b57cec5SDimitry Andric // This SMRD hazard only affects SI. 
6750b57cec5SDimitry Andric if (!ST.hasSMRDReadVALUDefHazard()) 6760b57cec5SDimitry Andric return WaitStatesNeeded; 6770b57cec5SDimitry Andric 6780b57cec5SDimitry Andric // A read of an SGPR by SMRD instruction requires 4 wait states when the 6790b57cec5SDimitry Andric // SGPR was written by a VALU instruction. 6800b57cec5SDimitry Andric int SmrdSgprWaitStates = 4; 681fe6060f1SDimitry Andric auto IsHazardDefFn = [this](const MachineInstr &MI) { 682fe6060f1SDimitry Andric return TII.isVALU(MI); 683fe6060f1SDimitry Andric }; 684fe6060f1SDimitry Andric auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { 685fe6060f1SDimitry Andric return TII.isSALU(MI); 686fe6060f1SDimitry Andric }; 6870b57cec5SDimitry Andric 6880b57cec5SDimitry Andric bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 6890b57cec5SDimitry Andric 6900b57cec5SDimitry Andric for (const MachineOperand &Use : SMRD->uses()) { 6910b57cec5SDimitry Andric if (!Use.isReg()) 6920b57cec5SDimitry Andric continue; 6930b57cec5SDimitry Andric int WaitStatesNeededForUse = 6940b57cec5SDimitry Andric SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 6950b57cec5SDimitry Andric SmrdSgprWaitStates); 6960b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 6970b57cec5SDimitry Andric 6980b57cec5SDimitry Andric // This fixes what appears to be undocumented hardware behavior in SI where 6990b57cec5SDimitry Andric // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 7000b57cec5SDimitry Andric // needs some number of nops in between. We don't know how many we need, but 7010b57cec5SDimitry Andric // let's use 4. 
This wasn't discovered before probably because the only 7020b57cec5SDimitry Andric // case when this happens is when we expand a 64-bit pointer into a full 7030b57cec5SDimitry Andric // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 7040b57cec5SDimitry Andric // probably never encountered in the closed-source land. 7050b57cec5SDimitry Andric if (IsBufferSMRD) { 7060b57cec5SDimitry Andric int WaitStatesNeededForUse = 7070b57cec5SDimitry Andric SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 7080b57cec5SDimitry Andric IsBufferHazardDefFn, 7090b57cec5SDimitry Andric SmrdSgprWaitStates); 7100b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 7110b57cec5SDimitry Andric } 7120b57cec5SDimitry Andric } 7130b57cec5SDimitry Andric 7140b57cec5SDimitry Andric return WaitStatesNeeded; 7150b57cec5SDimitry Andric } 7160b57cec5SDimitry Andric 7170b57cec5SDimitry Andric int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 7180b57cec5SDimitry Andric if (!ST.hasVMEMReadSGPRVALUDefHazard()) 7190b57cec5SDimitry Andric return 0; 7200b57cec5SDimitry Andric 7210b57cec5SDimitry Andric int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 7220b57cec5SDimitry Andric 7230b57cec5SDimitry Andric // A read of an SGPR by a VMEM instruction requires 5 wait states when the 7240b57cec5SDimitry Andric // SGPR was written by a VALU Instruction. 
7250b57cec5SDimitry Andric const int VmemSgprWaitStates = 5; 726fe6060f1SDimitry Andric auto IsHazardDefFn = [this](const MachineInstr &MI) { 727fe6060f1SDimitry Andric return TII.isVALU(MI); 728fe6060f1SDimitry Andric }; 7290b57cec5SDimitry Andric for (const MachineOperand &Use : VMEM->uses()) { 730fe6060f1SDimitry Andric if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) 7310b57cec5SDimitry Andric continue; 7320b57cec5SDimitry Andric 7330b57cec5SDimitry Andric int WaitStatesNeededForUse = 7340b57cec5SDimitry Andric VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 7350b57cec5SDimitry Andric VmemSgprWaitStates); 7360b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 7370b57cec5SDimitry Andric } 7380b57cec5SDimitry Andric return WaitStatesNeeded; 7390b57cec5SDimitry Andric } 7400b57cec5SDimitry Andric 7410b57cec5SDimitry Andric int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 7420b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 7430b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 7440b57cec5SDimitry Andric 7450b57cec5SDimitry Andric // Check for DPP VGPR read after VALU VGPR write and EXEC write. 
7460b57cec5SDimitry Andric int DppVgprWaitStates = 2; 7470b57cec5SDimitry Andric int DppExecWaitStates = 5; 7480b57cec5SDimitry Andric int WaitStatesNeeded = 0; 749fe6060f1SDimitry Andric auto IsHazardDefFn = [TII](const MachineInstr &MI) { 750fe6060f1SDimitry Andric return TII->isVALU(MI); 751fe6060f1SDimitry Andric }; 7520b57cec5SDimitry Andric 7530b57cec5SDimitry Andric for (const MachineOperand &Use : DPP->uses()) { 7540b57cec5SDimitry Andric if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 7550b57cec5SDimitry Andric continue; 7560b57cec5SDimitry Andric int WaitStatesNeededForUse = 757fe6060f1SDimitry Andric DppVgprWaitStates - getWaitStatesSinceDef( 758fe6060f1SDimitry Andric Use.getReg(), 759fe6060f1SDimitry Andric [](const MachineInstr &) { return true; }, 7600b57cec5SDimitry Andric DppVgprWaitStates); 7610b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 7620b57cec5SDimitry Andric } 7630b57cec5SDimitry Andric 7640b57cec5SDimitry Andric WaitStatesNeeded = std::max( 7650b57cec5SDimitry Andric WaitStatesNeeded, 7660b57cec5SDimitry Andric DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 7670b57cec5SDimitry Andric DppExecWaitStates)); 7680b57cec5SDimitry Andric 7690b57cec5SDimitry Andric return WaitStatesNeeded; 7700b57cec5SDimitry Andric } 7710b57cec5SDimitry Andric 7720b57cec5SDimitry Andric int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 7730b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 7740b57cec5SDimitry Andric 7750b57cec5SDimitry Andric // v_div_fmas requires 4 wait states after a write to vcc from a VALU 7760b57cec5SDimitry Andric // instruction. 
7770b57cec5SDimitry Andric const int DivFMasWaitStates = 4; 778fe6060f1SDimitry Andric auto IsHazardDefFn = [TII](const MachineInstr &MI) { 779fe6060f1SDimitry Andric return TII->isVALU(MI); 780fe6060f1SDimitry Andric }; 7810b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, 7820b57cec5SDimitry Andric DivFMasWaitStates); 7830b57cec5SDimitry Andric 7840b57cec5SDimitry Andric return DivFMasWaitStates - WaitStatesNeeded; 7850b57cec5SDimitry Andric } 7860b57cec5SDimitry Andric 7870b57cec5SDimitry Andric int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { 7880b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 7890b57cec5SDimitry Andric unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr); 7900b57cec5SDimitry Andric 7910b57cec5SDimitry Andric const int GetRegWaitStates = 2; 792fe6060f1SDimitry Andric auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) { 793fe6060f1SDimitry Andric return GetRegHWReg == getHWReg(TII, MI); 7940b57cec5SDimitry Andric }; 7950b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); 7960b57cec5SDimitry Andric 7970b57cec5SDimitry Andric return GetRegWaitStates - WaitStatesNeeded; 7980b57cec5SDimitry Andric } 7990b57cec5SDimitry Andric 8000b57cec5SDimitry Andric int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { 8010b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 8020b57cec5SDimitry Andric unsigned HWReg = getHWReg(TII, *SetRegInstr); 8030b57cec5SDimitry Andric 8040b57cec5SDimitry Andric const int SetRegWaitStates = ST.getSetRegWaitStates(); 805fe6060f1SDimitry Andric auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) { 806fe6060f1SDimitry Andric return HWReg == getHWReg(TII, MI); 8070b57cec5SDimitry Andric }; 8080b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); 8090b57cec5SDimitry Andric return SetRegWaitStates - 
WaitStatesNeeded; 8100b57cec5SDimitry Andric } 8110b57cec5SDimitry Andric 8120b57cec5SDimitry Andric int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { 8130b57cec5SDimitry Andric if (!MI.mayStore()) 8140b57cec5SDimitry Andric return -1; 8150b57cec5SDimitry Andric 8160b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 8170b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode(); 8180b57cec5SDimitry Andric const MCInstrDesc &Desc = MI.getDesc(); 8190b57cec5SDimitry Andric 8200b57cec5SDimitry Andric int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); 8210b57cec5SDimitry Andric int VDataRCID = -1; 8220b57cec5SDimitry Andric if (VDataIdx != -1) 823bdd1243dSDimitry Andric VDataRCID = Desc.operands()[VDataIdx].RegClass; 8240b57cec5SDimitry Andric 8250b57cec5SDimitry Andric if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { 8260b57cec5SDimitry Andric // There is no hazard if the instruction does not use vector regs 8270b57cec5SDimitry Andric // (like wbinvl1) 8280b57cec5SDimitry Andric if (VDataIdx == -1) 8290b57cec5SDimitry Andric return -1; 8300b57cec5SDimitry Andric // For MUBUF/MTBUF instructions this hazard only exists if the 8310b57cec5SDimitry Andric // instruction is not using a register in the soffset field. 8320b57cec5SDimitry Andric const MachineOperand *SOffset = 8330b57cec5SDimitry Andric TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 8340b57cec5SDimitry Andric // If we have no soffset operand, then assume this field has been 8350b57cec5SDimitry Andric // hardcoded to zero. 
8360b57cec5SDimitry Andric if (AMDGPU::getRegBitWidth(VDataRCID) > 64 && 8370b57cec5SDimitry Andric (!SOffset || !SOffset->isReg())) 8380b57cec5SDimitry Andric return VDataIdx; 8390b57cec5SDimitry Andric } 8400b57cec5SDimitry Andric 8410b57cec5SDimitry Andric // MIMG instructions create a hazard if they don't use a 256-bit T# and 8420b57cec5SDimitry Andric // the store size is greater than 8 bytes and they have more than two bits 8430b57cec5SDimitry Andric // of their dmask set. 8440b57cec5SDimitry Andric // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. 8450b57cec5SDimitry Andric if (TII->isMIMG(MI)) { 8460b57cec5SDimitry Andric int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 8470b57cec5SDimitry Andric assert(SRsrcIdx != -1 && 848bdd1243dSDimitry Andric AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); 8490b57cec5SDimitry Andric (void)SRsrcIdx; 8500b57cec5SDimitry Andric } 8510b57cec5SDimitry Andric 8520b57cec5SDimitry Andric if (TII->isFLAT(MI)) { 8530b57cec5SDimitry Andric int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); 854bdd1243dSDimitry Andric if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64) 8550b57cec5SDimitry Andric return DataIdx; 8560b57cec5SDimitry Andric } 8570b57cec5SDimitry Andric 8580b57cec5SDimitry Andric return -1; 8590b57cec5SDimitry Andric } 8600b57cec5SDimitry Andric 861e8d8bef9SDimitry Andric int 862e8d8bef9SDimitry Andric GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, 8630b57cec5SDimitry Andric const MachineRegisterInfo &MRI) { 8640b57cec5SDimitry Andric // Helper to check for the hazard where VMEM instructions that store more than 8650b57cec5SDimitry Andric // 8 bytes can have there store data over written by the next instruction. 
8660b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 8670b57cec5SDimitry Andric 86881ad6265SDimitry Andric const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1; 8690b57cec5SDimitry Andric int WaitStatesNeeded = 0; 8700b57cec5SDimitry Andric 871fe6060f1SDimitry Andric if (!TRI->isVectorRegister(MRI, Def.getReg())) 8720b57cec5SDimitry Andric return WaitStatesNeeded; 8738bcb0991SDimitry Andric Register Reg = Def.getReg(); 874fe6060f1SDimitry Andric auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) { 875fe6060f1SDimitry Andric int DataIdx = createsVALUHazard(MI); 8760b57cec5SDimitry Andric return DataIdx >= 0 && 877fe6060f1SDimitry Andric TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg); 8780b57cec5SDimitry Andric }; 8790b57cec5SDimitry Andric int WaitStatesNeededForDef = 8800b57cec5SDimitry Andric VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); 8810b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 8820b57cec5SDimitry Andric 8830b57cec5SDimitry Andric return WaitStatesNeeded; 8840b57cec5SDimitry Andric } 8850b57cec5SDimitry Andric 8860b57cec5SDimitry Andric int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { 88781ad6265SDimitry Andric int WaitStatesNeeded = 0; 88881ad6265SDimitry Andric 88981ad6265SDimitry Andric if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { 89081ad6265SDimitry Andric const int TransDefWaitstates = 1; 89181ad6265SDimitry Andric 89281ad6265SDimitry Andric auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { 89381ad6265SDimitry Andric if (!SIInstrInfo::isTRANS(MI)) 89481ad6265SDimitry Andric return false; 89581ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 89681ad6265SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 89781ad6265SDimitry Andric Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); 89881ad6265SDimitry Andric 89981ad6265SDimitry Andric for (const 
MachineOperand &Use : VALU->explicit_uses()) { 90081ad6265SDimitry Andric if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) 90181ad6265SDimitry Andric return true; 90281ad6265SDimitry Andric } 90381ad6265SDimitry Andric 90481ad6265SDimitry Andric return false; 90581ad6265SDimitry Andric }; 90681ad6265SDimitry Andric 90781ad6265SDimitry Andric int WaitStatesNeededForDef = 90881ad6265SDimitry Andric TransDefWaitstates - 90981ad6265SDimitry Andric getWaitStatesSince(IsTransDefFn, TransDefWaitstates); 91081ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 91181ad6265SDimitry Andric } 91281ad6265SDimitry Andric 91381ad6265SDimitry Andric if (ST.hasDstSelForwardingHazard()) { 91481ad6265SDimitry Andric const int Shift16DefWaitstates = 1; 91581ad6265SDimitry Andric 91681ad6265SDimitry Andric auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { 91781ad6265SDimitry Andric if (!SIInstrInfo::isVALU(MI)) 91881ad6265SDimitry Andric return false; 91981ad6265SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 92081ad6265SDimitry Andric if (SIInstrInfo::isSDWA(MI)) { 92181ad6265SDimitry Andric if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) 92281ad6265SDimitry Andric if (DstSel->getImm() == AMDGPU::SDWA::DWORD) 92381ad6265SDimitry Andric return false; 92481ad6265SDimitry Andric } else { 925bdd1243dSDimitry Andric if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || 92681ad6265SDimitry Andric !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) 92781ad6265SDimitry Andric ->getImm() & 92881ad6265SDimitry Andric SISrcMods::DST_OP_SEL)) 92981ad6265SDimitry Andric return false; 93081ad6265SDimitry Andric } 93181ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 93281ad6265SDimitry Andric if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 93381ad6265SDimitry Andric Register Def = Dst->getReg(); 93481ad6265SDimitry Andric 
93581ad6265SDimitry Andric for (const MachineOperand &Use : VALU->explicit_uses()) { 93681ad6265SDimitry Andric if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) 93781ad6265SDimitry Andric return true; 93881ad6265SDimitry Andric } 93981ad6265SDimitry Andric } 94081ad6265SDimitry Andric 94181ad6265SDimitry Andric return false; 94281ad6265SDimitry Andric }; 94381ad6265SDimitry Andric 94481ad6265SDimitry Andric int WaitStatesNeededForDef = 94581ad6265SDimitry Andric Shift16DefWaitstates - 94681ad6265SDimitry Andric getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); 94781ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 94881ad6265SDimitry Andric } 94981ad6265SDimitry Andric 95081ad6265SDimitry Andric if (ST.hasVDecCoExecHazard()) { 95181ad6265SDimitry Andric const int VALUWriteSGPRVALUReadWaitstates = 2; 95281ad6265SDimitry Andric const int VALUWriteEXECRWLane = 4; 95381ad6265SDimitry Andric const int VALUWriteVGPRReadlaneRead = 1; 95481ad6265SDimitry Andric 95581ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 95681ad6265SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo(); 95781ad6265SDimitry Andric Register UseReg; 95881ad6265SDimitry Andric auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { 95981ad6265SDimitry Andric if (!SIInstrInfo::isVALU(MI)) 96081ad6265SDimitry Andric return false; 96181ad6265SDimitry Andric return MI.modifiesRegister(UseReg, TRI); 96281ad6265SDimitry Andric }; 96381ad6265SDimitry Andric 96481ad6265SDimitry Andric for (const MachineOperand &Use : VALU->explicit_uses()) { 96581ad6265SDimitry Andric if (!Use.isReg()) 96681ad6265SDimitry Andric continue; 96781ad6265SDimitry Andric 96881ad6265SDimitry Andric UseReg = Use.getReg(); 96981ad6265SDimitry Andric if (TRI->isSGPRReg(MRI, UseReg)) { 97081ad6265SDimitry Andric int WaitStatesNeededForDef = 97181ad6265SDimitry Andric VALUWriteSGPRVALUReadWaitstates - 97281ad6265SDimitry Andric 
getWaitStatesSince(IsVALUDefSGPRFn, 97381ad6265SDimitry Andric VALUWriteSGPRVALUReadWaitstates); 97481ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 97581ad6265SDimitry Andric } 97681ad6265SDimitry Andric } 97781ad6265SDimitry Andric 97881ad6265SDimitry Andric if (VALU->readsRegister(AMDGPU::VCC, TRI)) { 97981ad6265SDimitry Andric UseReg = AMDGPU::VCC; 98081ad6265SDimitry Andric int WaitStatesNeededForDef = 98181ad6265SDimitry Andric VALUWriteSGPRVALUReadWaitstates - 98281ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); 98381ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 98481ad6265SDimitry Andric } 98581ad6265SDimitry Andric 98681ad6265SDimitry Andric switch (VALU->getOpcode()) { 98781ad6265SDimitry Andric case AMDGPU::V_READLANE_B32: 98881ad6265SDimitry Andric case AMDGPU::V_READFIRSTLANE_B32: { 98981ad6265SDimitry Andric MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); 99081ad6265SDimitry Andric UseReg = Src->getReg(); 99181ad6265SDimitry Andric int WaitStatesNeededForDef = 99281ad6265SDimitry Andric VALUWriteVGPRReadlaneRead - 99381ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead); 99481ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 99581ad6265SDimitry Andric } 996bdd1243dSDimitry Andric [[fallthrough]]; 99781ad6265SDimitry Andric case AMDGPU::V_WRITELANE_B32: { 99881ad6265SDimitry Andric UseReg = AMDGPU::EXEC; 99981ad6265SDimitry Andric int WaitStatesNeededForDef = 100081ad6265SDimitry Andric VALUWriteEXECRWLane - 100181ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); 100281ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); 100381ad6265SDimitry Andric break; 100481ad6265SDimitry Andric } 100581ad6265SDimitry Andric default: 100681ad6265SDimitry Andric 
break; 100781ad6265SDimitry Andric } 100881ad6265SDimitry Andric } 100981ad6265SDimitry Andric 10100b57cec5SDimitry Andric // This checks for the hazard where VMEM instructions that store more than 10110b57cec5SDimitry Andric // 8 bytes can have there store data over written by the next instruction. 10120b57cec5SDimitry Andric if (!ST.has12DWordStoreHazard()) 101381ad6265SDimitry Andric return WaitStatesNeeded; 10140b57cec5SDimitry Andric 10150b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo(); 10160b57cec5SDimitry Andric 10170b57cec5SDimitry Andric for (const MachineOperand &Def : VALU->defs()) { 10180b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); 10190b57cec5SDimitry Andric } 10200b57cec5SDimitry Andric 10210b57cec5SDimitry Andric return WaitStatesNeeded; 10220b57cec5SDimitry Andric } 10230b57cec5SDimitry Andric 10240b57cec5SDimitry Andric int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { 10250b57cec5SDimitry Andric // This checks for hazards associated with inline asm statements. 10260b57cec5SDimitry Andric // Since inline asms can contain just about anything, we use this 10270b57cec5SDimitry Andric // to call/leverage other check*Hazard routines. Note that 10280b57cec5SDimitry Andric // this function doesn't attempt to address all possible inline asm 10290b57cec5SDimitry Andric // hazards (good luck), but is a collection of what has been 10300b57cec5SDimitry Andric // problematic thus far. 
10310b57cec5SDimitry Andric 10320b57cec5SDimitry Andric // see checkVALUHazards() 10330b57cec5SDimitry Andric if (!ST.has12DWordStoreHazard()) 10340b57cec5SDimitry Andric return 0; 10350b57cec5SDimitry Andric 10360b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo(); 10370b57cec5SDimitry Andric int WaitStatesNeeded = 0; 10380b57cec5SDimitry Andric 103906c3fb27SDimitry Andric for (const MachineOperand &Op : 104006c3fb27SDimitry Andric llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) { 10410b57cec5SDimitry Andric if (Op.isReg() && Op.isDef()) { 104206c3fb27SDimitry Andric WaitStatesNeeded = 104306c3fb27SDimitry Andric std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); 10440b57cec5SDimitry Andric } 10450b57cec5SDimitry Andric } 10460b57cec5SDimitry Andric 10470b57cec5SDimitry Andric return WaitStatesNeeded; 10480b57cec5SDimitry Andric } 10490b57cec5SDimitry Andric 10500b57cec5SDimitry Andric int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { 10510b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 10520b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 10530b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo(); 10540b57cec5SDimitry Andric 10550b57cec5SDimitry Andric const MachineOperand *LaneSelectOp = 10560b57cec5SDimitry Andric TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); 10570b57cec5SDimitry Andric 10580b57cec5SDimitry Andric if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) 10590b57cec5SDimitry Andric return 0; 10600b57cec5SDimitry Andric 10618bcb0991SDimitry Andric Register LaneSelectReg = LaneSelectOp->getReg(); 1062fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; 10630b57cec5SDimitry Andric 10640b57cec5SDimitry Andric const int RWLaneWaitStates = 4; 10650b57cec5SDimitry Andric int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, 
10660b57cec5SDimitry Andric RWLaneWaitStates); 10670b57cec5SDimitry Andric return RWLaneWaitStates - WaitStatesSince; 10680b57cec5SDimitry Andric } 10690b57cec5SDimitry Andric 10700b57cec5SDimitry Andric int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { 10710b57cec5SDimitry Andric if (!ST.hasRFEHazards()) 10720b57cec5SDimitry Andric return 0; 10730b57cec5SDimitry Andric 10740b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 10750b57cec5SDimitry Andric 10760b57cec5SDimitry Andric const int RFEWaitStates = 1; 10770b57cec5SDimitry Andric 1078fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &MI) { 1079fe6060f1SDimitry Andric return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; 10800b57cec5SDimitry Andric }; 10810b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); 10820b57cec5SDimitry Andric return RFEWaitStates - WaitStatesNeeded; 10830b57cec5SDimitry Andric } 10840b57cec5SDimitry Andric 10850b57cec5SDimitry Andric int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { 10860b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 108781ad6265SDimitry Andric const int ReadM0WaitStates = 1; 1088fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; 108981ad6265SDimitry Andric return ReadM0WaitStates - 109081ad6265SDimitry Andric getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); 10910b57cec5SDimitry Andric } 10920b57cec5SDimitry Andric 10930b57cec5SDimitry Andric void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { 10940b57cec5SDimitry Andric fixVMEMtoScalarWriteHazards(MI); 10950b57cec5SDimitry Andric fixVcmpxPermlaneHazards(MI); 10960b57cec5SDimitry Andric fixSMEMtoVectorWriteHazards(MI); 10970b57cec5SDimitry Andric fixVcmpxExecWARHazard(MI); 10980b57cec5SDimitry Andric fixLdsBranchVmemWARHazard(MI); 109981ad6265SDimitry Andric if (ST.hasLdsDirect()) { 110081ad6265SDimitry Andric 
fixLdsDirectVALUHazard(MI); 110181ad6265SDimitry Andric fixLdsDirectVMEMHazard(MI); 110281ad6265SDimitry Andric } 110381ad6265SDimitry Andric fixVALUPartialForwardingHazard(MI); 110481ad6265SDimitry Andric fixVALUTransUseHazard(MI); 110581ad6265SDimitry Andric fixWMMAHazards(MI); 1106bdd1243dSDimitry Andric fixShift64HighRegBug(MI); 1107bdd1243dSDimitry Andric fixVALUMaskWriteHazard(MI); 1108*0fca6ea1SDimitry Andric fixRequiredExportPriority(MI); 11090b57cec5SDimitry Andric } 11100b57cec5SDimitry Andric 11110b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { 11120b57cec5SDimitry Andric if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI)) 11130b57cec5SDimitry Andric return false; 11140b57cec5SDimitry Andric 11150b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 111681ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 111781ad6265SDimitry Andric auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { 111881ad6265SDimitry Andric return (TII->isVOPC(MI) || 111981ad6265SDimitry Andric ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) && 112081ad6265SDimitry Andric MI.modifiesRegister(AMDGPU::EXEC, TRI); 112181ad6265SDimitry Andric }; 11220b57cec5SDimitry Andric 1123fe6060f1SDimitry Andric auto IsExpiredFn = [](const MachineInstr &MI, int) { 1124fe6060f1SDimitry Andric unsigned Opc = MI.getOpcode(); 1125fe6060f1SDimitry Andric return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && 1126fe6060f1SDimitry Andric Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; 11270b57cec5SDimitry Andric }; 11280b57cec5SDimitry Andric 11290b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 11300b57cec5SDimitry Andric std::numeric_limits<int>::max()) 11310b57cec5SDimitry Andric return false; 11320b57cec5SDimitry Andric 11330b57cec5SDimitry Andric // V_NOP will be discarded by SQ. 113481ad6265SDimitry Andric // Use V_MOV_B32 v?, v?. 
Register must be alive so use src0 of V_PERMLANE* 11350b57cec5SDimitry Andric // which is always a VGPR and available. 11360b57cec5SDimitry Andric auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); 11378bcb0991SDimitry Andric Register Reg = Src0->getReg(); 11380b57cec5SDimitry Andric bool IsUndef = Src0->isUndef(); 11390b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 11400b57cec5SDimitry Andric TII->get(AMDGPU::V_MOV_B32_e32)) 11410b57cec5SDimitry Andric .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) 11420b57cec5SDimitry Andric .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); 11430b57cec5SDimitry Andric 11440b57cec5SDimitry Andric return true; 11450b57cec5SDimitry Andric } 11460b57cec5SDimitry Andric 11470b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { 11480b57cec5SDimitry Andric if (!ST.hasVMEMtoScalarWriteHazard()) 11490b57cec5SDimitry Andric return false; 11507a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 11510b57cec5SDimitry Andric 11520b57cec5SDimitry Andric if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) 11530b57cec5SDimitry Andric return false; 11540b57cec5SDimitry Andric 11550b57cec5SDimitry Andric if (MI->getNumDefs() == 0) 11560b57cec5SDimitry Andric return false; 11570b57cec5SDimitry Andric 11580b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 11590b57cec5SDimitry Andric 1160fe6060f1SDimitry Andric auto IsHazardFn = [TRI, MI](const MachineInstr &I) { 1161fe6060f1SDimitry Andric if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) && 1162fe6060f1SDimitry Andric !SIInstrInfo::isFLAT(I)) 11630b57cec5SDimitry Andric return false; 11640b57cec5SDimitry Andric 11650b57cec5SDimitry Andric for (const MachineOperand &Def : MI->defs()) { 1166fe6060f1SDimitry Andric const MachineOperand *Op = 1167*0fca6ea1SDimitry Andric I.findRegisterUseOperand(Def.getReg(), TRI, false); 11680b57cec5SDimitry Andric if (!Op) 
11690b57cec5SDimitry Andric continue; 11700b57cec5SDimitry Andric return true; 11710b57cec5SDimitry Andric } 11720b57cec5SDimitry Andric return false; 11730b57cec5SDimitry Andric }; 11740b57cec5SDimitry Andric 1175fe6060f1SDimitry Andric auto IsExpiredFn = [](const MachineInstr &MI, int) { 1176fe6060f1SDimitry Andric return SIInstrInfo::isVALU(MI) || 1177fe6060f1SDimitry Andric (MI.getOpcode() == AMDGPU::S_WAITCNT && 1178fe6060f1SDimitry Andric !MI.getOperand(0).getImm()) || 1179fe6060f1SDimitry Andric (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 118006c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0); 11810b57cec5SDimitry Andric }; 11820b57cec5SDimitry Andric 11830b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 11840b57cec5SDimitry Andric std::numeric_limits<int>::max()) 11850b57cec5SDimitry Andric return false; 11860b57cec5SDimitry Andric 11870b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 1188e8d8bef9SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 1189e8d8bef9SDimitry Andric TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 119006c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 11910b57cec5SDimitry Andric return true; 11920b57cec5SDimitry Andric } 11930b57cec5SDimitry Andric 11940b57cec5SDimitry Andric bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { 11950b57cec5SDimitry Andric if (!ST.hasSMEMtoVectorWriteHazard()) 11960b57cec5SDimitry Andric return false; 11977a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 11980b57cec5SDimitry Andric 11990b57cec5SDimitry Andric if (!SIInstrInfo::isVALU(*MI)) 12000b57cec5SDimitry Andric return false; 12010b57cec5SDimitry Andric 12020b57cec5SDimitry Andric unsigned SDSTName; 12030b57cec5SDimitry Andric switch (MI->getOpcode()) { 12040b57cec5SDimitry Andric case AMDGPU::V_READLANE_B32: 12050b57cec5SDimitry Andric case AMDGPU::V_READFIRSTLANE_B32: 12060b57cec5SDimitry Andric 
SDSTName = AMDGPU::OpName::vdst; 12070b57cec5SDimitry Andric break; 12080b57cec5SDimitry Andric default: 12090b57cec5SDimitry Andric SDSTName = AMDGPU::OpName::sdst; 12100b57cec5SDimitry Andric break; 12110b57cec5SDimitry Andric } 12120b57cec5SDimitry Andric 12130b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 12140b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 12150b57cec5SDimitry Andric const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU()); 12160b57cec5SDimitry Andric const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName); 12170b57cec5SDimitry Andric if (!SDST) { 12180b57cec5SDimitry Andric for (const auto &MO : MI->implicit_operands()) { 1219bdd1243dSDimitry Andric if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) { 12200b57cec5SDimitry Andric SDST = &MO; 12210b57cec5SDimitry Andric break; 12220b57cec5SDimitry Andric } 12230b57cec5SDimitry Andric } 12240b57cec5SDimitry Andric } 12250b57cec5SDimitry Andric 12260b57cec5SDimitry Andric if (!SDST) 12270b57cec5SDimitry Andric return false; 12280b57cec5SDimitry Andric 12298bcb0991SDimitry Andric const Register SDSTReg = SDST->getReg(); 1230fe6060f1SDimitry Andric auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) { 1231fe6060f1SDimitry Andric return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI); 12320b57cec5SDimitry Andric }; 12330b57cec5SDimitry Andric 1234fe6060f1SDimitry Andric auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) { 1235fe6060f1SDimitry Andric if (TII->isSALU(MI)) { 1236fe6060f1SDimitry Andric switch (MI.getOpcode()) { 12370b57cec5SDimitry Andric case AMDGPU::S_SETVSKIP: 12380b57cec5SDimitry Andric case AMDGPU::S_VERSION: 12390b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VSCNT: 12400b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VMCNT: 12410b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_EXPCNT: 12420b57cec5SDimitry Andric // These instructions cannot not mitigate the hazard. 
12430b57cec5SDimitry Andric return false; 12440b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_LGKMCNT: 12450b57cec5SDimitry Andric // Reducing lgkmcnt count to 0 always mitigates the hazard. 1246fe6060f1SDimitry Andric return (MI.getOperand(1).getImm() == 0) && 1247fe6060f1SDimitry Andric (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 12480b57cec5SDimitry Andric case AMDGPU::S_WAITCNT: { 1249fe6060f1SDimitry Andric const int64_t Imm = MI.getOperand(0).getImm(); 12500b57cec5SDimitry Andric AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); 12517a6dacacSDimitry Andric // DsCnt corresponds to LGKMCnt here. 12527a6dacacSDimitry Andric return (Decoded.DsCnt == 0); 12530b57cec5SDimitry Andric } 12540b57cec5SDimitry Andric default: 12550b57cec5SDimitry Andric // SOPP instructions cannot mitigate the hazard. 1256fe6060f1SDimitry Andric if (TII->isSOPP(MI)) 12570b57cec5SDimitry Andric return false; 12580b57cec5SDimitry Andric // At this point the SALU can be assumed to mitigate the hazard 12590b57cec5SDimitry Andric // because either: 12600b57cec5SDimitry Andric // (a) it is independent of the at risk SMEM (breaking chain), 12610b57cec5SDimitry Andric // or 12620b57cec5SDimitry Andric // (b) it is dependent on the SMEM, in which case an appropriate 12630b57cec5SDimitry Andric // s_waitcnt lgkmcnt _must_ exist between it and the at risk 12640b57cec5SDimitry Andric // SMEM instruction. 
12650b57cec5SDimitry Andric return true; 12660b57cec5SDimitry Andric } 12670b57cec5SDimitry Andric } 12680b57cec5SDimitry Andric return false; 12690b57cec5SDimitry Andric }; 12700b57cec5SDimitry Andric 12710b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 12720b57cec5SDimitry Andric std::numeric_limits<int>::max()) 12730b57cec5SDimitry Andric return false; 12740b57cec5SDimitry Andric 12750b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 12760b57cec5SDimitry Andric TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) 12770b57cec5SDimitry Andric .addImm(0); 12780b57cec5SDimitry Andric return true; 12790b57cec5SDimitry Andric } 12800b57cec5SDimitry Andric 12810b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { 12827a6dacacSDimitry Andric if (!ST.hasVcmpxExecWARHazard()) 12837a6dacacSDimitry Andric return false; 12847a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 12857a6dacacSDimitry Andric 12867a6dacacSDimitry Andric if (!SIInstrInfo::isVALU(*MI)) 12870b57cec5SDimitry Andric return false; 12880b57cec5SDimitry Andric 12890b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo(); 12900b57cec5SDimitry Andric if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) 12910b57cec5SDimitry Andric return false; 12920b57cec5SDimitry Andric 1293fe6060f1SDimitry Andric auto IsHazardFn = [TRI](const MachineInstr &I) { 1294fe6060f1SDimitry Andric if (SIInstrInfo::isVALU(I)) 12950b57cec5SDimitry Andric return false; 1296fe6060f1SDimitry Andric return I.readsRegister(AMDGPU::EXEC, TRI); 12970b57cec5SDimitry Andric }; 12980b57cec5SDimitry Andric 12990b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 1300fe6060f1SDimitry Andric auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { 1301fe6060f1SDimitry Andric if (SIInstrInfo::isVALU(MI)) { 1302fe6060f1SDimitry Andric if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) 13030b57cec5SDimitry Andric return true; 
1304fe6060f1SDimitry Andric for (auto MO : MI.implicit_operands()) 1305bdd1243dSDimitry Andric if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) 13060b57cec5SDimitry Andric return true; 13070b57cec5SDimitry Andric } 1308fe6060f1SDimitry Andric if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 130906c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) 13100b57cec5SDimitry Andric return true; 13110b57cec5SDimitry Andric return false; 13120b57cec5SDimitry Andric }; 13130b57cec5SDimitry Andric 13140b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 13150b57cec5SDimitry Andric std::numeric_limits<int>::max()) 13160b57cec5SDimitry Andric return false; 13170b57cec5SDimitry Andric 13180b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 13190b57cec5SDimitry Andric TII->get(AMDGPU::S_WAITCNT_DEPCTR)) 132006c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 13210b57cec5SDimitry Andric return true; 13220b57cec5SDimitry Andric } 13230b57cec5SDimitry Andric 1324fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, 1325fe6060f1SDimitry Andric const GCNSubtarget &ST) { 13260b57cec5SDimitry Andric if (!ST.hasLdsBranchVmemWARHazard()) 13270b57cec5SDimitry Andric return false; 13280b57cec5SDimitry Andric 1329fe6060f1SDimitry Andric // Check if the necessary condition for the hazard is met: both LDS and VMEM 1330fe6060f1SDimitry Andric // instructions need to appear in the same function. 
1331fe6060f1SDimitry Andric bool HasLds = false; 1332fe6060f1SDimitry Andric bool HasVmem = false; 1333fe6060f1SDimitry Andric for (auto &MBB : MF) { 1334fe6060f1SDimitry Andric for (auto &MI : MBB) { 1335fe6060f1SDimitry Andric HasLds |= SIInstrInfo::isDS(MI); 1336fe6060f1SDimitry Andric HasVmem |= 1337fe6060f1SDimitry Andric SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); 1338fe6060f1SDimitry Andric if (HasLds && HasVmem) 1339fe6060f1SDimitry Andric return true; 1340fe6060f1SDimitry Andric } 1341fe6060f1SDimitry Andric } 1342fe6060f1SDimitry Andric return false; 1343fe6060f1SDimitry Andric } 1344fe6060f1SDimitry Andric 1345bdd1243dSDimitry Andric static bool isStoreCountWaitZero(const MachineInstr &I) { 1346bdd1243dSDimitry Andric return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && 1347bdd1243dSDimitry Andric I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && 1348bdd1243dSDimitry Andric !I.getOperand(1).getImm(); 1349bdd1243dSDimitry Andric } 1350bdd1243dSDimitry Andric 1351fe6060f1SDimitry Andric bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { 1352fe6060f1SDimitry Andric if (!RunLdsBranchVmemWARHazardFixup) 1353fe6060f1SDimitry Andric return false; 1354fe6060f1SDimitry Andric 1355fe6060f1SDimitry Andric assert(ST.hasLdsBranchVmemWARHazard()); 13567a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 1357fe6060f1SDimitry Andric 1358fe6060f1SDimitry Andric auto IsHazardInst = [](const MachineInstr &MI) { 1359fe6060f1SDimitry Andric if (SIInstrInfo::isDS(MI)) 13600b57cec5SDimitry Andric return 1; 1361fe6060f1SDimitry Andric if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) 13620b57cec5SDimitry Andric return 2; 13630b57cec5SDimitry Andric return 0; 13640b57cec5SDimitry Andric }; 13650b57cec5SDimitry Andric 1366fe6060f1SDimitry Andric auto InstType = IsHazardInst(*MI); 13670b57cec5SDimitry Andric if (!InstType) 13680b57cec5SDimitry Andric return false; 13690b57cec5SDimitry Andric 1370fe6060f1SDimitry 
Andric auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { 1371bdd1243dSDimitry Andric return IsHazardInst(I) || isStoreCountWaitZero(I); 13720b57cec5SDimitry Andric }; 13730b57cec5SDimitry Andric 1374fe6060f1SDimitry Andric auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { 1375fe6060f1SDimitry Andric if (!I.isBranch()) 13760b57cec5SDimitry Andric return false; 13770b57cec5SDimitry Andric 1378fe6060f1SDimitry Andric auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { 13790b57cec5SDimitry Andric auto InstType2 = IsHazardInst(I); 13800b57cec5SDimitry Andric return InstType2 && InstType != InstType2; 13810b57cec5SDimitry Andric }; 13820b57cec5SDimitry Andric 1383fe6060f1SDimitry Andric auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { 13840b57cec5SDimitry Andric auto InstType2 = IsHazardInst(I); 13850b57cec5SDimitry Andric if (InstType == InstType2) 13860b57cec5SDimitry Andric return true; 13870b57cec5SDimitry Andric 1388bdd1243dSDimitry Andric return isStoreCountWaitZero(I); 13890b57cec5SDimitry Andric }; 13900b57cec5SDimitry Andric 1391fe6060f1SDimitry Andric return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != 13920b57cec5SDimitry Andric std::numeric_limits<int>::max(); 13930b57cec5SDimitry Andric }; 13940b57cec5SDimitry Andric 13950b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 13960b57cec5SDimitry Andric std::numeric_limits<int>::max()) 13970b57cec5SDimitry Andric return false; 13980b57cec5SDimitry Andric 13990b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo(); 14000b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 14010b57cec5SDimitry Andric TII->get(AMDGPU::S_WAITCNT_VSCNT)) 14020b57cec5SDimitry Andric .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 14030b57cec5SDimitry Andric .addImm(0); 14040b57cec5SDimitry Andric 14050b57cec5SDimitry Andric return true; 14060b57cec5SDimitry Andric } 14070b57cec5SDimitry Andric 
140881ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { 140981ad6265SDimitry Andric if (!SIInstrInfo::isLDSDIR(*MI)) 141081ad6265SDimitry Andric return false; 141181ad6265SDimitry Andric 141281ad6265SDimitry Andric const int NoHazardWaitStates = 15; 141381ad6265SDimitry Andric const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 141481ad6265SDimitry Andric const Register VDSTReg = VDST->getReg(); 141581ad6265SDimitry Andric 141681ad6265SDimitry Andric bool VisitedTrans = false; 141781ad6265SDimitry Andric auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { 141881ad6265SDimitry Andric if (!SIInstrInfo::isVALU(I)) 141981ad6265SDimitry Andric return false; 142081ad6265SDimitry Andric VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); 142181ad6265SDimitry Andric // Cover both WAR and WAW 142281ad6265SDimitry Andric return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 142381ad6265SDimitry Andric }; 142481ad6265SDimitry Andric auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { 142581ad6265SDimitry Andric if (WaitStates >= NoHazardWaitStates) 142681ad6265SDimitry Andric return true; 142781ad6265SDimitry Andric // Instructions which cause va_vdst==0 expire hazard 142881ad6265SDimitry Andric return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 142981ad6265SDimitry Andric SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); 143081ad6265SDimitry Andric }; 143181ad6265SDimitry Andric auto GetWaitStatesFn = [](const MachineInstr &MI) { 143281ad6265SDimitry Andric return SIInstrInfo::isVALU(MI) ? 
1 : 0; 143381ad6265SDimitry Andric }; 143481ad6265SDimitry Andric 143581ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> Visited; 143681ad6265SDimitry Andric auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), 143781ad6265SDimitry Andric std::next(MI->getReverseIterator()), 0, 143881ad6265SDimitry Andric IsExpiredFn, Visited, GetWaitStatesFn); 143981ad6265SDimitry Andric 144081ad6265SDimitry Andric // Transcendentals can execute in parallel to other VALUs. 144181ad6265SDimitry Andric // This makes va_vdst count unusable with a mixture of VALU and TRANS. 144281ad6265SDimitry Andric if (VisitedTrans) 144381ad6265SDimitry Andric Count = 0; 144481ad6265SDimitry Andric 144581ad6265SDimitry Andric MachineOperand *WaitVdstOp = 144681ad6265SDimitry Andric TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); 144781ad6265SDimitry Andric WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); 144881ad6265SDimitry Andric 144981ad6265SDimitry Andric return true; 145081ad6265SDimitry Andric } 145181ad6265SDimitry Andric 145281ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { 145381ad6265SDimitry Andric if (!SIInstrInfo::isLDSDIR(*MI)) 145481ad6265SDimitry Andric return false; 145581ad6265SDimitry Andric 145681ad6265SDimitry Andric const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); 145781ad6265SDimitry Andric const Register VDSTReg = VDST->getReg(); 145881ad6265SDimitry Andric 145981ad6265SDimitry Andric auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { 146081ad6265SDimitry Andric if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && 146181ad6265SDimitry Andric !SIInstrInfo::isDS(I)) 146281ad6265SDimitry Andric return false; 146381ad6265SDimitry Andric return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); 146481ad6265SDimitry Andric }; 1465297eecfbSDimitry Andric bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); 14667a6dacacSDimitry Andric // TODO: On GFX12 the 
hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT 14677a6dacacSDimitry Andric // according to the type of VMEM instruction. 1468297eecfbSDimitry Andric auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { 146981ad6265SDimitry Andric return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || 147081ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || 147181ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 1472297eecfbSDimitry Andric AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || 1473297eecfbSDimitry Andric (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && 1474297eecfbSDimitry Andric !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); 147581ad6265SDimitry Andric }; 147681ad6265SDimitry Andric 147781ad6265SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 147881ad6265SDimitry Andric std::numeric_limits<int>::max()) 147981ad6265SDimitry Andric return false; 148081ad6265SDimitry Andric 1481297eecfbSDimitry Andric if (LdsdirCanWait) { 1482297eecfbSDimitry Andric TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); 1483297eecfbSDimitry Andric } else { 148481ad6265SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 148581ad6265SDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 148606c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); 1487297eecfbSDimitry Andric } 148881ad6265SDimitry Andric 148981ad6265SDimitry Andric return true; 149081ad6265SDimitry Andric } 149181ad6265SDimitry Andric 149281ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { 149381ad6265SDimitry Andric if (!ST.hasVALUPartialForwardingHazard()) 149481ad6265SDimitry Andric return false; 14957a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 14967a6dacacSDimitry Andric 14977a6dacacSDimitry Andric if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) 149881ad6265SDimitry Andric return false; 
149981ad6265SDimitry Andric 150081ad6265SDimitry Andric SmallSetVector<Register, 4> SrcVGPRs; 150181ad6265SDimitry Andric 150281ad6265SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) { 150381ad6265SDimitry Andric if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 150481ad6265SDimitry Andric SrcVGPRs.insert(Use.getReg()); 150581ad6265SDimitry Andric } 150681ad6265SDimitry Andric 150781ad6265SDimitry Andric // Only applies with >= 2 unique VGPR sources 150881ad6265SDimitry Andric if (SrcVGPRs.size() <= 1) 150981ad6265SDimitry Andric return false; 151081ad6265SDimitry Andric 151181ad6265SDimitry Andric // Look for the following pattern: 151281ad6265SDimitry Andric // Va <- VALU [PreExecPos] 151381ad6265SDimitry Andric // intv1 151481ad6265SDimitry Andric // Exec <- SALU [ExecPos] 151581ad6265SDimitry Andric // intv2 151681ad6265SDimitry Andric // Vb <- VALU [PostExecPos] 151781ad6265SDimitry Andric // intv3 151881ad6265SDimitry Andric // MI Va, Vb (WaitState = 0) 151981ad6265SDimitry Andric // 152081ad6265SDimitry Andric // Where: 152181ad6265SDimitry Andric // intv1 + intv2 <= 2 VALUs 152281ad6265SDimitry Andric // intv3 <= 4 VALUs 152381ad6265SDimitry Andric // 152481ad6265SDimitry Andric // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
152581ad6265SDimitry Andric 152681ad6265SDimitry Andric const int Intv1plus2MaxVALUs = 2; 152781ad6265SDimitry Andric const int Intv3MaxVALUs = 4; 152881ad6265SDimitry Andric const int IntvMaxVALUs = 6; 152981ad6265SDimitry Andric const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; 153081ad6265SDimitry Andric 153181ad6265SDimitry Andric struct StateType { 153281ad6265SDimitry Andric SmallDenseMap<Register, int, 4> DefPos; 153381ad6265SDimitry Andric int ExecPos = std::numeric_limits<int>::max(); 153481ad6265SDimitry Andric int VALUs = 0; 153581ad6265SDimitry Andric }; 153681ad6265SDimitry Andric 153781ad6265SDimitry Andric StateType State; 153881ad6265SDimitry Andric 153981ad6265SDimitry Andric // This overloads expiry testing with all the hazard detection 154081ad6265SDimitry Andric auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 154181ad6265SDimitry Andric // Too many VALU states have passed 154281ad6265SDimitry Andric if (State.VALUs > NoHazardVALUWaitStates) 154381ad6265SDimitry Andric return HazardExpired; 154481ad6265SDimitry Andric 154581ad6265SDimitry Andric // Instructions which cause va_vdst==0 expire hazard 154681ad6265SDimitry Andric if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 154781ad6265SDimitry Andric SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 154881ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 154906c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) 155081ad6265SDimitry Andric return HazardExpired; 155181ad6265SDimitry Andric 155281ad6265SDimitry Andric // Track registers writes 155381ad6265SDimitry Andric bool Changed = false; 155481ad6265SDimitry Andric if (SIInstrInfo::isVALU(I)) { 155581ad6265SDimitry Andric for (Register Src : SrcVGPRs) { 155681ad6265SDimitry Andric if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) { 155781ad6265SDimitry Andric State.DefPos[Src] = State.VALUs; 155881ad6265SDimitry Andric Changed = true; 
155981ad6265SDimitry Andric } 156081ad6265SDimitry Andric } 156181ad6265SDimitry Andric } else if (SIInstrInfo::isSALU(I)) { 156281ad6265SDimitry Andric if (State.ExecPos == std::numeric_limits<int>::max()) { 156381ad6265SDimitry Andric if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { 156481ad6265SDimitry Andric State.ExecPos = State.VALUs; 156581ad6265SDimitry Andric Changed = true; 156681ad6265SDimitry Andric } 156781ad6265SDimitry Andric } 156881ad6265SDimitry Andric } 156981ad6265SDimitry Andric 157081ad6265SDimitry Andric // Early expiration: too many VALUs in intv3 157181ad6265SDimitry Andric if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) 157281ad6265SDimitry Andric return HazardExpired; 157381ad6265SDimitry Andric 157481ad6265SDimitry Andric // Only evaluate state if something changed 157581ad6265SDimitry Andric if (!Changed) 157681ad6265SDimitry Andric return NoHazardFound; 157781ad6265SDimitry Andric 157881ad6265SDimitry Andric // Determine positions of VALUs pre/post exec change 157981ad6265SDimitry Andric if (State.ExecPos == std::numeric_limits<int>::max()) 158081ad6265SDimitry Andric return NoHazardFound; 158181ad6265SDimitry Andric 158281ad6265SDimitry Andric int PreExecPos = std::numeric_limits<int>::max(); 158381ad6265SDimitry Andric int PostExecPos = std::numeric_limits<int>::max(); 158481ad6265SDimitry Andric 158581ad6265SDimitry Andric for (auto Entry : State.DefPos) { 158681ad6265SDimitry Andric int DefVALUs = Entry.second; 158781ad6265SDimitry Andric if (DefVALUs != std::numeric_limits<int>::max()) { 158881ad6265SDimitry Andric if (DefVALUs >= State.ExecPos) 158981ad6265SDimitry Andric PreExecPos = std::min(PreExecPos, DefVALUs); 1590*0fca6ea1SDimitry Andric else 159181ad6265SDimitry Andric PostExecPos = std::min(PostExecPos, DefVALUs); 159281ad6265SDimitry Andric } 159381ad6265SDimitry Andric } 159481ad6265SDimitry Andric 159581ad6265SDimitry Andric // Need a VALUs post exec change 159681ad6265SDimitry Andric if 
(PostExecPos == std::numeric_limits<int>::max()) 159781ad6265SDimitry Andric return NoHazardFound; 159881ad6265SDimitry Andric 159981ad6265SDimitry Andric // Too many VALUs in intv3? 160081ad6265SDimitry Andric int Intv3VALUs = PostExecPos; 160181ad6265SDimitry Andric if (Intv3VALUs > Intv3MaxVALUs) 160281ad6265SDimitry Andric return HazardExpired; 160381ad6265SDimitry Andric 160481ad6265SDimitry Andric // Too many VALUs in intv2? 160581ad6265SDimitry Andric int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; 160681ad6265SDimitry Andric if (Intv2VALUs > Intv1plus2MaxVALUs) 160781ad6265SDimitry Andric return HazardExpired; 160881ad6265SDimitry Andric 160981ad6265SDimitry Andric // Need a VALUs pre exec change 161081ad6265SDimitry Andric if (PreExecPos == std::numeric_limits<int>::max()) 161181ad6265SDimitry Andric return NoHazardFound; 161281ad6265SDimitry Andric 161381ad6265SDimitry Andric // Too many VALUs in intv1? 161481ad6265SDimitry Andric int Intv1VALUs = PreExecPos - State.ExecPos; 161581ad6265SDimitry Andric if (Intv1VALUs > Intv1plus2MaxVALUs) 161681ad6265SDimitry Andric return HazardExpired; 161781ad6265SDimitry Andric 161881ad6265SDimitry Andric // Too many VALUs in intv1 + intv2 161981ad6265SDimitry Andric if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) 162081ad6265SDimitry Andric return HazardExpired; 162181ad6265SDimitry Andric 162281ad6265SDimitry Andric return HazardFound; 162381ad6265SDimitry Andric }; 162481ad6265SDimitry Andric auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 162581ad6265SDimitry Andric if (SIInstrInfo::isVALU(MI)) 162681ad6265SDimitry Andric State.VALUs += 1; 162781ad6265SDimitry Andric }; 162881ad6265SDimitry Andric 162981ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> Visited; 163081ad6265SDimitry Andric if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), 163181ad6265SDimitry Andric std::next(MI->getReverseIterator()), Visited)) 163281ad6265SDimitry Andric return false; 
163381ad6265SDimitry Andric 163481ad6265SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 163581ad6265SDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 163681ad6265SDimitry Andric .addImm(0x0fff); 163781ad6265SDimitry Andric 163881ad6265SDimitry Andric return true; 163981ad6265SDimitry Andric } 164081ad6265SDimitry Andric 164181ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { 164281ad6265SDimitry Andric if (!ST.hasVALUTransUseHazard()) 164381ad6265SDimitry Andric return false; 16447a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 16457a6dacacSDimitry Andric 164681ad6265SDimitry Andric if (!SIInstrInfo::isVALU(*MI)) 164781ad6265SDimitry Andric return false; 164881ad6265SDimitry Andric 164981ad6265SDimitry Andric SmallSet<Register, 4> SrcVGPRs; 165081ad6265SDimitry Andric 165181ad6265SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) { 165281ad6265SDimitry Andric if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 165381ad6265SDimitry Andric SrcVGPRs.insert(Use.getReg()); 165481ad6265SDimitry Andric } 165581ad6265SDimitry Andric 165681ad6265SDimitry Andric // Look for the following pattern: 165781ad6265SDimitry Andric // Va <- TRANS VALU 165881ad6265SDimitry Andric // intv 165981ad6265SDimitry Andric // MI Va (WaitState = 0) 166081ad6265SDimitry Andric // 166181ad6265SDimitry Andric // Where: 166281ad6265SDimitry Andric // intv <= 5 VALUs / 1 TRANS 166381ad6265SDimitry Andric // 166481ad6265SDimitry Andric // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. 
166581ad6265SDimitry Andric 166681ad6265SDimitry Andric const int IntvMaxVALUs = 5; 166781ad6265SDimitry Andric const int IntvMaxTRANS = 1; 166881ad6265SDimitry Andric 166981ad6265SDimitry Andric struct StateType { 167081ad6265SDimitry Andric int VALUs = 0; 167181ad6265SDimitry Andric int TRANS = 0; 167281ad6265SDimitry Andric }; 167381ad6265SDimitry Andric 167481ad6265SDimitry Andric StateType State; 167581ad6265SDimitry Andric 167681ad6265SDimitry Andric // This overloads expiry testing with all the hazard detection 167781ad6265SDimitry Andric auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { 167881ad6265SDimitry Andric // Too many VALU states have passed 167981ad6265SDimitry Andric if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) 168081ad6265SDimitry Andric return HazardExpired; 168181ad6265SDimitry Andric 168281ad6265SDimitry Andric // Instructions which cause va_vdst==0 expire hazard 168381ad6265SDimitry Andric if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || 168481ad6265SDimitry Andric SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || 168581ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 168681ad6265SDimitry Andric I.getOperand(0).getImm() == 0x0fff)) 168781ad6265SDimitry Andric return HazardExpired; 168881ad6265SDimitry Andric 168981ad6265SDimitry Andric // Track registers writes 169081ad6265SDimitry Andric if (SIInstrInfo::isTRANS(I)) { 169181ad6265SDimitry Andric for (Register Src : SrcVGPRs) { 169281ad6265SDimitry Andric if (I.modifiesRegister(Src, &TRI)) { 169381ad6265SDimitry Andric return HazardFound; 169481ad6265SDimitry Andric } 169581ad6265SDimitry Andric } 169681ad6265SDimitry Andric } 169781ad6265SDimitry Andric 169881ad6265SDimitry Andric return NoHazardFound; 169981ad6265SDimitry Andric }; 170081ad6265SDimitry Andric auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { 170181ad6265SDimitry Andric if (SIInstrInfo::isVALU(MI)) 170281ad6265SDimitry Andric State.VALUs += 1; 
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

/// Separate back-to-back dependent WMMA/SWMMAC instructions with a V_NOP.
///
/// If \p MI is a WMMA or SWMMAC and an earlier WMMA/SWMMAC wrote a destination
/// that overlaps one of \p MI's hazardous sources (see IsHazardFn below), a
/// V_NOP_e32 is inserted immediately before \p MI.
///
/// \returns true if a V_NOP_e32 was inserted, false otherwise.
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  // Note: these locals shadow the TII/TRI members and are pointers here.
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Returns true if the earlier instruction I is a WMMA/SWMMAC whose vdst
  // overlaps a source of MI that must not be read back-to-back.
  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
    // with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        // For SWMMAC the index operand is read through the src2 name.
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  // Any VALU instruction ends the backwards search; the wait-state count is
  // ignored.
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  // The search result of numeric_limits<int>::max() is treated as "no hazard
  // found".
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

/// Work around the 64-bit shift bug reported by ST.hasShift64HighRegBug().
///
/// Applies to V_LSHLREV_B64_e64, V_LSHRREV_B64_e64 and V_ASHRREV_I64_e64 whose
/// shift amount (src0) is a VGPR with index 7 modulo 8 and whose following
/// register is not used in the function (or which is VGPR255). The amount is
/// moved into a different register with V_SWAP_B32 before \p MI and swapped
/// back after it, and \p MI's operands are rewritten to the new register(s).
///
/// \returns true if \p MI was rewritten and swaps were inserted.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Only the three 64-bit shift opcodes below are handled.
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  // src0 holds the shift amount; an immediate amount needs no fix.
  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // No fix is applied when the register following the amount is in use.
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  // Determine whether the amount register also overlaps the 64-bit source
  // (src1) or is written by MI; in either case a whole aligned 64-bit register
  // is swapped instead of a single 32-bit one.
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  assert(ST.needsAlignedVGPRs());
  // The AmtReg +/- 1 arithmetic below relies on consecutive VGPR enum values.
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  // Pick the first register of the required class that MI neither reads nor
  // writes.
  // NOTE(review): NewReg stays default-constructed if no candidate is found;
  // the code below does not check for that case.
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  // In the overlapped case the amount goes to the sub1 half of the new 64-bit
  // register and sub0 (NewAmtLo) takes the register just below AmtReg.
  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  // Insert a full wait count because found register might be pending a wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 already both reads and writes the new
  // registers, so hazards related to these registers have already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (OverlappedDst)
    MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}

/// Compute the wait states still required before a MUBUF/MTBUF instruction
/// that follows an NSA-encoded MIMG instruction.
///
/// Only applies when ST.hasNSAtoVMEMBug() is set, \p MI is MUBUF or MTBUF, and
/// bits 1-2 of its immediate offset are not both zero.
///
/// \returns the required wait states minus the wait states since the hazardous
/// instruction; the result may be zero or negative when nothing is needed.
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  // The hazard only exists when offset bit 1 or bit 2 is set (mask 0b110).
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  // The producer is a MIMG instruction using the GFX10 NSA encoding whose
  // encoded size is at least 16 bytes.
  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int
GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { 19040b57cec5SDimitry Andric int FPAtomicToDenormModeWaitStates = 3; 19050b57cec5SDimitry Andric 1906bdd1243dSDimitry Andric if (!ST.hasFPAtomicToDenormModeHazard()) 1907bdd1243dSDimitry Andric return 0; 19087a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 1909bdd1243dSDimitry Andric 19100b57cec5SDimitry Andric if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) 19110b57cec5SDimitry Andric return 0; 19120b57cec5SDimitry Andric 1913fe6060f1SDimitry Andric auto IsHazardFn = [](const MachineInstr &I) { 1914fe6060f1SDimitry Andric if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) 19150b57cec5SDimitry Andric return false; 1916fe6060f1SDimitry Andric return SIInstrInfo::isFPAtomic(I); 19170b57cec5SDimitry Andric }; 19180b57cec5SDimitry Andric 1919fe6060f1SDimitry Andric auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { 1920fe6060f1SDimitry Andric if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) 19210b57cec5SDimitry Andric return true; 19220b57cec5SDimitry Andric 1923fe6060f1SDimitry Andric switch (MI.getOpcode()) { 19240b57cec5SDimitry Andric case AMDGPU::S_WAITCNT: 19250b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VSCNT: 19260b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VMCNT: 19270b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_EXPCNT: 19280b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_LGKMCNT: 1929e8d8bef9SDimitry Andric case AMDGPU::S_WAIT_IDLE: 19300b57cec5SDimitry Andric return true; 19310b57cec5SDimitry Andric default: 19320b57cec5SDimitry Andric break; 19330b57cec5SDimitry Andric } 19340b57cec5SDimitry Andric 19350b57cec5SDimitry Andric return false; 19360b57cec5SDimitry Andric }; 19370b57cec5SDimitry Andric 19380b57cec5SDimitry Andric return FPAtomicToDenormModeWaitStates - 19390b57cec5SDimitry Andric ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); 19400b57cec5SDimitry Andric } 19410b57cec5SDimitry Andric 19420b57cec5SDimitry Andric int 
GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { 19430b57cec5SDimitry Andric assert(SIInstrInfo::isMAI(*MI)); 19440b57cec5SDimitry Andric 1945fe6060f1SDimitry Andric return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); 1946fe6060f1SDimitry Andric } 1947fe6060f1SDimitry Andric 194881ad6265SDimitry Andric int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { 194981ad6265SDimitry Andric // Early exit if no padding is requested. 195081ad6265SDimitry Andric if (MFMAPaddingRatio == 0) 195181ad6265SDimitry Andric return 0; 195281ad6265SDimitry Andric 195381ad6265SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 195481ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) 195581ad6265SDimitry Andric return 0; 195681ad6265SDimitry Andric 195781ad6265SDimitry Andric int NeighborMFMALatency = 0; 195881ad6265SDimitry Andric auto IsNeighboringMFMA = [&NeighborMFMALatency, 195981ad6265SDimitry Andric this](const MachineInstr &MI) { 196081ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI)) 196181ad6265SDimitry Andric return false; 196281ad6265SDimitry Andric 196381ad6265SDimitry Andric NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); 196481ad6265SDimitry Andric return true; 196581ad6265SDimitry Andric }; 196681ad6265SDimitry Andric 196781ad6265SDimitry Andric const int MaxMFMAPipelineWaitStates = 16; 196881ad6265SDimitry Andric int WaitStatesSinceNeighborMFMA = 196981ad6265SDimitry Andric getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); 197081ad6265SDimitry Andric 197181ad6265SDimitry Andric int NeighborMFMAPaddingNeeded = 197281ad6265SDimitry Andric (NeighborMFMALatency * MFMAPaddingRatio / 100) - 197381ad6265SDimitry Andric WaitStatesSinceNeighborMFMA; 197481ad6265SDimitry Andric 197581ad6265SDimitry Andric return std::max(0, NeighborMFMAPaddingNeeded); 197681ad6265SDimitry Andric } 197781ad6265SDimitry Andric 1978fe6060f1SDimitry Andric int 
GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  // Computes the wait states required before the MAI instruction MI for the
  // path taken when ST.hasGFX90AInsts() is false (see checkMAIHazards). The
  // result is the maximum over all individual hazards checked below.
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Inline asm is treated like a VALU instruction here.
  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // Hazard: VALU write of EXEC followed by MFMA / v_accvgpr_write.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // Hazard: VALU write of a VGPR that MI reads as an explicit use.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        // Shadows the outer MaxWaitStates on purpose: this hazard needs at
        // most 2 wait states.
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // Hazards on AGPR operands: every AGPR use, plus the AGPR def of
  // v_accvgpr_write.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    // Largest scheduling-model latency among the MFMAs examined by
    // IsOverlappedMFMAFn (excluding exact-register matches).
    unsigned HazardDefLatency = 0;

    // Matches an earlier MFMA whose destination overlaps Reg without being
    // exactly Reg. Side effect: accumulates HazardDefLatency.
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    // Default: the operand is SrcA/SrcB of an MFMA.
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      // Latency 2 / 8 / otherwise selects the 4x4 / 16x16 / 32x32 constant.
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Second hazard on the same operand: an earlier v_accvgpr_write whose
    // destination overlaps Reg.
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // Hazard: v_accvgpr_write overwriting a register that an earlier MFMA reads
  // as SrcC (src2).
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    // Matches an earlier MFMA whose src2 overlaps DstReg. Side effect:
    // accumulates the largest latency of every MFMA examined.
    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

/// Wait states used by checkMAIHazards90A on GFX940 for the SrcC operand when
/// the earlier overlapping MFMA is XDL: NumPasses + 1.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 3
  // 4 pass -> 5
  // 8 pass -> 9
  // 16 pass -> 17
  return NumPasses + 1;
}

/// Wait states used by checkMAIHazards90A on GFX940 for the SrcC operand when
/// the earlier overlapping MFMA is not XDL: NumPasses.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 2
  // 4 pass -> 4
  // 8 pass -> 8
  // 16 pass -> 16
  return NumPasses;
}

/// Wait states used by checkMAIHazards90A on GFX940 for a non-SrcC operand
/// when the earlier overlapping MFMA is not XDL: NumPasses + 2.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}

/// Wait states used by checkMAIHazards90A on GFX940 for a non-SrcC operand
/// when the earlier overlapping MFMA is XDL: NumPasses + 3.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}

/// Compute the wait states required before the MFMA \p MI on the path taken
/// when ST.hasGFX90AInsts() is true (see checkMAIHazards).
///
/// \returns 0 for non-MFMA instructions; otherwise the maximum over the EXEC
/// hazard, per-operand VGPR hazards, and the optional MFMA padding.
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // "Legacy" VALU here means any VALU instruction that is not an MFMA.
  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  // Hazard: non-MFMA VALU write of EXEC followed by an MFMA.
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    // FullReg and MI1 are outputs of IsOverlappedMFMAFn. They are only read
    // after the search below reports a hit, i.e. after the lambda has run.
    bool FullReg;
    const MachineInstr *MI1;

    // Matches an earlier MFMA whose destination overlaps Reg. Records that
    // MFMA in MI1 and whether its destination is exactly Reg in FullReg.
    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    // Hazard: non-MFMA, non-DOT VALU write of a register MI reads.
    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    // numeric_limits<int>::max() is treated as "no overlapping MFMA found".
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      // The operand is SrcC of the current MFMA.
      if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        // The earlier MFMA wrote exactly the register read as SrcC.
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        // Partial overlap with SrcC: the requirement depends on the earlier
        // MFMA's opcode or latency.
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          // The scheduling-model latency of the earlier MFMA is used as its
          // number of passes.
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            // No wait states when an XDL MFMA follows a non-XDL one.
            if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
              break;

            NeedWaitStates =
                isXDL(ST, *MI1)
                    ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses)
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          // Without GFX940 instructions: fixed table, chosen by whether the
          // current MFMA is a DGEMM.
          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                             : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
      }
    } else {
      // The operand is not SrcC (i.e. SrcA/SrcB per the constant names).
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              isXDL(ST, *MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses)
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    // Skip the subtraction when this hazard cannot raise the running maximum.
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
// GFX940: wait states required for the "SMFMA N-pass write VGPR -> VALU WAW"
// case, as a function of the number of passes of the writing MFMA.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  //  2 pass -> 4
  //  4 pass -> 6
  //  8 pass -> 10
  // 16 pass -> 18
  const int ExtraWaitStates = 2;
  return ExtraWaitStates + NumPasses;
}

// GFX940: wait states required for the "XDL N-pass write VGPR -> VALU WAW"
// case, as a function of the number of passes of the writing MFMA.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  //  2 pass -> 5
  //  4 pass -> 7
  //  8 pass -> 11
  // 16 pass -> 19
  const int ExtraWaitStates = 3;
  return ExtraWaitStates + NumPasses;
}
// GFX940: wait states required for the "XDL N-pass write VGPR ->
// VALU/memory/export read" case, as a function of the number of passes of the
// writing MFMA.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  //  2 pass -> 5
  //  4 pass -> 7
  //  8 pass -> 11
  // 16 pass -> 19
  const int ExtraWaitStates = 3;
  return ExtraWaitStates + NumPasses;
}

// GFX940: wait states required for the "SMFMA N-pass write VGPR ->
// VALU/memory/export read" case, as a function of the number of passes of the
// writing MFMA.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  //  2 pass -> 4
  //  4 pass -> 6
  //  8 pass -> 10
  // 16 pass -> 18
  const int ExtraWaitStates = 2;
  return ExtraWaitStates + NumPasses;
}
// Computes the wait states a non-MFMA instruction \p MI needs because of
// earlier MFMA, DOT or DGEMM instructions. Only active on subtargets with
// GFX90A instructions.
//
// The function runs up to three groups of checks and keeps the maximum of
// (required wait states - wait states already elapsed) over all of them:
//   1. For VALU / memory / export instructions: each explicit register use is
//      checked against an earlier DOT write, the GFX90A-only
//      "DGEMM between VALU write and memory read" workaround, and an earlier
//      MFMA write of an overlapping register.
//   2. For the listed 64-bit FMA/FMAC opcodes: a preceding DGEMM.
//   3. For VALU / memory / export instructions: each def is checked against an
//      earlier DOT or MFMA write of an overlapping register (write-after-
//      write) and against an earlier non-DGEMM MFMA that reads an overlapping
//      register as src2 (write-after-read).
//
// Returns the resulting wait-state count (never negative, since the running
// maximum starts at 0).
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (SIInstrInfo::isMFMA(*MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
               SIInstrInfo::isFLAT(*MI) ||
               SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  // The predicates below share state with this function by reference: Reg is
  // the register currently being examined and must be assigned before a
  // predicate is used; MFMA / DOT receive the matching instruction so the
  // caller can tell whether a match was found and inspect it.
  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  // DGEMMAfterVALUWrite is sticky across calls of the predicate; it is reset
  // by the caller before each search.
  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazard if register is defined by a VALU and a DGEMM is found
    // after the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  // Group 1: hazards on the registers MI reads.
  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          // Same opcode as the DOT that wrote Reg: wait states are required
          // only when the use is not the src2 operand (the pointer difference
          // is the operand index of Use).
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for HW data hazard bug observed only in GFX90A. When there
      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
      // causes the SQ to incorrectly not insert two wait states between the two
      // instructions needed to avoid data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
                DMFMABetweenVALUWriteVMEMRead -
                getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                      DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      // The scheduling-model latency of the MFMA is used as its pass count.
      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = IsMemOrExport
                               ? DMFMA16x16WriteVgprMemExpReadWaitStates
                               : DMFMA16x16WriteVgprVALUReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      // Nothing can exceed the maximum; stop scanning further operands.
      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  // Group 2: 64-bit FMA/FMAC following a DGEMM. Skipped when the wait states
  // already required cover it.
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  // Group 3: hazards on the registers MI writes.
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    // Write-after-write against a DOT with a different opcode.
    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    // Write-after-write against an MFMA.
    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    // Write-after-read: matches a non-DGEMM MFMA (XDL only on GFX940) whose
    // src2 register operand overlaps Reg, and records it in MFMA.
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !isXDL(ST, MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
!TRI.regsOverlap(SrcC->getReg(), Reg)) 2707fe6060f1SDimitry Andric return false; 2708fe6060f1SDimitry Andric 2709fe6060f1SDimitry Andric MFMA = &MI; 2710fe6060f1SDimitry Andric return true; 2711fe6060f1SDimitry Andric }; 2712fe6060f1SDimitry Andric 2713fe6060f1SDimitry Andric MFMA = nullptr; 2714fe6060f1SDimitry Andric int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, 2715fe6060f1SDimitry Andric MaxWarWaitStates); 2716fe6060f1SDimitry Andric if (!MFMA) 2717fe6060f1SDimitry Andric continue; 2718fe6060f1SDimitry Andric 2719fe6060f1SDimitry Andric unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); 2720fe6060f1SDimitry Andric int NeedWaitStates = MaxWaitStates; 2721fe6060f1SDimitry Andric switch (HazardDefLatency) { 2722fe6060f1SDimitry Andric case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; 2723fe6060f1SDimitry Andric break; 272481ad6265SDimitry Andric case 4: assert(ST.hasGFX940Insts()); 272581ad6265SDimitry Andric NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; 272681ad6265SDimitry Andric break; 2727fe6060f1SDimitry Andric case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; 2728fe6060f1SDimitry Andric break; 2729bdd1243dSDimitry Andric case 16: [[fallthrough]]; 2730fe6060f1SDimitry Andric default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; 2731fe6060f1SDimitry Andric break; 2732fe6060f1SDimitry Andric } 2733fe6060f1SDimitry Andric 2734fe6060f1SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; 2735fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 2736fe6060f1SDimitry Andric } 2737fe6060f1SDimitry Andric 2738fe6060f1SDimitry Andric return WaitStatesNeeded; 2739fe6060f1SDimitry Andric } 2740fe6060f1SDimitry Andric 2741e8d8bef9SDimitry Andric bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { 2742e8d8bef9SDimitry Andric if (!SU->isInstr()) 2743e8d8bef9SDimitry Andric return false; 2744e8d8bef9SDimitry Andric 
2745fe6060f1SDimitry Andric const MachineInstr *MAI = nullptr; 274681ad6265SDimitry Andric 2747fe6060f1SDimitry Andric auto IsMFMAFn = [&MAI](const MachineInstr &MI) { 2748e8d8bef9SDimitry Andric MAI = nullptr; 274981ad6265SDimitry Andric if (SIInstrInfo::isMFMA(MI)) 2750fe6060f1SDimitry Andric MAI = &MI; 2751e8d8bef9SDimitry Andric return MAI != nullptr; 2752e8d8bef9SDimitry Andric }; 2753e8d8bef9SDimitry Andric 2754e8d8bef9SDimitry Andric MachineInstr *MI = SU->getInstr(); 2755fe6060f1SDimitry Andric if (IsMFMAFn(*MI)) { 2756e8d8bef9SDimitry Andric int W = getWaitStatesSince(IsMFMAFn, 16); 2757e8d8bef9SDimitry Andric if (MAI) 2758e8d8bef9SDimitry Andric return W < (int)TSchedModel.computeInstrLatency(MAI); 2759e8d8bef9SDimitry Andric } 2760e8d8bef9SDimitry Andric 2761e8d8bef9SDimitry Andric return false; 2762e8d8bef9SDimitry Andric } 2763bdd1243dSDimitry Andric 2764bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { 2765bdd1243dSDimitry Andric if (!ST.hasVALUMaskWriteHazard()) 2766bdd1243dSDimitry Andric return false; 27677a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts()); 27687a6dacacSDimitry Andric 27697a6dacacSDimitry Andric if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) 2770bdd1243dSDimitry Andric return false; 2771bdd1243dSDimitry Andric 2772bdd1243dSDimitry Andric // The hazard sequence is three instructions: 2773bdd1243dSDimitry Andric // 1. VALU reads SGPR as mask 2774bdd1243dSDimitry Andric // 2. SALU writes SGPR 2775bdd1243dSDimitry Andric // 3. SALU reads SGPR 2776bdd1243dSDimitry Andric // The hazard can expire if the distance between 2 and 3 is sufficient. 2777bdd1243dSDimitry Andric // In practice this happens <10% of the time, hence this always assumes 2778bdd1243dSDimitry Andric // the hazard exists if 1 and 2 are present to avoid searching. 
2779bdd1243dSDimitry Andric 2780bdd1243dSDimitry Andric const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); 2781bdd1243dSDimitry Andric if (!SDSTOp || !SDSTOp->isReg()) 2782bdd1243dSDimitry Andric return false; 2783bdd1243dSDimitry Andric 2784bdd1243dSDimitry Andric const Register HazardReg = SDSTOp->getReg(); 2785bdd1243dSDimitry Andric if (HazardReg == AMDGPU::EXEC || 2786bdd1243dSDimitry Andric HazardReg == AMDGPU::EXEC_LO || 2787bdd1243dSDimitry Andric HazardReg == AMDGPU::EXEC_HI || 2788bdd1243dSDimitry Andric HazardReg == AMDGPU::M0) 2789bdd1243dSDimitry Andric return false; 2790bdd1243dSDimitry Andric 2791bdd1243dSDimitry Andric auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { 2792bdd1243dSDimitry Andric switch (I.getOpcode()) { 2793bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_e32: 2794bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_dpp: 2795bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_e32: 2796bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_dpp: 2797bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_e32: 2798bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_dpp: 2799bdd1243dSDimitry Andric case AMDGPU::V_DIV_FMAS_F32_e64: 2800bdd1243dSDimitry Andric case AMDGPU::V_DIV_FMAS_F64_e64: 2801bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_e32: 2802bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_dpp: 2803bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_e32: 2804bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_dpp: 2805bdd1243dSDimitry Andric // These implicitly read VCC as mask source. 
2806bdd1243dSDimitry Andric return HazardReg == AMDGPU::VCC || 2807bdd1243dSDimitry Andric HazardReg == AMDGPU::VCC_LO || 2808bdd1243dSDimitry Andric HazardReg == AMDGPU::VCC_HI; 2809bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_e64: 2810bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_e64_dpp: 2811bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_e64: 2812bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_e64_dpp: 2813bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_e64: 2814bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_e64_dpp: 2815bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_e64: 2816bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_e64_dpp: 2817bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_e64: 2818bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_e64_dpp: { 2819bdd1243dSDimitry Andric // Only check mask register overlaps. 2820bdd1243dSDimitry Andric const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); 2821bdd1243dSDimitry Andric assert(SSRCOp); 2822bdd1243dSDimitry Andric return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); 2823bdd1243dSDimitry Andric } 2824bdd1243dSDimitry Andric default: 2825bdd1243dSDimitry Andric return false; 2826bdd1243dSDimitry Andric } 2827bdd1243dSDimitry Andric }; 2828bdd1243dSDimitry Andric 2829bdd1243dSDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo(); 2830bdd1243dSDimitry Andric auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { 2831bdd1243dSDimitry Andric // s_waitcnt_depctr sa_sdst(0) mitigates hazard. 2832bdd1243dSDimitry Andric if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 283306c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) 2834bdd1243dSDimitry Andric return true; 2835bdd1243dSDimitry Andric 2836bdd1243dSDimitry Andric // VALU access to any SGPR or literal constant other than HazardReg 2837bdd1243dSDimitry Andric // mitigates hazard. 
No need to check HazardReg here as this will 2838bdd1243dSDimitry Andric // only be called when !IsHazardFn. 2839bdd1243dSDimitry Andric if (!SIInstrInfo::isVALU(I)) 2840bdd1243dSDimitry Andric return false; 2841bdd1243dSDimitry Andric for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { 2842bdd1243dSDimitry Andric const MachineOperand &Op = I.getOperand(OpNo); 2843bdd1243dSDimitry Andric if (Op.isReg()) { 2844bdd1243dSDimitry Andric Register OpReg = Op.getReg(); 2845bdd1243dSDimitry Andric // Only consider uses 2846bdd1243dSDimitry Andric if (!Op.isUse()) 2847bdd1243dSDimitry Andric continue; 2848bdd1243dSDimitry Andric // Ignore EXEC 2849bdd1243dSDimitry Andric if (OpReg == AMDGPU::EXEC || 2850bdd1243dSDimitry Andric OpReg == AMDGPU::EXEC_LO || 2851bdd1243dSDimitry Andric OpReg == AMDGPU::EXEC_HI) 2852bdd1243dSDimitry Andric continue; 2853bdd1243dSDimitry Andric // Ignore all implicit uses except VCC 2854bdd1243dSDimitry Andric if (Op.isImplicit()) { 2855bdd1243dSDimitry Andric if (OpReg == AMDGPU::VCC || 2856bdd1243dSDimitry Andric OpReg == AMDGPU::VCC_LO || 2857bdd1243dSDimitry Andric OpReg == AMDGPU::VCC_HI) 2858bdd1243dSDimitry Andric return true; 2859bdd1243dSDimitry Andric continue; 2860bdd1243dSDimitry Andric } 2861bdd1243dSDimitry Andric if (TRI.isSGPRReg(MRI, OpReg)) 2862bdd1243dSDimitry Andric return true; 2863bdd1243dSDimitry Andric } else { 2864bdd1243dSDimitry Andric const MCInstrDesc &InstDesc = I.getDesc(); 2865bdd1243dSDimitry Andric const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 2866bdd1243dSDimitry Andric if (!TII.isInlineConstant(Op, OpInfo)) 2867bdd1243dSDimitry Andric return true; 2868bdd1243dSDimitry Andric } 2869bdd1243dSDimitry Andric } 2870bdd1243dSDimitry Andric return false; 2871bdd1243dSDimitry Andric }; 2872bdd1243dSDimitry Andric 2873bdd1243dSDimitry Andric // Check for hazard 2874bdd1243dSDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == 2875bdd1243dSDimitry Andric 
std::numeric_limits<int>::max()) 2876bdd1243dSDimitry Andric return false; 2877bdd1243dSDimitry Andric 2878bdd1243dSDimitry Andric auto NextMI = std::next(MI->getIterator()); 2879bdd1243dSDimitry Andric 2880bdd1243dSDimitry Andric // Add s_waitcnt_depctr sa_sdst(0) after SALU write. 2881bdd1243dSDimitry Andric BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), 2882bdd1243dSDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR)) 288306c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); 2884bdd1243dSDimitry Andric 2885bdd1243dSDimitry Andric // SALU write may be s_getpc in a bundle. 2886bdd1243dSDimitry Andric if (MI->getOpcode() == AMDGPU::S_GETPC_B64) { 2887bdd1243dSDimitry Andric // Update offsets of any references in the bundle. 2888bdd1243dSDimitry Andric while (NextMI != MI->getParent()->end() && 2889bdd1243dSDimitry Andric NextMI->isBundledWithPred()) { 2890bdd1243dSDimitry Andric for (auto &Operand : NextMI->operands()) { 2891bdd1243dSDimitry Andric if (Operand.isGlobal()) 2892bdd1243dSDimitry Andric Operand.setOffset(Operand.getOffset() + 4); 2893bdd1243dSDimitry Andric } 2894bdd1243dSDimitry Andric NextMI++; 2895bdd1243dSDimitry Andric } 2896bdd1243dSDimitry Andric } 2897bdd1243dSDimitry Andric 2898bdd1243dSDimitry Andric return true; 2899bdd1243dSDimitry Andric } 2900*0fca6ea1SDimitry Andric 2901*0fca6ea1SDimitry Andric static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, 2902*0fca6ea1SDimitry Andric const SIInstrInfo &TII) { 2903*0fca6ea1SDimitry Andric MachineBasicBlock &EntryMBB = MF->front(); 2904*0fca6ea1SDimitry Andric if (EntryMBB.begin() != EntryMBB.end()) { 2905*0fca6ea1SDimitry Andric auto &EntryMI = *EntryMBB.begin(); 2906*0fca6ea1SDimitry Andric if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && 2907*0fca6ea1SDimitry Andric EntryMI.getOperand(0).getImm() >= Priority) 2908*0fca6ea1SDimitry Andric return false; 2909*0fca6ea1SDimitry Andric } 2910*0fca6ea1SDimitry Andric 2911*0fca6ea1SDimitry Andric BuildMI(EntryMBB, 
EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) 2912*0fca6ea1SDimitry Andric .addImm(Priority); 2913*0fca6ea1SDimitry Andric return true; 2914*0fca6ea1SDimitry Andric } 2915*0fca6ea1SDimitry Andric 2916*0fca6ea1SDimitry Andric bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { 2917*0fca6ea1SDimitry Andric if (!ST.hasRequiredExportPriority()) 2918*0fca6ea1SDimitry Andric return false; 2919*0fca6ea1SDimitry Andric 2920*0fca6ea1SDimitry Andric // Assume the following shader types will never have exports, 2921*0fca6ea1SDimitry Andric // and avoid adding or adjusting S_SETPRIO. 2922*0fca6ea1SDimitry Andric MachineBasicBlock *MBB = MI->getParent(); 2923*0fca6ea1SDimitry Andric MachineFunction *MF = MBB->getParent(); 2924*0fca6ea1SDimitry Andric auto CC = MF->getFunction().getCallingConv(); 2925*0fca6ea1SDimitry Andric switch (CC) { 2926*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_CS: 2927*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_CS_Chain: 2928*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_CS_ChainPreserve: 2929*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_KERNEL: 2930*0fca6ea1SDimitry Andric return false; 2931*0fca6ea1SDimitry Andric default: 2932*0fca6ea1SDimitry Andric break; 2933*0fca6ea1SDimitry Andric } 2934*0fca6ea1SDimitry Andric 2935*0fca6ea1SDimitry Andric const int MaxPriority = 3; 2936*0fca6ea1SDimitry Andric const int NormalPriority = 2; 2937*0fca6ea1SDimitry Andric const int PostExportPriority = 0; 2938*0fca6ea1SDimitry Andric 2939*0fca6ea1SDimitry Andric auto It = MI->getIterator(); 2940*0fca6ea1SDimitry Andric switch (MI->getOpcode()) { 2941*0fca6ea1SDimitry Andric case AMDGPU::S_ENDPGM: 2942*0fca6ea1SDimitry Andric case AMDGPU::S_ENDPGM_SAVED: 2943*0fca6ea1SDimitry Andric case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: 2944*0fca6ea1SDimitry Andric case AMDGPU::SI_RETURN_TO_EPILOG: 2945*0fca6ea1SDimitry Andric // Ensure shader with calls raises priority at entry. 
2946*0fca6ea1SDimitry Andric // This ensures correct priority if exports exist in callee. 2947*0fca6ea1SDimitry Andric if (MF->getFrameInfo().hasCalls()) 2948*0fca6ea1SDimitry Andric return ensureEntrySetPrio(MF, NormalPriority, TII); 2949*0fca6ea1SDimitry Andric return false; 2950*0fca6ea1SDimitry Andric case AMDGPU::S_SETPRIO: { 2951*0fca6ea1SDimitry Andric // Raise minimum priority unless in workaround. 2952*0fca6ea1SDimitry Andric auto &PrioOp = MI->getOperand(0); 2953*0fca6ea1SDimitry Andric int Prio = PrioOp.getImm(); 2954*0fca6ea1SDimitry Andric bool InWA = (Prio == PostExportPriority) && 2955*0fca6ea1SDimitry Andric (It != MBB->begin() && TII.isEXP(*std::prev(It))); 2956*0fca6ea1SDimitry Andric if (InWA || Prio >= NormalPriority) 2957*0fca6ea1SDimitry Andric return false; 2958*0fca6ea1SDimitry Andric PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); 2959*0fca6ea1SDimitry Andric return true; 2960*0fca6ea1SDimitry Andric } 2961*0fca6ea1SDimitry Andric default: 2962*0fca6ea1SDimitry Andric if (!TII.isEXP(*MI)) 2963*0fca6ea1SDimitry Andric return false; 2964*0fca6ea1SDimitry Andric break; 2965*0fca6ea1SDimitry Andric } 2966*0fca6ea1SDimitry Andric 2967*0fca6ea1SDimitry Andric // Check entry priority at each export (as there will only be a few). 2968*0fca6ea1SDimitry Andric // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. 2969*0fca6ea1SDimitry Andric bool Changed = false; 2970*0fca6ea1SDimitry Andric if (CC != CallingConv::AMDGPU_Gfx) 2971*0fca6ea1SDimitry Andric Changed = ensureEntrySetPrio(MF, NormalPriority, TII); 2972*0fca6ea1SDimitry Andric 2973*0fca6ea1SDimitry Andric auto NextMI = std::next(It); 2974*0fca6ea1SDimitry Andric bool EndOfShader = false; 2975*0fca6ea1SDimitry Andric if (NextMI != MBB->end()) { 2976*0fca6ea1SDimitry Andric // Only need WA at end of sequence of exports. 
2977*0fca6ea1SDimitry Andric if (TII.isEXP(*NextMI)) 2978*0fca6ea1SDimitry Andric return Changed; 2979*0fca6ea1SDimitry Andric // Assume appropriate S_SETPRIO after export means WA already applied. 2980*0fca6ea1SDimitry Andric if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && 2981*0fca6ea1SDimitry Andric NextMI->getOperand(0).getImm() == PostExportPriority) 2982*0fca6ea1SDimitry Andric return Changed; 2983*0fca6ea1SDimitry Andric EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; 2984*0fca6ea1SDimitry Andric } 2985*0fca6ea1SDimitry Andric 2986*0fca6ea1SDimitry Andric const DebugLoc &DL = MI->getDebugLoc(); 2987*0fca6ea1SDimitry Andric 2988*0fca6ea1SDimitry Andric // Lower priority. 2989*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 2990*0fca6ea1SDimitry Andric .addImm(PostExportPriority); 2991*0fca6ea1SDimitry Andric 2992*0fca6ea1SDimitry Andric if (!EndOfShader) { 2993*0fca6ea1SDimitry Andric // Wait for exports to complete. 2994*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) 2995*0fca6ea1SDimitry Andric .addReg(AMDGPU::SGPR_NULL) 2996*0fca6ea1SDimitry Andric .addImm(0); 2997*0fca6ea1SDimitry Andric } 2998*0fca6ea1SDimitry Andric 2999*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3000*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); 3001*0fca6ea1SDimitry Andric 3002*0fca6ea1SDimitry Andric if (!EndOfShader) { 3003*0fca6ea1SDimitry Andric // Return to normal (higher) priority. 3004*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) 3005*0fca6ea1SDimitry Andric .addImm(NormalPriority); 3006*0fca6ea1SDimitry Andric } 3007*0fca6ea1SDimitry Andric 3008*0fca6ea1SDimitry Andric return true; 3009*0fca6ea1SDimitry Andric } 3010