10b57cec5SDimitry Andric //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric /// \file 100b57cec5SDimitry Andric /// Insert wait instructions for memory reads and writes. 110b57cec5SDimitry Andric /// 120b57cec5SDimitry Andric /// Memory reads and writes are issued asynchronously, so we need to insert 130b57cec5SDimitry Andric /// S_WAITCNT instructions when we want to access any of their results or 140b57cec5SDimitry Andric /// overwrite any register that's used asynchronously. 150b57cec5SDimitry Andric /// 160b57cec5SDimitry Andric /// TODO: This pass currently keeps one timeline per hardware counter. A more 170b57cec5SDimitry Andric /// finely-grained approach that keeps one timeline per event type could 180b57cec5SDimitry Andric /// sometimes get away with generating weaker s_waitcnt instructions. For 190b57cec5SDimitry Andric /// example, when both SMEM and LDS are in flight and we need to wait for 200b57cec5SDimitry Andric /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, 210b57cec5SDimitry Andric /// but the pass will currently generate a conservative lgkmcnt(0) because 220b57cec5SDimitry Andric /// multiple event types are in flight. 
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

// Debug counters that let a developer force the pass to emit zeroed waits
// for one specific hardware counter (exp / lgkm / vm). Useful for bisecting
// miscompiles down to a missing wait on a particular counter.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

// Command-line escape hatch: emit every waitcnt as a full zero wait on all
// counters. Disables all of the pass's optimizations; intended for debugging.
static cl::opt<bool> ForceEmitZeroFlag(
  "amdgpu-waitcnt-forcezero",
  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
  cl::init(false), cl::Hidden);
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.

// The hardware wait counters tracked by this pass. The first four exist on
// all targets; SAMPLE_CNT, BVH_CNT and KM_CNT are the finer-grained counters
// introduced with gfx12. NUM_NORMAL_INST_CNTS / NUM_EXTENDED_INST_CNTS mark
// the end of each set so ranges of counters can be iterated.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      // Export counter.
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
} // namespace

namespace llvm {
// Opt InstCounterType into llvm::enum_seq so counter ranges can be iterated
// with range-for (see inst_counter_types() below).
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm

namespace {
// Return an iterator over all counters between LOAD_CNT (the first counter)
// and \c MaxCounter (exclusive, default value yields an enumeration over
// all counters).
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
}

// Inclusive [first, last] range of register-table slots (see RegisterMapping).
using RegInterval = std::pair<int, int>;

// Per-subtarget maximum value each wait counter can hold.
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};

// First/last hardware encodings of the VGPR and SGPR ranges, used to map
// physical registers onto scoreboard slots.
struct RegisterEncoding {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
};

// Every kind of machine operation that can increment a wait counter. Each
// counter's WaitEventMask selects which of these events it tracks.
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};

// Layout of the flat scoreboard index space. The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  EXTRA_VGPR_LDS = 0,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};

// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  NUM_VMEM_TYPES
};

// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
1667a6dacacSDimitry Andric static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { 1677a6dacacSDimitry Andric AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, 1687a6dacacSDimitry Andric AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, 1697a6dacacSDimitry Andric AMDGPU::S_WAIT_KMCNT}; 1707a6dacacSDimitry Andric 171bdd1243dSDimitry Andric static bool updateVMCntOnly(const MachineInstr &Inst) { 172bdd1243dSDimitry Andric return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) || 173bdd1243dSDimitry Andric SIInstrInfo::isFLATScratch(Inst); 174bdd1243dSDimitry Andric } 175bdd1243dSDimitry Andric 1767a6dacacSDimitry Andric #ifndef NDEBUG 1777a6dacacSDimitry Andric static bool isNormalMode(InstCounterType MaxCounter) { 1787a6dacacSDimitry Andric return MaxCounter == NUM_NORMAL_INST_CNTS; 1797a6dacacSDimitry Andric } 1807a6dacacSDimitry Andric #endif // NDEBUG 1817a6dacacSDimitry Andric 1825ffd83dbSDimitry Andric VmemType getVmemType(const MachineInstr &Inst) { 183bdd1243dSDimitry Andric assert(updateVMCntOnly(Inst)); 1847a6dacacSDimitry Andric if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) && 1857a6dacacSDimitry Andric !SIInstrInfo::isVSAMPLE(Inst)) 1865ffd83dbSDimitry Andric return VMEM_NOSAMPLER; 1875ffd83dbSDimitry Andric const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); 1884824e7fdSDimitry Andric const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = 1894824e7fdSDimitry Andric AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 1900fca6ea1SDimitry Andric // We have to make an additional check for isVSAMPLE here since some 1910fca6ea1SDimitry Andric // instructions don't have a sampler, but are still classified as sampler 1920fca6ea1SDimitry Andric // instructions for the purposes of e.g. waitcnt. 1934824e7fdSDimitry Andric return BaseInfo->BVH ? VMEM_BVH 1940fca6ea1SDimitry Andric : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? 
VMEM_SAMPLER 1950fca6ea1SDimitry Andric : VMEM_NOSAMPLER; 1965ffd83dbSDimitry Andric } 1975ffd83dbSDimitry Andric 1987a6dacacSDimitry Andric unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { 1990b57cec5SDimitry Andric switch (T) { 2007a6dacacSDimitry Andric case LOAD_CNT: 2017a6dacacSDimitry Andric return Wait.LoadCnt; 2020b57cec5SDimitry Andric case EXP_CNT: 2037a6dacacSDimitry Andric return Wait.ExpCnt; 2047a6dacacSDimitry Andric case DS_CNT: 2057a6dacacSDimitry Andric return Wait.DsCnt; 2067a6dacacSDimitry Andric case STORE_CNT: 2077a6dacacSDimitry Andric return Wait.StoreCnt; 2087a6dacacSDimitry Andric case SAMPLE_CNT: 2097a6dacacSDimitry Andric return Wait.SampleCnt; 2107a6dacacSDimitry Andric case BVH_CNT: 2117a6dacacSDimitry Andric return Wait.BvhCnt; 2127a6dacacSDimitry Andric case KM_CNT: 2137a6dacacSDimitry Andric return Wait.KmCnt; 2140b57cec5SDimitry Andric default: 2150b57cec5SDimitry Andric llvm_unreachable("bad InstCounterType"); 2160b57cec5SDimitry Andric } 2170b57cec5SDimitry Andric } 2180b57cec5SDimitry Andric 2197a6dacacSDimitry Andric void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { 2207a6dacacSDimitry Andric unsigned &WC = getCounterRef(Wait, T); 2217a6dacacSDimitry Andric WC = std::min(WC, Count); 2227a6dacacSDimitry Andric } 2237a6dacacSDimitry Andric 2247a6dacacSDimitry Andric void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { 2257a6dacacSDimitry Andric getCounterRef(Wait, T) = ~0u; 2267a6dacacSDimitry Andric } 2277a6dacacSDimitry Andric 2287a6dacacSDimitry Andric unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { 2297a6dacacSDimitry Andric return getCounterRef(Wait, T); 2307a6dacacSDimitry Andric } 2317a6dacacSDimitry Andric 2327a6dacacSDimitry Andric // Mapping from event to counter according to the table masks. 
2337a6dacacSDimitry Andric InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { 2347a6dacacSDimitry Andric for (auto T : inst_counter_types()) { 2357a6dacacSDimitry Andric if (masks[T] & (1 << E)) 2367a6dacacSDimitry Andric return T; 2377a6dacacSDimitry Andric } 2387a6dacacSDimitry Andric llvm_unreachable("event type has no associated counter"); 2397a6dacacSDimitry Andric } 2407a6dacacSDimitry Andric 2410b57cec5SDimitry Andric // This objects maintains the current score brackets of each wait counter, and 2420b57cec5SDimitry Andric // a per-register scoreboard for each wait counter. 2430b57cec5SDimitry Andric // 2440b57cec5SDimitry Andric // We also maintain the latest score for every event type that can change the 2450b57cec5SDimitry Andric // waitcnt in order to know if there are multiple types of events within 2460b57cec5SDimitry Andric // the brackets. When multiple types of event happen in the bracket, 2470b57cec5SDimitry Andric // wait count may get decreased out of order, therefore we need to put in 2480b57cec5SDimitry Andric // "s_waitcnt 0" before use. 
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
                  HardwareLimits Limits, RegisterEncoding Encoding,
                  const unsigned *WaitEventMaskForInst,
                  InstCounterType SmemAccessCounter)
      : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
        Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
        SmemAccessCounter(SmemAccessCounter) {}

  // Hardware maximum for counter \p T (0 for counters this subtarget lacks).
  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case DS_CNT:
      return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    case SAMPLE_CNT:
      return Limits.SamplecntMax;
    case BVH_CNT:
      return Limits.BvhcntMax;
    case KM_CNT:
      return Limits.KmcntMax;
    default:
      break;
    }
    return 0;
  }

  // Lower bound of the score bracket for counter \p T.
  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  // Upper bound of the score bracket for counter \p T.
  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  // Width of the bracket, i.e. how many events may still be outstanding.
  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  // Score recorded for scoreboard slot \p GprNo under counter \p T.
  // Slots >= NUM_ALL_VGPRS are SGPRs, which only the SMEM-access counter
  // tracks.
  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == SmemAccessCounter);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  // Merge another bracket state into this one (control-flow join);
  // returns true if this state changed. Defined out of line.
  bool merge(const WaitcntBrackets &Other);

  RegInterval getRegInterval(const MachineInstr *MI,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo) const;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  // Bitmask of all event types still outstanding.
  unsigned hasPendingEvent() const { return PendingEvents; }
  // Nonzero if event \p E is still outstanding.
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  // Bitmask of outstanding events tracked by counter \p T.
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  // True if the last recorded flat memory operation may still be in flight
  // on either counter it decrements.
  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  // Record that a flat memory operation was just issued on both counters.
  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  // Return true if there might be pending writes to the specified vgpr by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
    assert(GprNo < NUM_ALL_VGPRS);
    return VgprVmemTypes[GprNo] & ~(1 << V);
  }

  void clearVgprVmemTypes(int GprNo) {
    assert(GprNo < NUM_ALL_VGPRS);
    VgprVmemTypes[GprNo] = 0;
  }

  // Conservatively treat the store counter as saturated at function entry
  // and returns, since callers/callees may have stores in flight.
  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  // Parameters for rebasing a pair of score timelines onto a common origin
  // when merging bracket states; see mergeScore.
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    // EXP_CNT's bracket is kept no wider than the hardware maximum by
    // advancing the lower bound when the range would overflow.
    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
  }

  // Record score \p Val for slot \p GprNo, maintaining the VgprUB/SgprUB
  // high-water marks used to speed up merges.
  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, GprNo);
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == SmemAccessCounter);
      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, unsigned Val);

  const GCNSubtarget *ST = nullptr;
  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
  HardwareLimits Limits = {};
  RegisterEncoding Encoding = {};
  const unsigned *WaitEventMaskForInst;
  InstCounterType SmemAccessCounter;
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int VgprUB = -1;
  int SgprUB = -1;
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
  // Store representative LDS DMA operations. The only useful info here is
  // alias info. One store is kept per unique AAInfo.
  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
};

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool OptNone;

public:
  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}

  // Return true if the current function should be compiled with no
  // optimization.
  bool isOptNone() const { return OptNone; }

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
  // were made.
  //
  // This editing will usually be merely updated operands, but it may also
  // delete instructions if the incoming Wait value indicates they are not
  // needed. It may also remove existing instructions for which a wait
  // is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  // Returns an array of bit masks which can be used to map values in
  // WaitEventType to corresponding counter values in InstCounterType.
  virtual const unsigned *getWaitEventMask() const = 0;

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  // Create a mask value from the initializer list of wait event types.
  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    unsigned Mask = 0;
    for (auto &E : Events)
      Mask |= 1 << E;

    return Mask;
  }
};

// Waitcnt generation strategy for targets before gfx12, which use the
// four "normal" counters (vmcnt/lgkmcnt/expcnt/vscnt).
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
  WaitcntGeneratorPreGFX12() = default;
  WaitcntGeneratorPreGFX12(const MachineFunction &MF)
      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  // Event-to-counter masks for the pre-gfx12 counters; the gfx12-only
  // counters (SAMPLE/BVH/KM) have empty masks here.
  const unsigned *getWaitEventMask() const override {
    assert(ST);

    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
                   VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        0,
        0,
        0};

    return WaitEventMaskForInstPreGFX12;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};

// Waitcnt generation strategy for gfx12+, which uses the extended set of
// counters and per-counter S_WAIT_* instructions.
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
  WaitcntGeneratorGFX12Plus() = default;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter)
      : WaitcntGenerator(MF, MaxCounter) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  // Event-to-counter masks for gfx12+: sampler/BVH/SMEM events move to
  // their own dedicated counters.
  const unsigned *getWaitEventMask() const override {
    assert(ST);

    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
        eventMask({LDS_ACCESS, GDS_ACCESS}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        eventMask({VMEM_SAMPLER_READ_ACCESS}),
        eventMask({VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, SQ_MESSAGE})};

    return WaitEventMaskForInstGFX12Plus;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;
  AliasAnalysis *AA = nullptr;

  // Per-basic-block incoming bracket state plus a dirty flag for the
  // iterative dataflow over the CFG.
  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  InstCounterType SmemAccessCounter;

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
  // because of amdgpu-waitcnt-forcezero flag
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];
6070b57cec5SDimitry Andric 6087a6dacacSDimitry Andric // In any given run of this pass, WCG will point to one of these two 6097a6dacacSDimitry Andric // generator objects, which must have been re-initialised before use 6107a6dacacSDimitry Andric // from a value made using a subtarget constructor. 6117a6dacacSDimitry Andric WaitcntGeneratorPreGFX12 WCGPreGFX12; 6127a6dacacSDimitry Andric WaitcntGeneratorGFX12Plus WCGGFX12Plus; 6137a6dacacSDimitry Andric 6147a6dacacSDimitry Andric WaitcntGenerator *WCG = nullptr; 6157a6dacacSDimitry Andric 61606c3fb27SDimitry Andric // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS 61706c3fb27SDimitry Andric // message. 61806c3fb27SDimitry Andric DenseSet<MachineInstr *> ReleaseVGPRInsts; 61906c3fb27SDimitry Andric 6207a6dacacSDimitry Andric InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; 6217a6dacacSDimitry Andric 6220b57cec5SDimitry Andric public: 6230b57cec5SDimitry Andric static char ID; 6240b57cec5SDimitry Andric 6250b57cec5SDimitry Andric SIInsertWaitcnts() : MachineFunctionPass(ID) { 6260b57cec5SDimitry Andric (void)ForceExpCounter; 6270b57cec5SDimitry Andric (void)ForceLgkmCounter; 6280b57cec5SDimitry Andric (void)ForceVMCounter; 6290b57cec5SDimitry Andric } 6300b57cec5SDimitry Andric 63181ad6265SDimitry Andric bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); 63281ad6265SDimitry Andric bool isPreheaderToFlush(MachineBasicBlock &MBB, 63381ad6265SDimitry Andric WaitcntBrackets &ScoreBrackets); 63406c3fb27SDimitry Andric bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; 6350b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 6360b57cec5SDimitry Andric 6370b57cec5SDimitry Andric StringRef getPassName() const override { 6380b57cec5SDimitry Andric return "SI insert wait instructions"; 6390b57cec5SDimitry Andric } 6400b57cec5SDimitry Andric 6410b57cec5SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 6420b57cec5SDimitry Andric 
AU.setPreservesCFG(); 6430fca6ea1SDimitry Andric AU.addRequired<MachineLoopInfoWrapperPass>(); 6440fca6ea1SDimitry Andric AU.addRequired<MachinePostDominatorTreeWrapperPass>(); 6457a6dacacSDimitry Andric AU.addUsedIfAvailable<AAResultsWrapperPass>(); 6467a6dacacSDimitry Andric AU.addPreserved<AAResultsWrapperPass>(); 6470b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 6480b57cec5SDimitry Andric } 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric bool isForceEmitWaitcnt() const { 6510b57cec5SDimitry Andric for (auto T : inst_counter_types()) 6520b57cec5SDimitry Andric if (ForceEmitWaitcnt[T]) 6530b57cec5SDimitry Andric return true; 6540b57cec5SDimitry Andric return false; 6550b57cec5SDimitry Andric } 6560b57cec5SDimitry Andric 6570b57cec5SDimitry Andric void setForceEmitWaitcnt() { 6580b57cec5SDimitry Andric // For non-debug builds, ForceEmitWaitcnt has been initialized to false; 6590b57cec5SDimitry Andric // For debug builds, get the debug counter info and adjust if need be 6600b57cec5SDimitry Andric #ifndef NDEBUG 6610b57cec5SDimitry Andric if (DebugCounter::isCounterSet(ForceExpCounter) && 6620b57cec5SDimitry Andric DebugCounter::shouldExecute(ForceExpCounter)) { 6630b57cec5SDimitry Andric ForceEmitWaitcnt[EXP_CNT] = true; 6640b57cec5SDimitry Andric } else { 6650b57cec5SDimitry Andric ForceEmitWaitcnt[EXP_CNT] = false; 6660b57cec5SDimitry Andric } 6670b57cec5SDimitry Andric 6680b57cec5SDimitry Andric if (DebugCounter::isCounterSet(ForceLgkmCounter) && 6690b57cec5SDimitry Andric DebugCounter::shouldExecute(ForceLgkmCounter)) { 6707a6dacacSDimitry Andric ForceEmitWaitcnt[DS_CNT] = true; 6717a6dacacSDimitry Andric ForceEmitWaitcnt[KM_CNT] = true; 6720b57cec5SDimitry Andric } else { 6737a6dacacSDimitry Andric ForceEmitWaitcnt[DS_CNT] = false; 6747a6dacacSDimitry Andric ForceEmitWaitcnt[KM_CNT] = false; 6750b57cec5SDimitry Andric } 6760b57cec5SDimitry Andric 6770b57cec5SDimitry Andric if (DebugCounter::isCounterSet(ForceVMCounter) && 
6780b57cec5SDimitry Andric DebugCounter::shouldExecute(ForceVMCounter)) { 6797a6dacacSDimitry Andric ForceEmitWaitcnt[LOAD_CNT] = true; 6807a6dacacSDimitry Andric ForceEmitWaitcnt[SAMPLE_CNT] = true; 6817a6dacacSDimitry Andric ForceEmitWaitcnt[BVH_CNT] = true; 6820b57cec5SDimitry Andric } else { 6837a6dacacSDimitry Andric ForceEmitWaitcnt[LOAD_CNT] = false; 6847a6dacacSDimitry Andric ForceEmitWaitcnt[SAMPLE_CNT] = false; 6857a6dacacSDimitry Andric ForceEmitWaitcnt[BVH_CNT] = false; 6860b57cec5SDimitry Andric } 6870b57cec5SDimitry Andric #endif // NDEBUG 6880b57cec5SDimitry Andric } 6890b57cec5SDimitry Andric 690bdd1243dSDimitry Andric // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or 691bdd1243dSDimitry Andric // FLAT instruction. 692bdd1243dSDimitry Andric WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { 6937a6dacacSDimitry Andric // Maps VMEM access types to their corresponding WaitEventType. 6947a6dacacSDimitry Andric static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { 6957a6dacacSDimitry Andric VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; 6967a6dacacSDimitry Andric 697bdd1243dSDimitry Andric assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); 6985f757f3fSDimitry Andric // LDS DMA loads are also stores, but on the LDS side. On the VMEM side 6995f757f3fSDimitry Andric // these should use VM_CNT. 7005f757f3fSDimitry Andric if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) 701bdd1243dSDimitry Andric return VMEM_ACCESS; 7020fca6ea1SDimitry Andric if (Inst.mayStore() && 7030fca6ea1SDimitry Andric (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { 70406c3fb27SDimitry Andric // FLAT and SCRATCH instructions may access scratch. Other VMEM 70506c3fb27SDimitry Andric // instructions do not. 
70606c3fb27SDimitry Andric if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) 70706c3fb27SDimitry Andric return SCRATCH_WRITE_ACCESS; 708bdd1243dSDimitry Andric return VMEM_WRITE_ACCESS; 70906c3fb27SDimitry Andric } 7107a6dacacSDimitry Andric if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) 711bdd1243dSDimitry Andric return VMEM_READ_ACCESS; 7127a6dacacSDimitry Andric return VmemReadMapping[getVmemType(Inst)]; 713bdd1243dSDimitry Andric } 714bdd1243dSDimitry Andric 715e8d8bef9SDimitry Andric bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; 7160b57cec5SDimitry Andric bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; 71706c3fb27SDimitry Andric bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; 7180b57cec5SDimitry Andric bool generateWaitcntInstBefore(MachineInstr &MI, 7190b57cec5SDimitry Andric WaitcntBrackets &ScoreBrackets, 72081ad6265SDimitry Andric MachineInstr *OldWaitcntInstr, 72181ad6265SDimitry Andric bool FlushVmCnt); 72281ad6265SDimitry Andric bool generateWaitcnt(AMDGPU::Waitcnt Wait, 72381ad6265SDimitry Andric MachineBasicBlock::instr_iterator It, 72481ad6265SDimitry Andric MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, 7250b57cec5SDimitry Andric MachineInstr *OldWaitcntInstr); 7260b57cec5SDimitry Andric void updateEventWaitcntAfter(MachineInstr &Inst, 7270b57cec5SDimitry Andric WaitcntBrackets *ScoreBrackets); 7280b57cec5SDimitry Andric bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, 7290b57cec5SDimitry Andric WaitcntBrackets &ScoreBrackets); 7300b57cec5SDimitry Andric }; 7310b57cec5SDimitry Andric 7320b57cec5SDimitry Andric } // end anonymous namespace 7330b57cec5SDimitry Andric 7340b57cec5SDimitry Andric RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, 7350b57cec5SDimitry Andric const MachineRegisterInfo *MRI, 7360b57cec5SDimitry Andric const SIRegisterInfo *TRI, 7375ffd83dbSDimitry Andric unsigned OpNo) const { 7380b57cec5SDimitry 
Andric const MachineOperand &Op = MI->getOperand(OpNo); 739fe6060f1SDimitry Andric if (!TRI->isInAllocatableClass(Op.getReg())) 7400b57cec5SDimitry Andric return {-1, -1}; 7410b57cec5SDimitry Andric 7420b57cec5SDimitry Andric // A use via a PW operand does not need a waitcnt. 7430b57cec5SDimitry Andric // A partial write is not a WAW. 7440b57cec5SDimitry Andric assert(!Op.getSubReg() || !Op.isUndef()); 7450b57cec5SDimitry Andric 7460b57cec5SDimitry Andric RegInterval Result; 7470b57cec5SDimitry Andric 7485f757f3fSDimitry Andric unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & 7495f757f3fSDimitry Andric AMDGPU::HWEncoding::REG_IDX_MASK; 7500b57cec5SDimitry Andric 751fe6060f1SDimitry Andric if (TRI->isVectorRegister(*MRI, Op.getReg())) { 7520eae32dcSDimitry Andric assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); 7530eae32dcSDimitry Andric Result.first = Reg - Encoding.VGPR0; 754fe6060f1SDimitry Andric if (TRI->isAGPR(*MRI, Op.getReg())) 755fe6060f1SDimitry Andric Result.first += AGPR_OFFSET; 7560b57cec5SDimitry Andric assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); 7575ffd83dbSDimitry Andric } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { 7580eae32dcSDimitry Andric assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); 7590eae32dcSDimitry Andric Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; 7600b57cec5SDimitry Andric assert(Result.first >= NUM_ALL_VGPRS && 7610b57cec5SDimitry Andric Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); 7620b57cec5SDimitry Andric } 7630b57cec5SDimitry Andric // TODO: Handle TTMP 7645ffd83dbSDimitry Andric // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... 
7650b57cec5SDimitry Andric else 7660b57cec5SDimitry Andric return {-1, -1}; 7670b57cec5SDimitry Andric 768cb14a3feSDimitry Andric const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); 7690b57cec5SDimitry Andric unsigned Size = TRI->getRegSizeInBits(*RC); 7705ffd83dbSDimitry Andric Result.second = Result.first + ((Size + 16) / 32); 7710b57cec5SDimitry Andric 7720b57cec5SDimitry Andric return Result; 7730b57cec5SDimitry Andric } 7740b57cec5SDimitry Andric 7750b57cec5SDimitry Andric void WaitcntBrackets::setExpScore(const MachineInstr *MI, 7760b57cec5SDimitry Andric const SIInstrInfo *TII, 7770b57cec5SDimitry Andric const SIRegisterInfo *TRI, 7780b57cec5SDimitry Andric const MachineRegisterInfo *MRI, unsigned OpNo, 7795ffd83dbSDimitry Andric unsigned Val) { 780cb14a3feSDimitry Andric RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo); 781fe6060f1SDimitry Andric assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); 7825ffd83dbSDimitry Andric for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 7830b57cec5SDimitry Andric setRegScore(RegNo, EXP_CNT, Val); 7840b57cec5SDimitry Andric } 7850b57cec5SDimitry Andric } 7860b57cec5SDimitry Andric 7870b57cec5SDimitry Andric void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, 7880b57cec5SDimitry Andric const SIRegisterInfo *TRI, 7890b57cec5SDimitry Andric const MachineRegisterInfo *MRI, 7900b57cec5SDimitry Andric WaitEventType E, MachineInstr &Inst) { 7917a6dacacSDimitry Andric InstCounterType T = eventCounter(WaitEventMaskForInst, E); 7927a6dacacSDimitry Andric 7937a6dacacSDimitry Andric unsigned UB = getScoreUB(T); 7947a6dacacSDimitry Andric unsigned CurrScore = UB + 1; 7950b57cec5SDimitry Andric if (CurrScore == 0) 7960b57cec5SDimitry Andric report_fatal_error("InsertWaitcnt score wraparound"); 7970b57cec5SDimitry Andric // PendingEvents and ScoreUB need to be update regardless if this event 7980b57cec5SDimitry Andric // changes the score of a register or not. 
7990b57cec5SDimitry Andric // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. 8000b57cec5SDimitry Andric PendingEvents |= 1 << E; 8010b57cec5SDimitry Andric setScoreUB(T, CurrScore); 8020b57cec5SDimitry Andric 8030b57cec5SDimitry Andric if (T == EXP_CNT) { 8040b57cec5SDimitry Andric // Put score on the source vgprs. If this is a store, just use those 8050b57cec5SDimitry Andric // specific register(s). 8060b57cec5SDimitry Andric if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { 8070b57cec5SDimitry Andric int AddrOpIdx = 8080b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); 8090b57cec5SDimitry Andric // All GDS operations must protect their address register (same as 8100b57cec5SDimitry Andric // export.) 8110b57cec5SDimitry Andric if (AddrOpIdx != -1) { 8120b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); 8130b57cec5SDimitry Andric } 8140b57cec5SDimitry Andric 8150b57cec5SDimitry Andric if (Inst.mayStore()) { 816bdd1243dSDimitry Andric if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) { 8170b57cec5SDimitry Andric setExpScore( 8180b57cec5SDimitry Andric &Inst, TII, TRI, MRI, 8190b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), 8200b57cec5SDimitry Andric CurrScore); 8210b57cec5SDimitry Andric } 822bdd1243dSDimitry Andric if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) { 8230b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, 8240b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 8250b57cec5SDimitry Andric AMDGPU::OpName::data1), 8260b57cec5SDimitry Andric CurrScore); 8270b57cec5SDimitry Andric } 8285f757f3fSDimitry Andric } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && 8290b57cec5SDimitry Andric Inst.getOpcode() != AMDGPU::DS_APPEND && 8300b57cec5SDimitry Andric Inst.getOpcode() != AMDGPU::DS_CONSUME && 8310b57cec5SDimitry Andric 
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 8320b57cec5SDimitry Andric for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 8330b57cec5SDimitry Andric const MachineOperand &Op = Inst.getOperand(I); 834fe6060f1SDimitry Andric if (Op.isReg() && !Op.isDef() && 835fe6060f1SDimitry Andric TRI->isVectorRegister(*MRI, Op.getReg())) { 8360b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 8370b57cec5SDimitry Andric } 8380b57cec5SDimitry Andric } 8390b57cec5SDimitry Andric } 8400b57cec5SDimitry Andric } else if (TII->isFLAT(Inst)) { 8410b57cec5SDimitry Andric if (Inst.mayStore()) { 8420b57cec5SDimitry Andric setExpScore( 8430b57cec5SDimitry Andric &Inst, TII, TRI, MRI, 8440b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 8450b57cec5SDimitry Andric CurrScore); 846fe6060f1SDimitry Andric } else if (SIInstrInfo::isAtomicRet(Inst)) { 8470b57cec5SDimitry Andric setExpScore( 8480b57cec5SDimitry Andric &Inst, TII, TRI, MRI, 8490b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 8500b57cec5SDimitry Andric CurrScore); 8510b57cec5SDimitry Andric } 8520b57cec5SDimitry Andric } else if (TII->isMIMG(Inst)) { 8530b57cec5SDimitry Andric if (Inst.mayStore()) { 8540b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 855fe6060f1SDimitry Andric } else if (SIInstrInfo::isAtomicRet(Inst)) { 8560b57cec5SDimitry Andric setExpScore( 8570b57cec5SDimitry Andric &Inst, TII, TRI, MRI, 8580b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 8590b57cec5SDimitry Andric CurrScore); 8600b57cec5SDimitry Andric } 8610b57cec5SDimitry Andric } else if (TII->isMTBUF(Inst)) { 8620b57cec5SDimitry Andric if (Inst.mayStore()) { 8630b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 8640b57cec5SDimitry Andric } 8650b57cec5SDimitry Andric } else if (TII->isMUBUF(Inst)) { 8660b57cec5SDimitry Andric if (Inst.mayStore()) { 
8670b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 868fe6060f1SDimitry Andric } else if (SIInstrInfo::isAtomicRet(Inst)) { 8690b57cec5SDimitry Andric setExpScore( 8700b57cec5SDimitry Andric &Inst, TII, TRI, MRI, 8710b57cec5SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 8720b57cec5SDimitry Andric CurrScore); 8730b57cec5SDimitry Andric } 87481ad6265SDimitry Andric } else if (TII->isLDSDIR(Inst)) { 87581ad6265SDimitry Andric // LDSDIR instructions attach the score to the destination. 87681ad6265SDimitry Andric setExpScore( 87781ad6265SDimitry Andric &Inst, TII, TRI, MRI, 87881ad6265SDimitry Andric AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst), 87981ad6265SDimitry Andric CurrScore); 8800b57cec5SDimitry Andric } else { 8810b57cec5SDimitry Andric if (TII->isEXP(Inst)) { 8820b57cec5SDimitry Andric // For export the destination registers are really temps that 8830b57cec5SDimitry Andric // can be used as the actual source after export patching, so 8840b57cec5SDimitry Andric // we need to treat them like sources and set the EXP_CNT 8850b57cec5SDimitry Andric // score. 
8860b57cec5SDimitry Andric for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 8870b57cec5SDimitry Andric MachineOperand &DefMO = Inst.getOperand(I); 8880b57cec5SDimitry Andric if (DefMO.isReg() && DefMO.isDef() && 8895ffd83dbSDimitry Andric TRI->isVGPR(*MRI, DefMO.getReg())) { 890e8d8bef9SDimitry Andric setRegScore( 891e8d8bef9SDimitry Andric TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), 892e8d8bef9SDimitry Andric EXP_CNT, CurrScore); 8930b57cec5SDimitry Andric } 8940b57cec5SDimitry Andric } 8950b57cec5SDimitry Andric } 8960b57cec5SDimitry Andric for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 8970b57cec5SDimitry Andric MachineOperand &MO = Inst.getOperand(I); 898fe6060f1SDimitry Andric if (MO.isReg() && !MO.isDef() && 899fe6060f1SDimitry Andric TRI->isVectorRegister(*MRI, MO.getReg())) { 9000b57cec5SDimitry Andric setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 9010b57cec5SDimitry Andric } 9020b57cec5SDimitry Andric } 9030b57cec5SDimitry Andric } 9045f757f3fSDimitry Andric } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { 9050b57cec5SDimitry Andric // Match the score to the destination registers. 9060b57cec5SDimitry Andric for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 9075ffd83dbSDimitry Andric auto &Op = Inst.getOperand(I); 9085ffd83dbSDimitry Andric if (!Op.isReg() || !Op.isDef()) 9090b57cec5SDimitry Andric continue; 910cb14a3feSDimitry Andric RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I); 9117a6dacacSDimitry Andric if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { 9125ffd83dbSDimitry Andric if (Interval.first >= NUM_ALL_VGPRS) 9135ffd83dbSDimitry Andric continue; 914bdd1243dSDimitry Andric if (updateVMCntOnly(Inst)) { 9155f757f3fSDimitry Andric // updateVMCntOnly should only leave us with VGPRs 9165f757f3fSDimitry Andric // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR 9175f757f3fSDimitry Andric // defs. 
That's required for a sane index into `VgprMemTypes` below 9185f757f3fSDimitry Andric assert(TRI->isVectorRegister(*MRI, Op.getReg())); 9195ffd83dbSDimitry Andric VmemType V = getVmemType(Inst); 9205ffd83dbSDimitry Andric for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) 9215ffd83dbSDimitry Andric VgprVmemTypes[RegNo] |= 1 << V; 9225ffd83dbSDimitry Andric } 9235ffd83dbSDimitry Andric } 9245ffd83dbSDimitry Andric for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 9250b57cec5SDimitry Andric setRegScore(RegNo, T, CurrScore); 9260b57cec5SDimitry Andric } 9270b57cec5SDimitry Andric } 9285f757f3fSDimitry Andric if (Inst.mayStore() && 9295f757f3fSDimitry Andric (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { 9305f757f3fSDimitry Andric // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS 9315f757f3fSDimitry Andric // written can be accessed. A load from LDS to VMEM does not need a wait. 9327a6dacacSDimitry Andric unsigned Slot = 0; 9337a6dacacSDimitry Andric for (const auto *MemOp : Inst.memoperands()) { 9347a6dacacSDimitry Andric if (!MemOp->isStore() || 9357a6dacacSDimitry Andric MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) 9367a6dacacSDimitry Andric continue; 9377a6dacacSDimitry Andric // Comparing just AA info does not guarantee memoperands are equal 9387a6dacacSDimitry Andric // in general, but this is so for LDS DMA in practice. 9397a6dacacSDimitry Andric auto AAI = MemOp->getAAInfo(); 9407a6dacacSDimitry Andric // Alias scope information gives a way to definitely identify an 9417a6dacacSDimitry Andric // original memory object and practically produced in the module LDS 9427a6dacacSDimitry Andric // lowering pass. If there is no scope available we will not be able 9437a6dacacSDimitry Andric // to disambiguate LDS aliasing as after the module lowering all LDS 9447a6dacacSDimitry Andric // is squashed into a single big object. 
Do not attempt to use one of 9457a6dacacSDimitry Andric // the limited LDSDMAStores for something we will not be able to use 9467a6dacacSDimitry Andric // anyway. 9477a6dacacSDimitry Andric if (!AAI || !AAI.Scope) 9487a6dacacSDimitry Andric break; 9497a6dacacSDimitry Andric for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { 9507a6dacacSDimitry Andric for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { 9517a6dacacSDimitry Andric if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { 9527a6dacacSDimitry Andric Slot = I + 1; 9537a6dacacSDimitry Andric break; 9547a6dacacSDimitry Andric } 9557a6dacacSDimitry Andric } 9567a6dacacSDimitry Andric } 9577a6dacacSDimitry Andric if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) 9587a6dacacSDimitry Andric break; 9597a6dacacSDimitry Andric LDSDMAStores.push_back(&Inst); 9607a6dacacSDimitry Andric Slot = LDSDMAStores.size(); 9617a6dacacSDimitry Andric break; 9627a6dacacSDimitry Andric } 9637a6dacacSDimitry Andric setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore); 9647a6dacacSDimitry Andric if (Slot) 9650b57cec5SDimitry Andric setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); 9660b57cec5SDimitry Andric } 9670b57cec5SDimitry Andric } 9680b57cec5SDimitry Andric } 9690b57cec5SDimitry Andric 9700b57cec5SDimitry Andric void WaitcntBrackets::print(raw_ostream &OS) { 9710b57cec5SDimitry Andric OS << '\n'; 9727a6dacacSDimitry Andric for (auto T : inst_counter_types(MaxCounter)) { 973bdd1243dSDimitry Andric unsigned SR = getScoreRange(T); 9740b57cec5SDimitry Andric 9750b57cec5SDimitry Andric switch (T) { 9767a6dacacSDimitry Andric case LOAD_CNT: 9777a6dacacSDimitry Andric OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" 9787a6dacacSDimitry Andric << SR << "): "; 9790b57cec5SDimitry Andric break; 9807a6dacacSDimitry Andric case DS_CNT: 9817a6dacacSDimitry Andric OS << " " << (ST->hasExtendedWaitCounts() ? 
"DS" : "LGKM") << "_CNT(" 9827a6dacacSDimitry Andric << SR << "): "; 9830b57cec5SDimitry Andric break; 9840b57cec5SDimitry Andric case EXP_CNT: 985bdd1243dSDimitry Andric OS << " EXP_CNT(" << SR << "): "; 9860b57cec5SDimitry Andric break; 9877a6dacacSDimitry Andric case STORE_CNT: 9887a6dacacSDimitry Andric OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" 9897a6dacacSDimitry Andric << SR << "): "; 9907a6dacacSDimitry Andric break; 9917a6dacacSDimitry Andric case SAMPLE_CNT: 9927a6dacacSDimitry Andric OS << " SAMPLE_CNT(" << SR << "): "; 9937a6dacacSDimitry Andric break; 9947a6dacacSDimitry Andric case BVH_CNT: 9957a6dacacSDimitry Andric OS << " BVH_CNT(" << SR << "): "; 9967a6dacacSDimitry Andric break; 9977a6dacacSDimitry Andric case KM_CNT: 9987a6dacacSDimitry Andric OS << " KM_CNT(" << SR << "): "; 9990b57cec5SDimitry Andric break; 10000b57cec5SDimitry Andric default: 1001bdd1243dSDimitry Andric OS << " UNKNOWN(" << SR << "): "; 10020b57cec5SDimitry Andric break; 10030b57cec5SDimitry Andric } 10040b57cec5SDimitry Andric 1005bdd1243dSDimitry Andric if (SR != 0) { 10060b57cec5SDimitry Andric // Print vgpr scores. 1007bdd1243dSDimitry Andric unsigned LB = getScoreLB(T); 1008bdd1243dSDimitry Andric 10095ffd83dbSDimitry Andric for (int J = 0; J <= VgprUB; J++) { 10105ffd83dbSDimitry Andric unsigned RegScore = getRegScore(J, T); 10110b57cec5SDimitry Andric if (RegScore <= LB) 10120b57cec5SDimitry Andric continue; 10135ffd83dbSDimitry Andric unsigned RelScore = RegScore - LB - 1; 10140b57cec5SDimitry Andric if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { 10150b57cec5SDimitry Andric OS << RelScore << ":v" << J << " "; 10160b57cec5SDimitry Andric } else { 10170b57cec5SDimitry Andric OS << RelScore << ":ds "; 10180b57cec5SDimitry Andric } 10190b57cec5SDimitry Andric } 10200b57cec5SDimitry Andric // Also need to print sgpr scores for lgkm_cnt. 
      // NOTE(review): tail of a score-dump routine whose start is above this
      // chunk. For the SMEM access counter, print the relative score of each
      // tracked SGPR (SGPRs live above the VGPR slots, hence NUM_ALL_VGPRS).
      if (T == SmemAccessCounter) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
          // Scores at or below the lower bound are already resolved.
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  // Simplify each hardware counter independently.
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
}

/// Drop the wait on counter \p T if it is redundant: \p Count is set to ~0u
/// (the "no wait required" sentinel) when it is at least as large as the
/// number of outstanding events.
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  // The number of outstanding events for this type, T, can be calculated
  // as (UB - LB). If the current Count is greater than or equal to the number
  // of outstanding events, then the wait for this counter is redundant.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}

/// Compute the wait (if any) needed on counter \p T before the register slot
/// \p RegNo can be safely accessed, and merge it into \p Wait.
void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
                                    AMDGPU::Waitcnt &Wait) const {
  unsigned ScoreToWait = getRegScore(RegNo, T);

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}

/// Apply the effect of a resolved \p Wait to every counter's bracket.
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
}

/// Update the bracket for counter \p T to reflect a wait of \p Count.
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  // A count at or above the upper bound waits for nothing in flight.
  if (Count >= UB)
    return;
  if (Count != 0) {
    // A partial wait only raises the lower bound -- and only when the
    // counter is known to decrement in order.
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    // A wait of 0 drains the counter completely: close the bracket and
    // clear all pending events of this type.
    setScoreLB(T, UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory read always can go out of order.
  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
    return true;
  return hasMixedPendingEvents(T);
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

/// Set the named immediate operand of \p MI to \p NewEnc.
/// \returns true if the operand value actually changed.
static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
                                     unsigned NewEnc) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  assert(OpIdx >= 0);

  MachineOperand &MO = MI.getOperand(OpIdx);

  if (NewEnc == MO.getImm())
    return false;

  MO.setImm(NewEnc);
  return true;
}

/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on.
11507a6dacacSDimitry Andric static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { 11517a6dacacSDimitry Andric switch (Opcode) { 11527a6dacacSDimitry Andric case AMDGPU::S_WAIT_LOADCNT: 11537a6dacacSDimitry Andric return LOAD_CNT; 11547a6dacacSDimitry Andric case AMDGPU::S_WAIT_EXPCNT: 11557a6dacacSDimitry Andric return EXP_CNT; 11567a6dacacSDimitry Andric case AMDGPU::S_WAIT_STORECNT: 11577a6dacacSDimitry Andric return STORE_CNT; 11587a6dacacSDimitry Andric case AMDGPU::S_WAIT_SAMPLECNT: 11597a6dacacSDimitry Andric return SAMPLE_CNT; 11607a6dacacSDimitry Andric case AMDGPU::S_WAIT_BVHCNT: 11617a6dacacSDimitry Andric return BVH_CNT; 11627a6dacacSDimitry Andric case AMDGPU::S_WAIT_DSCNT: 11637a6dacacSDimitry Andric return DS_CNT; 11647a6dacacSDimitry Andric case AMDGPU::S_WAIT_KMCNT: 11657a6dacacSDimitry Andric return KM_CNT; 11667a6dacacSDimitry Andric default: 11677a6dacacSDimitry Andric return {}; 11687a6dacacSDimitry Andric } 11697a6dacacSDimitry Andric } 11707a6dacacSDimitry Andric 11717a6dacacSDimitry Andric bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { 11727a6dacacSDimitry Andric unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); 11737a6dacacSDimitry Andric if (Opcode == Waitcnt->getOpcode()) 11745f757f3fSDimitry Andric return false; 11755f757f3fSDimitry Andric 11767a6dacacSDimitry Andric Waitcnt->setDesc(TII->get(Opcode)); 11775f757f3fSDimitry Andric return true; 11785f757f3fSDimitry Andric } 11795f757f3fSDimitry Andric 11807a6dacacSDimitry Andric /// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that 11817a6dacacSDimitry Andric /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits 11827a6dacacSDimitry Andric /// from \p Wait that were added by previous passes. Currently this pass 11837a6dacacSDimitry Andric /// conservatively assumes that these preexisting waits are required for 11847a6dacacSDimitry Andric /// correctness. 
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // At most one S_WAITCNT and one S_WAITCNT_VSCNT survive the merge; these
  // track the instruction of each kind that is kept (if any).
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    // A "soft" wait is one whose opcode differs from its non-soft form; it
    // was added by an earlier pass and may be simplified or removed here
    // (unless we are at -O0).
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  // Rewrite the surviving S_WAITCNT with the merged counts and mark those
  // counters as satisfied (~0u means "no wait required").
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  // Likewise for the surviving S_WAITCNT_VSCNT (store counter).
  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}

/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST->hasVscnt());

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}

/// \returns a Waitcnt waiting on 0 for every counter; the store counter is
/// only included when \p IncludeVSCnt is set and the target has VSCNT.
AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
}

AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
}

/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // One slot per wait-instruction flavor; each keeps the single instruction
  // of that flavor that survives merging (duplicates are erased).
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    MachineInstr **UpdatableInstr;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
    // attempt to do more than that either.
    if (Opcode == AMDGPU::S_WAITCNT)
      continue;

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else {
      // Single-counter S_WAIT_*CNT instruction.
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
      addWait(Wait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnt of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.DsCnt != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.
    SmallVector<MachineInstr **, 2> WaitsToErase;

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Update or erase each remaining single-counter wait instruction.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}

/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Check for opportunities to use combined wait instructions.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}

/// \returns true if \p MI is a conditional branch that reads a non-undef VCCZ.
static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
  // Currently all conventions wait, but this may not always be the case.
  //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // senses to omit the wait and do it in the caller.
  return true;
}

/// \returns true if the callee is expected to wait for any outstanding waits
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
  return true;
}

/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
/// flush the vmcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  // Meta instructions (debug info etc.) emit no machine code and never need a
  // wait.
  if (MI.isMetaInstruction())
    return false;

  // Accumulates the counters that must be waited on before MI; starts as
  // "no wait required" and is tightened below.
  AMDGPU::Waitcnt Wait;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
  // stores. In this case it can be useful to send a message to explicitly
  // release all VGPRs before the stores have completed, but it is only safe to
  // do this if:
  // * there are no outstanding scratch stores
  // * we are not in Dynamic VGPR mode
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
            AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
    Wait.LoadCnt = 0;
  }

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      Wait = AMDGPU::Waitcnt();

      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(CallAddrOpIdx).isReg()) {
        // Wait for any pending SMEM access into the call-target address
        // registers (RAW on the call address).
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);

        // Likewise for the return-address destination registers (WAW with the
        // saved PC).
        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
          // A store to an address previously read by a scalar load: wait on
          // the SMEM counter to preserve ordering.
          addWait(Wait, SmemAccessCounter, 0);
          if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
            SLoadAddresses.erase(Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        // RegNo here is the extra "LDS DMA" tracking slot past the VGPR range.
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        bool FoundAliasingStore = false;
        // Only objects with alias scope info were added to LDSDMAScopes array.
        // In the absence of the scope info we will not be able to disambiguate
        // aliasing here. There is no need to try searching for a corresponding
        // store slot. This is conservatively correct because in that case we
        // will produce a wait using the first (general) LDS DMA wait slot which
        // will wait on all of them anyway.
        if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
              FoundAliasingStore = true;
              // Slot I + 1 is the per-store slot for LDSDMAStores[I].
              ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
            }
          }
        }
        if (!FoundAliasingStore)
          ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they are (in some architectures)
            // guaranteed to write their results in order anyway.
            if (Op.isUse() || !updateVMCntOnly(MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                       getVmemType(MI)) ||
                !ST->hasVmemWriteVgprInOrder()) {
              ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
              ScoreBrackets.clearVgprVmemTypes(RegNo);
            }
            if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
              ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
            }
            ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
          } else {
            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
          }
        }
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
  // not, we need to ensure the subtarget is capable of backing off barrier
  // instructions in case there are any outstanding memory operations that may
  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
  if (TII->isBarrierStart(MI.getOpcode()) &&
      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  // after fixing the scheduler. Also, the Shader Compiler code is
  // independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.DsCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  if (ForceEmitZeroWaitcnts)
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  // Debug-counter overrides: force individual counters to zero on request.
  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;

  // Flush all pending VMEM-class counters if the caller asked for it (used
  // before loop preheaders' terminators).
  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}

/// Materialize the wait \p Wait before position \p It in \p Block, preferring
/// to fold it into the pre-existing waitcnt \p OldWaitcntInstr (if any) or a
/// following VINTERP's waitexp operand before emitting a new instruction.
/// \returns true if any instruction was created, modified or erased.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten; never relax an existing (smaller) waitexp.
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    // ~0u marks the counter as "no wait needed" for createNewWaitcnt below.
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
1895e8d8bef9SDimitry Andric bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { 1896e8d8bef9SDimitry Andric assert(TII->isFLAT(MI)); 1897e8d8bef9SDimitry Andric 1898e8d8bef9SDimitry Andric // All flat instructions use the VMEM counter. 1899e8d8bef9SDimitry Andric assert(TII->usesVM_CNT(MI)); 1900e8d8bef9SDimitry Andric 1901e8d8bef9SDimitry Andric // If there are no memory operands then conservatively assume the flat 1902e8d8bef9SDimitry Andric // operation may access VMEM. 19030b57cec5SDimitry Andric if (MI.memoperands_empty()) 19040b57cec5SDimitry Andric return true; 19050b57cec5SDimitry Andric 1906e8d8bef9SDimitry Andric // See if any memory operand specifies an address space that involves VMEM. 1907e8d8bef9SDimitry Andric // Flat operations only supported FLAT, LOCAL (LDS), or address spaces 1908e8d8bef9SDimitry Andric // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION 1909e8d8bef9SDimitry Andric // (GDS) address space is not supported by flat operations. Therefore, simply 1910e8d8bef9SDimitry Andric // return true unless only the LDS address space is found. 1911e8d8bef9SDimitry Andric for (const MachineMemOperand *Memop : MI.memoperands()) { 1912e8d8bef9SDimitry Andric unsigned AS = Memop->getAddrSpace(); 1913e8d8bef9SDimitry Andric assert(AS != AMDGPUAS::REGION_ADDRESS); 1914e8d8bef9SDimitry Andric if (AS != AMDGPUAS::LOCAL_ADDRESS) 1915e8d8bef9SDimitry Andric return true; 1916e8d8bef9SDimitry Andric } 1917e8d8bef9SDimitry Andric 1918e8d8bef9SDimitry Andric return false; 1919e8d8bef9SDimitry Andric } 1920e8d8bef9SDimitry Andric 1921e8d8bef9SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens for 1922e8d8bef9SDimitry Andric // either LDS or FLAT. 
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST->isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
    const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // SCRATCH instructions always access scratch.
  if (TII->isFLATScratch(MI))
    return true;

  // GLOBAL instructions never access scratch.
  if (TII->isFLATGlobal(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves scratch.
  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  });
}

/// \returns true if \p Inst is a global cache invalidate/writeback
/// instruction, which performs no load or store that needs tracking here.
static bool isCacheInvOrWBInst(MachineInstr &Inst) {
  auto Opc = Inst.getOpcode();
  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
         Opc == AMDGPU::GLOBAL_WBINV;
}

/// Record into \p ScoreBrackets the counter events produced by \p Inst, after
/// the instruction has been processed for required waits.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      // GDS operations lock GPRs for EXEC purposes as well as counting as a
      // GDS access.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    // TODO: Track this properly.
    if (isCacheInvOrWBInst(Inst))
      return;

    assert(Inst.mayLoadOrStore());

    // Count how many of the (VMEM, LDS) address spaces this flat operation may
    // touch.
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM be flushed to zero if it
    // is pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                 Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(
          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    // VINTERP carries its own expcnt wait in the waitexp operand; apply it.
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify the export by its target: parameter, position, or other (GPR
    // lock).
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}

/// Rebase \p Score (an event score from this bracket) and \p OtherScore (the
/// corresponding score from the other bracket) onto the merged score range
/// described by \p M, keeping the larger. Scores at or below their old lower
/// bound are already retired and collapse to 0.
/// \returns true if the other bracket's score dominated (i.e. the merge
/// tightened this bracket).
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}

/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // The merged bracket must cover the register ranges of both inputs.
  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types(MaxCounter)) {
    // Merge event flags for this counter
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter. The merged range keeps this bracket's
    // lower bound and is widened to hold the larger of the two pending spans;
    // MyShift/OtherShift rebase each side's scores onto that common range.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

    // SGPR scores are only tracked for the SMEM access counter.
    if (T == SmemAccessCounter) {
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
    }
  }

  // Union the per-VGPR pending VMEM type masks; any newly-set bit means the
  // merged state is strictly stronger.
  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}

/// \returns true if \p Inst is any flavor of wait-count instruction this pass
/// tracks: s_waitcnt, s_waitcnt_vscnt on null, the combined
/// loadcnt/storecnt+dscnt forms, or any single-counter wait (soft variants
/// are mapped to their non-soft opcode first).
static bool isWaitInstr(MachineInstr &Inst) {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
  return Opcode == AMDGPU::S_WAITCNT ||
         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         counterTypeForInstr(Opcode).has_value();
}

// Generate s_waitcnt instructions where needed.
// Insert any needed wait instructions in \p Block, updating \p ScoreBrackets
// as each instruction's memory events are scored. Also tracks and repairs the
// hardware vccz bit on subtargets where it can become stale. Returns true if
// the block was modified.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer. Only the first of a run is remembered;
    // generateWaitcntInstBefore is responsible for combining/erasing the rest.
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // Request a full vmcnt flush before the terminator of a loop preheader
    // when isPreheaderToFlush decides it is profitable.
    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
          Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      // Remember scalar-load addresses so later WAR-conflict checks can see
      // them (keyed by pointer value and the block that loaded it).
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    // Score this instruction's own outstanding-memory events.
    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // In precise-memory mode, force all relevant counters to zero immediately
    // after every memory access (atomic-with-return stores still wait on the
    // load side).
    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
      ScoreBrackets.simplifyWaitcnt(Wait);
      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
    }

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block
  // if needed. This handles the preheader-flush case for blocks with no
  // terminator (fall-through preheaders); the terminator case was handled via
  // FlushVmCnt above.
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets)) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  return Modified;
}

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
// Return true if \p MBB is a preheader of a loop whose vmcnt counter we want
// to flush. Results are memoized in PreheadersToFlush, so each block is
// analyzed at most once per function.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) {
  // try_emplace seeds the cache with 'false'; if the entry already existed we
  // simply return the cached answer.
  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
  if (!IsInserted)
    return Iterator->second;

  // A preheader has exactly one successor: the loop header.
  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return false;

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return false;

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets)) {
    Iterator->second = true;
    return true;
  }

  return false;
}

// Return true if \p MI accesses VMEM: either directly, or through a FLAT
// instruction that may address the global/VMEM aperture.
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
  return SIInstrInfo::isVMEM(MI) ||
         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
}

// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  // VGPRs used in the loop / VGPRs defined by a VMEM load in the loop.
  // Overlap between the two sets means a value loaded inside the loop is also
  // consumed inside it, which disqualifies both situations above.
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(RegNo))
              return false;
            VgprUse.insert(RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(RegNo, LOAD_CNT) >
                    Brackets.getScoreLB(LOAD_CNT) ||
                Brackets.getRegScore(RegNo, SAMPLE_CNT) >
                    Brackets.getScoreLB(SAMPLE_CNT) ||
                Brackets.getRegScore(RegNo, BVH_CNT) >
                    Brackets.getScoreLB(BVH_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
      }
    }
  }
  // Situation 1: stores only, no vscnt counter on this target.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Situation 2: loads whose results are unused in the loop (only valid when
  // VMEM writes to VGPRs complete in order on this target).
  return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}

// Pass entry point: set up subtarget-specific limits and the waitcnt
// generator, then iterate over the blocks in reverse post order until the
// per-block bracket states reach a fixed point, inserting waits as needed.
// Finishes with the scalar-store dcache-wb workaround and VGPR-dealloc
// messages. Returns true if the function was modified.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  // Alias analysis is optional; AA stays null when unavailable.
  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAR->getAAResults();

  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());

  // Pick the waitcnt generator matching the subtarget's counter model:
  // the extended (separate) counters on GFX12+, the classic three-counter
  // scheme otherwise.
  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
    WCG = &WCGPreGFX12;
  }

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();

  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);

  // Maximum encodable value for each counter on this subtarget. On pre-GFX12
  // targets the load/ds slots reuse the legacy vmcnt/lgkmcnt widths.
  HardwareLimits Limits = {};
  if (ST->hasExtendedWaitCounts()) {
    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
    Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
  } else {
    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
  }
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  // Hardware encoding ranges for the VGPR and SGPR files, used to map
  // registers onto score-bracket slots.
  RegisterEncoding Encoding = {};
  Encoding.VGPR0 =
      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 =
      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();
  MachineBasicBlock::iterator I = EntryBB.begin();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.

    // Skip past PHIs and meta instructions so the waits land on the first
    // real instruction.
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;

    if (ST->hasExtendedWaitCounts()) {
      // GFX12+: one combined load/ds wait plus one zero-wait per remaining
      // extended counter (store is covered by the combined form).
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
    } else {
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    // Non-kernel functions start with the conservative "just returned/called"
    // bracket state rather than an empty one.
    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
        ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
        SmemAccessCounter);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo()});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      // Seed the working brackets from the block's incoming state, or a
      // default-constructed state if none has been propagated yet. Reuse the
      // single heap allocation across iterations.
      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(
              ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
              SmemAccessCounter);
        else
          *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
                                      WaitEventMaskForInst, SmemAccessCounter);
      }

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing state to each successor. The state can be
        // moved (not copied) into exactly one successor without an Incoming;
        // a successor at or before the current position in RPO forces another
        // fixed-point iteration.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    // Find all wave-termination blocks and whether any scalar store exists.
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          // A flush followed by another scalar store needs flushing again.
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
  // instructions.
  for (MachineInstr *MI : ReleaseVGPRInsts) {
    // Some subtargets need an S_NOP ahead of the dealloc message.
    if (ST->requiresNopBeforeDeallocVGPRs()) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
    }
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(AMDGPU::S_SENDMSG))
        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
    Modified = true;
  }
  ReleaseVGPRInsts.clear();

  return Modified;
}