xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (revision 6c4b055cfb6bf549e9145dde6454cc6b178c35e4)
10b57cec5SDimitry Andric //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// Insert wait instructions for memory reads and writes.
110b57cec5SDimitry Andric ///
120b57cec5SDimitry Andric /// Memory reads and writes are issued asynchronously, so we need to insert
130b57cec5SDimitry Andric /// S_WAITCNT instructions when we want to access any of their results or
140b57cec5SDimitry Andric /// overwrite any register that's used asynchronously.
150b57cec5SDimitry Andric ///
160b57cec5SDimitry Andric /// TODO: This pass currently keeps one timeline per hardware counter. A more
170b57cec5SDimitry Andric /// finely-grained approach that keeps one timeline per event type could
180b57cec5SDimitry Andric /// sometimes get away with generating weaker s_waitcnt instructions. For
190b57cec5SDimitry Andric /// example, when both SMEM and LDS are in flight and we need to wait for
200b57cec5SDimitry Andric /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
210b57cec5SDimitry Andric /// but the pass will currently generate a conservative lgkmcnt(0) because
220b57cec5SDimitry Andric /// multiple event types are in flight.
230b57cec5SDimitry Andric //
240b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
250b57cec5SDimitry Andric 
260b57cec5SDimitry Andric #include "AMDGPU.h"
27e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
28e8d8bef9SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
290b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
30fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
315ffd83dbSDimitry Andric #include "llvm/ADT/MapVector.h"
320b57cec5SDimitry Andric #include "llvm/ADT/PostOrderIterator.h"
330eae32dcSDimitry Andric #include "llvm/ADT/Sequence.h"
347a6dacacSDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
3581ad6265SDimitry Andric #include "llvm/CodeGen/MachineLoopInfo.h"
36480093f4SDimitry Andric #include "llvm/CodeGen/MachinePostDominators.h"
37480093f4SDimitry Andric #include "llvm/InitializePasses.h"
380b57cec5SDimitry Andric #include "llvm/Support/DebugCounter.h"
3906c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h"
400b57cec5SDimitry Andric using namespace llvm;
410b57cec5SDimitry Andric 
420b57cec5SDimitry Andric #define DEBUG_TYPE "si-insert-waitcnts"
430b57cec5SDimitry Andric 
// Debug counters that can be used (via -debug-counter=...) to force emission
// of a maximal wait for a specific counter, e.g. when bisecting a suspected
// missing-wait miscompile. See llvm/Support/DebugCounter.h.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

// Command-line escape hatch: make every emitted waitcnt fully conservative
// (wait for all counters to drain).
static cl::opt<bool> ForceEmitZeroFlag(
  "amdgpu-waitcnt-forcezero",
  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
  cl::init(false), cl::Hidden);
550b57cec5SDimitry Andric 
560b57cec5SDimitry Andric namespace {
570b57cec5SDimitry Andric // Class of object that encapsulates latest instruction counter score
580b57cec5SDimitry Andric // associated with the operand.  Used for determining whether
59349cc55cSDimitry Andric // s_waitcnt instruction needs to be emitted.
600b57cec5SDimitry Andric 
// Hardware wait counters tracked by this pass. gfx12 renamed and split
// several of the pre-gfx12 counters, so the enum has a "normal" (pre-gfx12)
// prefix and an "extended" (gfx12+) tail; NUM_NORMAL_INST_CNTS marks the
// boundary between the two.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      // Export counter (all targets).
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
730eae32dcSDimitry Andric } // namespace
740b57cec5SDimitry Andric 
750eae32dcSDimitry Andric namespace llvm {
// Opt InstCounterType into llvm::enum_seq so ranges of counters can be
// iterated directly (see inst_counter_types() below).
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
790eae32dcSDimitry Andric } // namespace llvm
800eae32dcSDimitry Andric 
810eae32dcSDimitry Andric namespace {
// Return an iterator over all counters between LOAD_CNT (the first counter)
// and \c MaxCounter (exclusive, default value yields an enumeration over
// all counters). Relies on the enum_iteration_traits specialization of
// InstCounterType above.
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
}
880b57cec5SDimitry Andric 
// Interval of register slot indices in the scoring tables covered by a
// single machine operand (see WaitcntBrackets::getRegInterval).
// NOTE(review): the inclusive/exclusive convention of the second bound is
// established by getRegInterval's definition, which is outside this excerpt.
using RegInterval = std::pair<int, int>;
900b57cec5SDimitry Andric 
// Per-subtarget upper bound of each wait counter, consumed via
// WaitcntBrackets::getWaitCountMax().
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};
1000b57cec5SDimitry Andric 
// Hardware encodings of the first/last VGPR and SGPR, used to translate
// physical registers into scoring-table slot indices.
// NOTE(review): the values are computed outside this excerpt — confirm they
// are raw HW register encodings before relying on them.
struct RegisterEncoding {
  unsigned VGPR0; // Encoding of the first VGPR.
  unsigned VGPRL; // Encoding of the last VGPR.
  unsigned SGPR0; // Encoding of the first SGPR.
  unsigned SGPRL; // Encoding of the last SGPR.
};
1070b57cec5SDimitry Andric 
// Events that make a wait counter tick. Each event maps to exactly one
// InstCounterType via the per-generator event masks (see eventCounter()).
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};
1270b57cec5SDimitry Andric 
1280b57cec5SDimitry Andric // The mapping is:
1290b57cec5SDimitry Andric //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
1300b57cec5SDimitry Andric //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
1310b57cec5SDimitry Andric //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
1320b57cec5SDimitry Andric // We reserve a fixed number of VGPR slots in the scoring tables for
1330b57cec5SDimitry Andric // special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  // (Slot 0 of the extra range, i.e. table index SQ_MAX_PGM_VGPRS + 0.)
  EXTRA_VGPR_LDS = 0,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
1470b57cec5SDimitry Andric 
1485ffd83dbSDimitry Andric // Enumerate different types of result-returning VMEM operations. Although
1495ffd83dbSDimitry Andric // s_waitcnt orders them all with a single vmcnt counter, in the absence of
1505ffd83dbSDimitry Andric // s_waitcnt only instructions of the same VmemType are guaranteed to write
1515ffd83dbSDimitry Andric // their results in order -- so there is no need to insert an s_waitcnt between
1525ffd83dbSDimitry Andric // two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types; used to size per-vgpr type bitmasks. Keep last.
  NUM_VMEM_TYPES
};
1625ffd83dbSDimitry Andric 
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
// Indexed by InstCounterType, so the initializer order must match the
// enum's declaration order (LOAD_CNT .. KM_CNT).
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT};
1707a6dacacSDimitry Andric 
171bdd1243dSDimitry Andric static bool updateVMCntOnly(const MachineInstr &Inst) {
172bdd1243dSDimitry Andric   return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
173bdd1243dSDimitry Andric          SIInstrInfo::isFLATScratch(Inst);
174bdd1243dSDimitry Andric }
175bdd1243dSDimitry Andric 
#ifndef NDEBUG
// True when only the pre-gfx12 ("normal") counter set is in use; assertion
// helper, hence only compiled in builds with asserts enabled.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
1817a6dacacSDimitry Andric 
1825ffd83dbSDimitry Andric VmemType getVmemType(const MachineInstr &Inst) {
183bdd1243dSDimitry Andric   assert(updateVMCntOnly(Inst));
1847a6dacacSDimitry Andric   if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
1857a6dacacSDimitry Andric       !SIInstrInfo::isVSAMPLE(Inst))
1865ffd83dbSDimitry Andric     return VMEM_NOSAMPLER;
1875ffd83dbSDimitry Andric   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
1884824e7fdSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1894824e7fdSDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
1900fca6ea1SDimitry Andric   // We have to make an additional check for isVSAMPLE here since some
1910fca6ea1SDimitry Andric   // instructions don't have a sampler, but are still classified as sampler
1920fca6ea1SDimitry Andric   // instructions for the purposes of e.g. waitcnt.
1934824e7fdSDimitry Andric   return BaseInfo->BVH                                         ? VMEM_BVH
1940fca6ea1SDimitry Andric          : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
1950fca6ea1SDimitry Andric                                                                : VMEM_NOSAMPLER;
1965ffd83dbSDimitry Andric }
1975ffd83dbSDimitry Andric 
// Return a mutable reference to the field of \p Wait that stores the wait
// value for counter \p T. Asserts (via llvm_unreachable) on the meta
// enumerators NUM_*_INST_CNTS, which have no corresponding field.
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  switch (T) {
  case LOAD_CNT:
    return Wait.LoadCnt;
  case EXP_CNT:
    return Wait.ExpCnt;
  case DS_CNT:
    return Wait.DsCnt;
  case STORE_CNT:
    return Wait.StoreCnt;
  case SAMPLE_CNT:
    return Wait.SampleCnt;
  case BVH_CNT:
    return Wait.BvhCnt;
  case KM_CNT:
    return Wait.KmCnt;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}
2180b57cec5SDimitry Andric 
2197a6dacacSDimitry Andric void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
2207a6dacacSDimitry Andric   unsigned &WC = getCounterRef(Wait, T);
2217a6dacacSDimitry Andric   WC = std::min(WC, Count);
2227a6dacacSDimitry Andric }
2237a6dacacSDimitry Andric 
// Mark counter \p T as requiring no wait (~0u; presumably the "unset"
// sentinel of AMDGPU::Waitcnt — its definition is outside this excerpt).
void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  getCounterRef(Wait, T) = ~0u;
}
2277a6dacacSDimitry Andric 
// Read the wait value currently stored in \p Wait for counter \p T.
unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  return getCounterRef(Wait, T);
}
2317a6dacacSDimitry Andric 
2327a6dacacSDimitry Andric // Mapping from event to counter according to the table masks.
2337a6dacacSDimitry Andric InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
2347a6dacacSDimitry Andric   for (auto T : inst_counter_types()) {
2357a6dacacSDimitry Andric     if (masks[T] & (1 << E))
2367a6dacacSDimitry Andric       return T;
2377a6dacacSDimitry Andric   }
2387a6dacacSDimitry Andric   llvm_unreachable("event type has no associated counter");
2397a6dacacSDimitry Andric }
2407a6dacacSDimitry Andric 
// This object maintains the current score brackets of each wait counter, and
2420b57cec5SDimitry Andric // a per-register scoreboard for each wait counter.
2430b57cec5SDimitry Andric //
2440b57cec5SDimitry Andric // We also maintain the latest score for every event type that can change the
2450b57cec5SDimitry Andric // waitcnt in order to know if there are multiple types of events within
2460b57cec5SDimitry Andric // the brackets. When multiple types of event happen in the bracket,
2470b57cec5SDimitry Andric // wait count may get decreased out of order, therefore we need to put in
2480b57cec5SDimitry Andric // "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
                  HardwareLimits Limits, RegisterEncoding Encoding,
                  const unsigned *WaitEventMaskForInst,
                  InstCounterType SmemAccessCounter)
      : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
        Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
        SmemAccessCounter(SmemAccessCounter) {}

  // Maximum value of counter \p T on the current subtarget; 0 for the meta
  // enumerators (NUM_*_INST_CNTS).
  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case DS_CNT:
      return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    case SAMPLE_CNT:
      return Limits.SamplecntMax;
    case BVH_CNT:
      return Limits.BvhcntMax;
    case KM_CNT:
      return Limits.KmcntMax;
    default:
      break;
    }
    return 0;
  }

  // Lower bound of the score bracket for counter \p T.
  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  // Upper bound of the score bracket for counter \p T.
  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  // Width of the bracket (UB - LB) for counter \p T; non-zero exactly when
  // events on T are pending (see the assertion in hasPendingEvent(T)).
  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  // Latest score recorded for register slot \p GprNo on counter \p T.
  // SGPR slots are only tracked for the SMEM access counter.
  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == SmemAccessCounter);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  bool merge(const WaitcntBrackets &Other);

  RegInterval getRegInterval(const MachineInstr *MI,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo) const;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  // Bit set of all WaitEventTypes currently in flight (any counter).
  unsigned hasPendingEvent() const { return PendingEvents; }
  // Non-zero iff event \p E is in flight.
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  // Bit set of in-flight events belonging to counter \p T.
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  // True iff more than one distinct event type is pending on counter \p T
  // (in which case T may retire out of order; see counterOutOfOrder).
  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  // True if the score recorded for the last FLAT operation still lies
  // inside the bracket (LB, UB] of either DS_CNT or LOAD_CNT, i.e. that
  // operation may still be outstanding.
  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  // Record the current upper bounds as the score of the latest FLAT
  // operation (FLAT can count on both LOAD_CNT and DS_CNT).
  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  // Return true if there might be pending writes to the specified vgpr by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
    assert(GprNo < NUM_ALL_VGPRS);
    return VgprVmemTypes[GprNo] & ~(1 << V);
  }

  void clearVgprVmemTypes(int GprNo) {
    assert(GprNo < NUM_ALL_VGPRS);
    VgprVmemTypes[GprNo] = 0;
  }

  // On entry to / return from a function, conservatively assume the maximum
  // possible number of store events is outstanding: widen the STORE_CNT
  // bracket by its hardware limit and mark all STORE_CNT events pending.
  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  // Precomputed values used by mergeScore() when joining two brackets.
  struct MergeInfo {
    unsigned OldLB;     // This bracket's lower bound before the merge.
    unsigned OtherLB;   // The incoming bracket's lower bound.
    unsigned MyShift;   // Amount to rebase this bracket's scores by.
    unsigned OtherShift; // Amount to rebase the incoming scores by.
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    // EXP_CNT is special-cased: never let its bracket grow wider than the
    // hardware limit — advance the lower bound to keep the range encodable.
    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
  }

  // Record score \p Val for register slot \p GprNo on counter \p T, and
  // keep VgprUB/SgprUB (highest slot ever touched) up to date for merge().
  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, GprNo);
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == SmemAccessCounter);
      SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, unsigned Val);

  const GCNSubtarget *ST = nullptr;
  // Exclusive upper bound of the counters in use (see inst_counter_types()).
  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
  HardwareLimits Limits = {};
  RegisterEncoding Encoding = {};
  // Per-counter masks of WaitEventType bits, owned by the WaitcntGenerator.
  const unsigned *WaitEventMaskForInst;
  // The counter SMEM accesses tick on (differs between pre-gfx12 and gfx12+).
  InstCounterType SmemAccessCounter;
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int VgprUB = -1;
  int SgprUB = -1;
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
  // Store representative LDS DMA operations. The only useful info here is
  // alias info. One store is kept per unique AAInfo.
  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
};
4387a6dacacSDimitry Andric 
// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;   // Null only when default-constructed.
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  // Exclusive upper bound of the counters this generator handles
  // (see inst_counter_types()).
  InstCounterType MaxCounter;
  bool OptNone; // Cached "compile without optimization" decision.

public:
  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}

  // Return true if the current function should be compiled with no
  // optimization.
  bool isOptNone() const { return OptNone; }

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
  // were made.
  //
  // This editing will usually merely update operands, but it may also
  // delete instructions if the incoming Wait value indicates they are not
  // needed. It may also remove existing instructions for which a wait
  // is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  // Returns an array of bit masks which can be used to map values in
  // WaitEventType to corresponding counter values in InstCounterType.
  virtual const unsigned *getWaitEventMask() const = 0;

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  // Create a mask value from the initializer list of wait event types.
  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    unsigned Mask = 0;
    for (auto &E : Events)
      Mask |= 1 << E;

    return Mask;
  }
};
5107a6dacacSDimitry Andric 
5117a6dacacSDimitry Andric class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
5127a6dacacSDimitry Andric public:
5130fca6ea1SDimitry Andric   WaitcntGeneratorPreGFX12() = default;
5140fca6ea1SDimitry Andric   WaitcntGeneratorPreGFX12(const MachineFunction &MF)
5150fca6ea1SDimitry Andric       : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
5167a6dacacSDimitry Andric 
5177a6dacacSDimitry Andric   bool
5187a6dacacSDimitry Andric   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
5197a6dacacSDimitry Andric                           MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
5207a6dacacSDimitry Andric                           MachineBasicBlock::instr_iterator It) const override;
5217a6dacacSDimitry Andric 
5227a6dacacSDimitry Andric   bool createNewWaitcnt(MachineBasicBlock &Block,
5237a6dacacSDimitry Andric                         MachineBasicBlock::instr_iterator It,
5247a6dacacSDimitry Andric                         AMDGPU::Waitcnt Wait) override;
5257a6dacacSDimitry Andric 
5267a6dacacSDimitry Andric   const unsigned *getWaitEventMask() const override {
5277a6dacacSDimitry Andric     assert(ST);
5287a6dacacSDimitry Andric 
5297a6dacacSDimitry Andric     static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
5300fca6ea1SDimitry Andric         eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
5310fca6ea1SDimitry Andric                    VMEM_BVH_READ_ACCESS}),
5320fca6ea1SDimitry Andric         eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
5330fca6ea1SDimitry Andric         eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
5340fca6ea1SDimitry Andric                    EXP_POS_ACCESS, EXP_LDS_ACCESS}),
5350fca6ea1SDimitry Andric         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
5367a6dacacSDimitry Andric         0,
5377a6dacacSDimitry Andric         0,
5387a6dacacSDimitry Andric         0};
5397a6dacacSDimitry Andric 
5407a6dacacSDimitry Andric     return WaitEventMaskForInstPreGFX12;
5417a6dacacSDimitry Andric   }
5420fca6ea1SDimitry Andric 
5430fca6ea1SDimitry Andric   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
5447a6dacacSDimitry Andric };
5457a6dacacSDimitry Andric 
5467a6dacacSDimitry Andric class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
5477a6dacacSDimitry Andric public:
5480fca6ea1SDimitry Andric   WaitcntGeneratorGFX12Plus() = default;
5490fca6ea1SDimitry Andric   WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
5500fca6ea1SDimitry Andric                             InstCounterType MaxCounter)
5510fca6ea1SDimitry Andric       : WaitcntGenerator(MF, MaxCounter) {}
5527a6dacacSDimitry Andric 
5537a6dacacSDimitry Andric   bool
5547a6dacacSDimitry Andric   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
5557a6dacacSDimitry Andric                           MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
5567a6dacacSDimitry Andric                           MachineBasicBlock::instr_iterator It) const override;
5577a6dacacSDimitry Andric 
5587a6dacacSDimitry Andric   bool createNewWaitcnt(MachineBasicBlock &Block,
5597a6dacacSDimitry Andric                         MachineBasicBlock::instr_iterator It,
5607a6dacacSDimitry Andric                         AMDGPU::Waitcnt Wait) override;
5617a6dacacSDimitry Andric 
5627a6dacacSDimitry Andric   const unsigned *getWaitEventMask() const override {
5637a6dacacSDimitry Andric     assert(ST);
5647a6dacacSDimitry Andric 
5657a6dacacSDimitry Andric     static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
5660fca6ea1SDimitry Andric         eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
5670fca6ea1SDimitry Andric         eventMask({LDS_ACCESS, GDS_ACCESS}),
5680fca6ea1SDimitry Andric         eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
5690fca6ea1SDimitry Andric                    EXP_POS_ACCESS, EXP_LDS_ACCESS}),
5700fca6ea1SDimitry Andric         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
5710fca6ea1SDimitry Andric         eventMask({VMEM_SAMPLER_READ_ACCESS}),
5720fca6ea1SDimitry Andric         eventMask({VMEM_BVH_READ_ACCESS}),
5730fca6ea1SDimitry Andric         eventMask({SMEM_ACCESS, SQ_MESSAGE})};
5747a6dacacSDimitry Andric 
5757a6dacacSDimitry Andric     return WaitEventMaskForInstGFX12Plus;
5767a6dacacSDimitry Andric   }
5770fca6ea1SDimitry Andric 
5780fca6ea1SDimitry Andric   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
5790b57cec5SDimitry Andric };
5800b57cec5SDimitry Andric 
5810b57cec5SDimitry Andric class SIInsertWaitcnts : public MachineFunctionPass {
5820b57cec5SDimitry Andric private:
5830b57cec5SDimitry Andric   const GCNSubtarget *ST = nullptr;
5840b57cec5SDimitry Andric   const SIInstrInfo *TII = nullptr;
5850b57cec5SDimitry Andric   const SIRegisterInfo *TRI = nullptr;
5860b57cec5SDimitry Andric   const MachineRegisterInfo *MRI = nullptr;
5870b57cec5SDimitry Andric 
588480093f4SDimitry Andric   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
58981ad6265SDimitry Andric   DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
59081ad6265SDimitry Andric   MachineLoopInfo *MLI;
591480093f4SDimitry Andric   MachinePostDominatorTree *PDT;
5927a6dacacSDimitry Andric   AliasAnalysis *AA = nullptr;
5930b57cec5SDimitry Andric 
5940b57cec5SDimitry Andric   struct BlockInfo {
5950b57cec5SDimitry Andric     std::unique_ptr<WaitcntBrackets> Incoming;
5960b57cec5SDimitry Andric     bool Dirty = true;
5970b57cec5SDimitry Andric   };
5980b57cec5SDimitry Andric 
5997a6dacacSDimitry Andric   InstCounterType SmemAccessCounter;
6007a6dacacSDimitry Andric 
6015ffd83dbSDimitry Andric   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
6020b57cec5SDimitry Andric 
6030b57cec5SDimitry Andric   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
6040b57cec5SDimitry Andric   // because of amdgpu-waitcnt-forcezero flag
6050b57cec5SDimitry Andric   bool ForceEmitZeroWaitcnts;
6060b57cec5SDimitry Andric   bool ForceEmitWaitcnt[NUM_INST_CNTS];
6070b57cec5SDimitry Andric 
6087a6dacacSDimitry Andric   // In any given run of this pass, WCG will point to one of these two
6097a6dacacSDimitry Andric   // generator objects, which must have been re-initialised before use
6107a6dacacSDimitry Andric   // from a value made using a subtarget constructor.
6117a6dacacSDimitry Andric   WaitcntGeneratorPreGFX12 WCGPreGFX12;
6127a6dacacSDimitry Andric   WaitcntGeneratorGFX12Plus WCGGFX12Plus;
6137a6dacacSDimitry Andric 
6147a6dacacSDimitry Andric   WaitcntGenerator *WCG = nullptr;
6157a6dacacSDimitry Andric 
61606c3fb27SDimitry Andric   // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
61706c3fb27SDimitry Andric   // message.
61806c3fb27SDimitry Andric   DenseSet<MachineInstr *> ReleaseVGPRInsts;
61906c3fb27SDimitry Andric 
6207a6dacacSDimitry Andric   InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
6217a6dacacSDimitry Andric 
6220b57cec5SDimitry Andric public:
6230b57cec5SDimitry Andric   static char ID;
6240b57cec5SDimitry Andric 
6250b57cec5SDimitry Andric   SIInsertWaitcnts() : MachineFunctionPass(ID) {
6260b57cec5SDimitry Andric     (void)ForceExpCounter;
6270b57cec5SDimitry Andric     (void)ForceLgkmCounter;
6280b57cec5SDimitry Andric     (void)ForceVMCounter;
6290b57cec5SDimitry Andric   }
6300b57cec5SDimitry Andric 
63181ad6265SDimitry Andric   bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
63281ad6265SDimitry Andric   bool isPreheaderToFlush(MachineBasicBlock &MBB,
63381ad6265SDimitry Andric                           WaitcntBrackets &ScoreBrackets);
63406c3fb27SDimitry Andric   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
6350b57cec5SDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
6360b57cec5SDimitry Andric 
6370b57cec5SDimitry Andric   StringRef getPassName() const override {
6380b57cec5SDimitry Andric     return "SI insert wait instructions";
6390b57cec5SDimitry Andric   }
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
6420b57cec5SDimitry Andric     AU.setPreservesCFG();
6430fca6ea1SDimitry Andric     AU.addRequired<MachineLoopInfoWrapperPass>();
6440fca6ea1SDimitry Andric     AU.addRequired<MachinePostDominatorTreeWrapperPass>();
6457a6dacacSDimitry Andric     AU.addUsedIfAvailable<AAResultsWrapperPass>();
6467a6dacacSDimitry Andric     AU.addPreserved<AAResultsWrapperPass>();
6470b57cec5SDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
6480b57cec5SDimitry Andric   }
6490b57cec5SDimitry Andric 
6500b57cec5SDimitry Andric   bool isForceEmitWaitcnt() const {
6510b57cec5SDimitry Andric     for (auto T : inst_counter_types())
6520b57cec5SDimitry Andric       if (ForceEmitWaitcnt[T])
6530b57cec5SDimitry Andric         return true;
6540b57cec5SDimitry Andric     return false;
6550b57cec5SDimitry Andric   }
6560b57cec5SDimitry Andric 
6570b57cec5SDimitry Andric   void setForceEmitWaitcnt() {
6580b57cec5SDimitry Andric // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
6590b57cec5SDimitry Andric // For debug builds, get the debug counter info and adjust if need be
6600b57cec5SDimitry Andric #ifndef NDEBUG
6610b57cec5SDimitry Andric     if (DebugCounter::isCounterSet(ForceExpCounter) &&
6620b57cec5SDimitry Andric         DebugCounter::shouldExecute(ForceExpCounter)) {
6630b57cec5SDimitry Andric       ForceEmitWaitcnt[EXP_CNT] = true;
6640b57cec5SDimitry Andric     } else {
6650b57cec5SDimitry Andric       ForceEmitWaitcnt[EXP_CNT] = false;
6660b57cec5SDimitry Andric     }
6670b57cec5SDimitry Andric 
6680b57cec5SDimitry Andric     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
6690b57cec5SDimitry Andric         DebugCounter::shouldExecute(ForceLgkmCounter)) {
6707a6dacacSDimitry Andric       ForceEmitWaitcnt[DS_CNT] = true;
6717a6dacacSDimitry Andric       ForceEmitWaitcnt[KM_CNT] = true;
6720b57cec5SDimitry Andric     } else {
6737a6dacacSDimitry Andric       ForceEmitWaitcnt[DS_CNT] = false;
6747a6dacacSDimitry Andric       ForceEmitWaitcnt[KM_CNT] = false;
6750b57cec5SDimitry Andric     }
6760b57cec5SDimitry Andric 
6770b57cec5SDimitry Andric     if (DebugCounter::isCounterSet(ForceVMCounter) &&
6780b57cec5SDimitry Andric         DebugCounter::shouldExecute(ForceVMCounter)) {
6797a6dacacSDimitry Andric       ForceEmitWaitcnt[LOAD_CNT] = true;
6807a6dacacSDimitry Andric       ForceEmitWaitcnt[SAMPLE_CNT] = true;
6817a6dacacSDimitry Andric       ForceEmitWaitcnt[BVH_CNT] = true;
6820b57cec5SDimitry Andric     } else {
6837a6dacacSDimitry Andric       ForceEmitWaitcnt[LOAD_CNT] = false;
6847a6dacacSDimitry Andric       ForceEmitWaitcnt[SAMPLE_CNT] = false;
6857a6dacacSDimitry Andric       ForceEmitWaitcnt[BVH_CNT] = false;
6860b57cec5SDimitry Andric     }
6870b57cec5SDimitry Andric #endif // NDEBUG
6880b57cec5SDimitry Andric   }
6890b57cec5SDimitry Andric 
690bdd1243dSDimitry Andric   // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
691bdd1243dSDimitry Andric   // FLAT instruction.
692bdd1243dSDimitry Andric   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
6937a6dacacSDimitry Andric     // Maps VMEM access types to their corresponding WaitEventType.
6947a6dacacSDimitry Andric     static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
6957a6dacacSDimitry Andric         VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
6967a6dacacSDimitry Andric 
697bdd1243dSDimitry Andric     assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
6985f757f3fSDimitry Andric     // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
6995f757f3fSDimitry Andric     // these should use VM_CNT.
7005f757f3fSDimitry Andric     if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
701bdd1243dSDimitry Andric       return VMEM_ACCESS;
7020fca6ea1SDimitry Andric     if (Inst.mayStore() &&
7030fca6ea1SDimitry Andric         (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
70406c3fb27SDimitry Andric       // FLAT and SCRATCH instructions may access scratch. Other VMEM
70506c3fb27SDimitry Andric       // instructions do not.
70606c3fb27SDimitry Andric       if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
70706c3fb27SDimitry Andric         return SCRATCH_WRITE_ACCESS;
708bdd1243dSDimitry Andric       return VMEM_WRITE_ACCESS;
70906c3fb27SDimitry Andric     }
7107a6dacacSDimitry Andric     if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
711bdd1243dSDimitry Andric       return VMEM_READ_ACCESS;
7127a6dacacSDimitry Andric     return VmemReadMapping[getVmemType(Inst)];
713bdd1243dSDimitry Andric   }
714bdd1243dSDimitry Andric 
715e8d8bef9SDimitry Andric   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
7160b57cec5SDimitry Andric   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
71706c3fb27SDimitry Andric   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
7180b57cec5SDimitry Andric   bool generateWaitcntInstBefore(MachineInstr &MI,
7190b57cec5SDimitry Andric                                  WaitcntBrackets &ScoreBrackets,
72081ad6265SDimitry Andric                                  MachineInstr *OldWaitcntInstr,
72181ad6265SDimitry Andric                                  bool FlushVmCnt);
72281ad6265SDimitry Andric   bool generateWaitcnt(AMDGPU::Waitcnt Wait,
72381ad6265SDimitry Andric                        MachineBasicBlock::instr_iterator It,
72481ad6265SDimitry Andric                        MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
7250b57cec5SDimitry Andric                        MachineInstr *OldWaitcntInstr);
7260b57cec5SDimitry Andric   void updateEventWaitcntAfter(MachineInstr &Inst,
7270b57cec5SDimitry Andric                                WaitcntBrackets *ScoreBrackets);
7280b57cec5SDimitry Andric   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
7290b57cec5SDimitry Andric                             WaitcntBrackets &ScoreBrackets);
7300b57cec5SDimitry Andric };
7310b57cec5SDimitry Andric 
7320b57cec5SDimitry Andric } // end anonymous namespace
7330b57cec5SDimitry Andric 
7340b57cec5SDimitry Andric RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
7350b57cec5SDimitry Andric                                             const MachineRegisterInfo *MRI,
7360b57cec5SDimitry Andric                                             const SIRegisterInfo *TRI,
7375ffd83dbSDimitry Andric                                             unsigned OpNo) const {
7380b57cec5SDimitry Andric   const MachineOperand &Op = MI->getOperand(OpNo);
739fe6060f1SDimitry Andric   if (!TRI->isInAllocatableClass(Op.getReg()))
7400b57cec5SDimitry Andric     return {-1, -1};
7410b57cec5SDimitry Andric 
7420b57cec5SDimitry Andric   // A use via a PW operand does not need a waitcnt.
7430b57cec5SDimitry Andric   // A partial write is not a WAW.
7440b57cec5SDimitry Andric   assert(!Op.getSubReg() || !Op.isUndef());
7450b57cec5SDimitry Andric 
7460b57cec5SDimitry Andric   RegInterval Result;
7470b57cec5SDimitry Andric 
7485f757f3fSDimitry Andric   unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
7495f757f3fSDimitry Andric                  AMDGPU::HWEncoding::REG_IDX_MASK;
7500b57cec5SDimitry Andric 
751fe6060f1SDimitry Andric   if (TRI->isVectorRegister(*MRI, Op.getReg())) {
7520eae32dcSDimitry Andric     assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
7530eae32dcSDimitry Andric     Result.first = Reg - Encoding.VGPR0;
754fe6060f1SDimitry Andric     if (TRI->isAGPR(*MRI, Op.getReg()))
755fe6060f1SDimitry Andric       Result.first += AGPR_OFFSET;
7560b57cec5SDimitry Andric     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
7575ffd83dbSDimitry Andric   } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
7580eae32dcSDimitry Andric     assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
7590eae32dcSDimitry Andric     Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
7600b57cec5SDimitry Andric     assert(Result.first >= NUM_ALL_VGPRS &&
7610b57cec5SDimitry Andric            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
7620b57cec5SDimitry Andric   }
7630b57cec5SDimitry Andric   // TODO: Handle TTMP
7645ffd83dbSDimitry Andric   // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
7650b57cec5SDimitry Andric   else
7660b57cec5SDimitry Andric     return {-1, -1};
7670b57cec5SDimitry Andric 
768cb14a3feSDimitry Andric   const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
7690b57cec5SDimitry Andric   unsigned Size = TRI->getRegSizeInBits(*RC);
7705ffd83dbSDimitry Andric   Result.second = Result.first + ((Size + 16) / 32);
7710b57cec5SDimitry Andric 
7720b57cec5SDimitry Andric   return Result;
7730b57cec5SDimitry Andric }
7740b57cec5SDimitry Andric 
7750b57cec5SDimitry Andric void WaitcntBrackets::setExpScore(const MachineInstr *MI,
7760b57cec5SDimitry Andric                                   const SIInstrInfo *TII,
7770b57cec5SDimitry Andric                                   const SIRegisterInfo *TRI,
7780b57cec5SDimitry Andric                                   const MachineRegisterInfo *MRI, unsigned OpNo,
7795ffd83dbSDimitry Andric                                   unsigned Val) {
780cb14a3feSDimitry Andric   RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
781fe6060f1SDimitry Andric   assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
7825ffd83dbSDimitry Andric   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
7830b57cec5SDimitry Andric     setRegScore(RegNo, EXP_CNT, Val);
7840b57cec5SDimitry Andric   }
7850b57cec5SDimitry Andric }
7860b57cec5SDimitry Andric 
7870b57cec5SDimitry Andric void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
7880b57cec5SDimitry Andric                                     const SIRegisterInfo *TRI,
7890b57cec5SDimitry Andric                                     const MachineRegisterInfo *MRI,
7900b57cec5SDimitry Andric                                     WaitEventType E, MachineInstr &Inst) {
7917a6dacacSDimitry Andric   InstCounterType T = eventCounter(WaitEventMaskForInst, E);
7927a6dacacSDimitry Andric 
7937a6dacacSDimitry Andric   unsigned UB = getScoreUB(T);
7947a6dacacSDimitry Andric   unsigned CurrScore = UB + 1;
7950b57cec5SDimitry Andric   if (CurrScore == 0)
7960b57cec5SDimitry Andric     report_fatal_error("InsertWaitcnt score wraparound");
7970b57cec5SDimitry Andric   // PendingEvents and ScoreUB need to be update regardless if this event
7980b57cec5SDimitry Andric   // changes the score of a register or not.
7990b57cec5SDimitry Andric   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
8000b57cec5SDimitry Andric   PendingEvents |= 1 << E;
8010b57cec5SDimitry Andric   setScoreUB(T, CurrScore);
8020b57cec5SDimitry Andric 
8030b57cec5SDimitry Andric   if (T == EXP_CNT) {
8040b57cec5SDimitry Andric     // Put score on the source vgprs. If this is a store, just use those
8050b57cec5SDimitry Andric     // specific register(s).
8060b57cec5SDimitry Andric     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
8070b57cec5SDimitry Andric       int AddrOpIdx =
8080b57cec5SDimitry Andric           AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
8090b57cec5SDimitry Andric       // All GDS operations must protect their address register (same as
8100b57cec5SDimitry Andric       // export.)
8110b57cec5SDimitry Andric       if (AddrOpIdx != -1) {
8120b57cec5SDimitry Andric         setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
8130b57cec5SDimitry Andric       }
8140b57cec5SDimitry Andric 
8150b57cec5SDimitry Andric       if (Inst.mayStore()) {
816bdd1243dSDimitry Andric         if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
8170b57cec5SDimitry Andric           setExpScore(
8180b57cec5SDimitry Andric               &Inst, TII, TRI, MRI,
8190b57cec5SDimitry Andric               AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
8200b57cec5SDimitry Andric               CurrScore);
8210b57cec5SDimitry Andric         }
822bdd1243dSDimitry Andric         if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
8230b57cec5SDimitry Andric           setExpScore(&Inst, TII, TRI, MRI,
8240b57cec5SDimitry Andric                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
8250b57cec5SDimitry Andric                                                  AMDGPU::OpName::data1),
8260b57cec5SDimitry Andric                       CurrScore);
8270b57cec5SDimitry Andric         }
8285f757f3fSDimitry Andric       } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
8290b57cec5SDimitry Andric                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
8300b57cec5SDimitry Andric                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
8310b57cec5SDimitry Andric                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
8320b57cec5SDimitry Andric         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
8330b57cec5SDimitry Andric           const MachineOperand &Op = Inst.getOperand(I);
834fe6060f1SDimitry Andric           if (Op.isReg() && !Op.isDef() &&
835fe6060f1SDimitry Andric               TRI->isVectorRegister(*MRI, Op.getReg())) {
8360b57cec5SDimitry Andric             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
8370b57cec5SDimitry Andric           }
8380b57cec5SDimitry Andric         }
8390b57cec5SDimitry Andric       }
8400b57cec5SDimitry Andric     } else if (TII->isFLAT(Inst)) {
8410b57cec5SDimitry Andric       if (Inst.mayStore()) {
8420b57cec5SDimitry Andric         setExpScore(
8430b57cec5SDimitry Andric             &Inst, TII, TRI, MRI,
8440b57cec5SDimitry Andric             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
8450b57cec5SDimitry Andric             CurrScore);
846fe6060f1SDimitry Andric       } else if (SIInstrInfo::isAtomicRet(Inst)) {
8470b57cec5SDimitry Andric         setExpScore(
8480b57cec5SDimitry Andric             &Inst, TII, TRI, MRI,
8490b57cec5SDimitry Andric             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
8500b57cec5SDimitry Andric             CurrScore);
8510b57cec5SDimitry Andric       }
8520b57cec5SDimitry Andric     } else if (TII->isMIMG(Inst)) {
8530b57cec5SDimitry Andric       if (Inst.mayStore()) {
8540b57cec5SDimitry Andric         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
855fe6060f1SDimitry Andric       } else if (SIInstrInfo::isAtomicRet(Inst)) {
8560b57cec5SDimitry Andric         setExpScore(
8570b57cec5SDimitry Andric             &Inst, TII, TRI, MRI,
8580b57cec5SDimitry Andric             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
8590b57cec5SDimitry Andric             CurrScore);
8600b57cec5SDimitry Andric       }
8610b57cec5SDimitry Andric     } else if (TII->isMTBUF(Inst)) {
8620b57cec5SDimitry Andric       if (Inst.mayStore()) {
8630b57cec5SDimitry Andric         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
8640b57cec5SDimitry Andric       }
8650b57cec5SDimitry Andric     } else if (TII->isMUBUF(Inst)) {
8660b57cec5SDimitry Andric       if (Inst.mayStore()) {
8670b57cec5SDimitry Andric         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
868fe6060f1SDimitry Andric       } else if (SIInstrInfo::isAtomicRet(Inst)) {
8690b57cec5SDimitry Andric         setExpScore(
8700b57cec5SDimitry Andric             &Inst, TII, TRI, MRI,
8710b57cec5SDimitry Andric             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
8720b57cec5SDimitry Andric             CurrScore);
8730b57cec5SDimitry Andric       }
87481ad6265SDimitry Andric     } else if (TII->isLDSDIR(Inst)) {
87581ad6265SDimitry Andric       // LDSDIR instructions attach the score to the destination.
87681ad6265SDimitry Andric       setExpScore(
87781ad6265SDimitry Andric           &Inst, TII, TRI, MRI,
87881ad6265SDimitry Andric           AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
87981ad6265SDimitry Andric           CurrScore);
8800b57cec5SDimitry Andric     } else {
8810b57cec5SDimitry Andric       if (TII->isEXP(Inst)) {
8820b57cec5SDimitry Andric         // For export the destination registers are really temps that
8830b57cec5SDimitry Andric         // can be used as the actual source after export patching, so
8840b57cec5SDimitry Andric         // we need to treat them like sources and set the EXP_CNT
8850b57cec5SDimitry Andric         // score.
8860b57cec5SDimitry Andric         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
8870b57cec5SDimitry Andric           MachineOperand &DefMO = Inst.getOperand(I);
8880b57cec5SDimitry Andric           if (DefMO.isReg() && DefMO.isDef() &&
8895ffd83dbSDimitry Andric               TRI->isVGPR(*MRI, DefMO.getReg())) {
890e8d8bef9SDimitry Andric             setRegScore(
891e8d8bef9SDimitry Andric                 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
892e8d8bef9SDimitry Andric                 EXP_CNT, CurrScore);
8930b57cec5SDimitry Andric           }
8940b57cec5SDimitry Andric         }
8950b57cec5SDimitry Andric       }
8960b57cec5SDimitry Andric       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
8970b57cec5SDimitry Andric         MachineOperand &MO = Inst.getOperand(I);
898fe6060f1SDimitry Andric         if (MO.isReg() && !MO.isDef() &&
899fe6060f1SDimitry Andric             TRI->isVectorRegister(*MRI, MO.getReg())) {
9000b57cec5SDimitry Andric           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
9010b57cec5SDimitry Andric         }
9020b57cec5SDimitry Andric       }
9030b57cec5SDimitry Andric     }
9045f757f3fSDimitry Andric   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
9050b57cec5SDimitry Andric     // Match the score to the destination registers.
9060b57cec5SDimitry Andric     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
9075ffd83dbSDimitry Andric       auto &Op = Inst.getOperand(I);
9085ffd83dbSDimitry Andric       if (!Op.isReg() || !Op.isDef())
9090b57cec5SDimitry Andric         continue;
910cb14a3feSDimitry Andric       RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
9117a6dacacSDimitry Andric       if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
9125ffd83dbSDimitry Andric         if (Interval.first >= NUM_ALL_VGPRS)
9135ffd83dbSDimitry Andric           continue;
914bdd1243dSDimitry Andric         if (updateVMCntOnly(Inst)) {
9155f757f3fSDimitry Andric           // updateVMCntOnly should only leave us with VGPRs
9165f757f3fSDimitry Andric           // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
9175f757f3fSDimitry Andric           // defs. That's required for a sane index into `VgprMemTypes` below
9185f757f3fSDimitry Andric           assert(TRI->isVectorRegister(*MRI, Op.getReg()));
9195ffd83dbSDimitry Andric           VmemType V = getVmemType(Inst);
9205ffd83dbSDimitry Andric           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
9215ffd83dbSDimitry Andric             VgprVmemTypes[RegNo] |= 1 << V;
9225ffd83dbSDimitry Andric         }
9235ffd83dbSDimitry Andric       }
9245ffd83dbSDimitry Andric       for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
9250b57cec5SDimitry Andric         setRegScore(RegNo, T, CurrScore);
9260b57cec5SDimitry Andric       }
9270b57cec5SDimitry Andric     }
9285f757f3fSDimitry Andric     if (Inst.mayStore() &&
9295f757f3fSDimitry Andric         (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
9305f757f3fSDimitry Andric       // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
9315f757f3fSDimitry Andric       // written can be accessed. A load from LDS to VMEM does not need a wait.
9327a6dacacSDimitry Andric       unsigned Slot = 0;
9337a6dacacSDimitry Andric       for (const auto *MemOp : Inst.memoperands()) {
9347a6dacacSDimitry Andric         if (!MemOp->isStore() ||
9357a6dacacSDimitry Andric             MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
9367a6dacacSDimitry Andric           continue;
9377a6dacacSDimitry Andric         // Comparing just AA info does not guarantee memoperands are equal
9387a6dacacSDimitry Andric         // in general, but this is so for LDS DMA in practice.
9397a6dacacSDimitry Andric         auto AAI = MemOp->getAAInfo();
9407a6dacacSDimitry Andric         // Alias scope information gives a way to definitely identify an
9417a6dacacSDimitry Andric         // original memory object and practically produced in the module LDS
9427a6dacacSDimitry Andric         // lowering pass. If there is no scope available we will not be able
9437a6dacacSDimitry Andric         // to disambiguate LDS aliasing as after the module lowering all LDS
9447a6dacacSDimitry Andric         // is squashed into a single big object. Do not attempt to use one of
9457a6dacacSDimitry Andric         // the limited LDSDMAStores for something we will not be able to use
9467a6dacacSDimitry Andric         // anyway.
9477a6dacacSDimitry Andric         if (!AAI || !AAI.Scope)
9487a6dacacSDimitry Andric           break;
9497a6dacacSDimitry Andric         for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
9507a6dacacSDimitry Andric           for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
9517a6dacacSDimitry Andric             if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
9527a6dacacSDimitry Andric               Slot = I + 1;
9537a6dacacSDimitry Andric               break;
9547a6dacacSDimitry Andric             }
9557a6dacacSDimitry Andric           }
9567a6dacacSDimitry Andric         }
9577a6dacacSDimitry Andric         if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
9587a6dacacSDimitry Andric           break;
9597a6dacacSDimitry Andric         LDSDMAStores.push_back(&Inst);
9607a6dacacSDimitry Andric         Slot = LDSDMAStores.size();
9617a6dacacSDimitry Andric         break;
9627a6dacacSDimitry Andric       }
9637a6dacacSDimitry Andric       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
9647a6dacacSDimitry Andric       if (Slot)
9650b57cec5SDimitry Andric         setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
9660b57cec5SDimitry Andric     }
9670b57cec5SDimitry Andric   }
9680b57cec5SDimitry Andric }
9690b57cec5SDimitry Andric 
9700b57cec5SDimitry Andric void WaitcntBrackets::print(raw_ostream &OS) {
9710b57cec5SDimitry Andric   OS << '\n';
9727a6dacacSDimitry Andric   for (auto T : inst_counter_types(MaxCounter)) {
973bdd1243dSDimitry Andric     unsigned SR = getScoreRange(T);
9740b57cec5SDimitry Andric 
9750b57cec5SDimitry Andric     switch (T) {
9767a6dacacSDimitry Andric     case LOAD_CNT:
9777a6dacacSDimitry Andric       OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
9787a6dacacSDimitry Andric          << SR << "): ";
9790b57cec5SDimitry Andric       break;
9807a6dacacSDimitry Andric     case DS_CNT:
9817a6dacacSDimitry Andric       OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
9827a6dacacSDimitry Andric          << SR << "): ";
9830b57cec5SDimitry Andric       break;
9840b57cec5SDimitry Andric     case EXP_CNT:
985bdd1243dSDimitry Andric       OS << "    EXP_CNT(" << SR << "): ";
9860b57cec5SDimitry Andric       break;
9877a6dacacSDimitry Andric     case STORE_CNT:
9887a6dacacSDimitry Andric       OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
9897a6dacacSDimitry Andric          << SR << "): ";
9907a6dacacSDimitry Andric       break;
9917a6dacacSDimitry Andric     case SAMPLE_CNT:
9927a6dacacSDimitry Andric       OS << "    SAMPLE_CNT(" << SR << "): ";
9937a6dacacSDimitry Andric       break;
9947a6dacacSDimitry Andric     case BVH_CNT:
9957a6dacacSDimitry Andric       OS << "    BVH_CNT(" << SR << "): ";
9967a6dacacSDimitry Andric       break;
9977a6dacacSDimitry Andric     case KM_CNT:
9987a6dacacSDimitry Andric       OS << "    KM_CNT(" << SR << "): ";
9990b57cec5SDimitry Andric       break;
10000b57cec5SDimitry Andric     default:
1001bdd1243dSDimitry Andric       OS << "    UNKNOWN(" << SR << "): ";
10020b57cec5SDimitry Andric       break;
10030b57cec5SDimitry Andric     }
10040b57cec5SDimitry Andric 
1005bdd1243dSDimitry Andric     if (SR != 0) {
10060b57cec5SDimitry Andric       // Print vgpr scores.
1007bdd1243dSDimitry Andric       unsigned LB = getScoreLB(T);
1008bdd1243dSDimitry Andric 
10095ffd83dbSDimitry Andric       for (int J = 0; J <= VgprUB; J++) {
10105ffd83dbSDimitry Andric         unsigned RegScore = getRegScore(J, T);
10110b57cec5SDimitry Andric         if (RegScore <= LB)
10120b57cec5SDimitry Andric           continue;
10135ffd83dbSDimitry Andric         unsigned RelScore = RegScore - LB - 1;
10140b57cec5SDimitry Andric         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
10150b57cec5SDimitry Andric           OS << RelScore << ":v" << J << " ";
10160b57cec5SDimitry Andric         } else {
10170b57cec5SDimitry Andric           OS << RelScore << ":ds ";
10180b57cec5SDimitry Andric         }
10190b57cec5SDimitry Andric       }
10200b57cec5SDimitry Andric       // Also need to print sgpr scores for lgkm_cnt.
10217a6dacacSDimitry Andric       if (T == SmemAccessCounter) {
10225ffd83dbSDimitry Andric         for (int J = 0; J <= SgprUB; J++) {
10237a6dacacSDimitry Andric           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
10240b57cec5SDimitry Andric           if (RegScore <= LB)
10250b57cec5SDimitry Andric             continue;
10265ffd83dbSDimitry Andric           unsigned RelScore = RegScore - LB - 1;
10270b57cec5SDimitry Andric           OS << RelScore << ":s" << J << " ";
10280b57cec5SDimitry Andric         }
10290b57cec5SDimitry Andric       }
10300b57cec5SDimitry Andric     }
10310b57cec5SDimitry Andric     OS << '\n';
10320b57cec5SDimitry Andric   }
10330b57cec5SDimitry Andric   OS << '\n';
10340b57cec5SDimitry Andric }
10350b57cec5SDimitry Andric 
/// Simplify the waitcnt, in the sense of removing redundant counts. Any
/// count in \p Wait that is guaranteed to already be satisfied (it is at
/// least as large as the number of outstanding events for its counter) is
/// reset to ~0u, which encodes "no wait required" for that counter.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
}
10470b57cec5SDimitry Andric 
1048fe6060f1SDimitry Andric void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
10490b57cec5SDimitry Andric                                       unsigned &Count) const {
1050fe6060f1SDimitry Andric   // The number of outstanding events for this type, T, can be calculated
1051fe6060f1SDimitry Andric   // as (UB - LB). If the current Count is greater than or equal to the number
1052fe6060f1SDimitry Andric   // of outstanding events, then the wait for this counter is redundant.
1053bdd1243dSDimitry Andric   if (Count >= getScoreRange(T))
10540b57cec5SDimitry Andric     Count = ~0u;
10550b57cec5SDimitry Andric }
10560b57cec5SDimitry Andric 
1057bdd1243dSDimitry Andric void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
10580b57cec5SDimitry Andric                                     AMDGPU::Waitcnt &Wait) const {
1059bdd1243dSDimitry Andric   unsigned ScoreToWait = getRegScore(RegNo, T);
1060bdd1243dSDimitry Andric 
10610b57cec5SDimitry Andric   // If the score of src_operand falls within the bracket, we need an
10620b57cec5SDimitry Andric   // s_waitcnt instruction.
10635ffd83dbSDimitry Andric   const unsigned LB = getScoreLB(T);
10645ffd83dbSDimitry Andric   const unsigned UB = getScoreUB(T);
10650b57cec5SDimitry Andric   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
10667a6dacacSDimitry Andric     if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
10670b57cec5SDimitry Andric         !ST->hasFlatLgkmVMemCountInOrder()) {
10680b57cec5SDimitry Andric       // If there is a pending FLAT operation, and this is a VMem or LGKM
10690b57cec5SDimitry Andric       // waitcnt and the target can report early completion, then we need
10700b57cec5SDimitry Andric       // to force a waitcnt 0.
10710b57cec5SDimitry Andric       addWait(Wait, T, 0);
10720b57cec5SDimitry Andric     } else if (counterOutOfOrder(T)) {
10730b57cec5SDimitry Andric       // Counter can get decremented out-of-order when there
10740b57cec5SDimitry Andric       // are multiple types event in the bracket. Also emit an s_wait counter
10750b57cec5SDimitry Andric       // with a conservative value of 0 for the counter.
10760b57cec5SDimitry Andric       addWait(Wait, T, 0);
10770b57cec5SDimitry Andric     } else {
1078480093f4SDimitry Andric       // If a counter has been maxed out avoid overflow by waiting for
1079480093f4SDimitry Andric       // MAX(CounterType) - 1 instead.
10805ffd83dbSDimitry Andric       unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1081480093f4SDimitry Andric       addWait(Wait, T, NeededWait);
10820b57cec5SDimitry Andric     }
10830b57cec5SDimitry Andric   }
10840b57cec5SDimitry Andric }
10850b57cec5SDimitry Andric 
10860b57cec5SDimitry Andric void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
10877a6dacacSDimitry Andric   applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
10880b57cec5SDimitry Andric   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
10897a6dacacSDimitry Andric   applyWaitcnt(DS_CNT, Wait.DsCnt);
10907a6dacacSDimitry Andric   applyWaitcnt(STORE_CNT, Wait.StoreCnt);
10917a6dacacSDimitry Andric   applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
10927a6dacacSDimitry Andric   applyWaitcnt(BVH_CNT, Wait.BvhCnt);
10937a6dacacSDimitry Andric   applyWaitcnt(KM_CNT, Wait.KmCnt);
10940b57cec5SDimitry Andric }
10950b57cec5SDimitry Andric 
10960b57cec5SDimitry Andric void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
10975ffd83dbSDimitry Andric   const unsigned UB = getScoreUB(T);
10980b57cec5SDimitry Andric   if (Count >= UB)
10990b57cec5SDimitry Andric     return;
11000b57cec5SDimitry Andric   if (Count != 0) {
11010b57cec5SDimitry Andric     if (counterOutOfOrder(T))
11020b57cec5SDimitry Andric       return;
11030b57cec5SDimitry Andric     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
11040b57cec5SDimitry Andric   } else {
11050b57cec5SDimitry Andric     setScoreLB(T, UB);
11060b57cec5SDimitry Andric     PendingEvents &= ~WaitEventMaskForInst[T];
11070b57cec5SDimitry Andric   }
11080b57cec5SDimitry Andric }
11090b57cec5SDimitry Andric 
11100b57cec5SDimitry Andric // Where there are multiple types of event in the bracket of a counter,
11110b57cec5SDimitry Andric // the decrement may go out of order.
11120b57cec5SDimitry Andric bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
11130b57cec5SDimitry Andric   // Scalar memory read always can go out of order.
11147a6dacacSDimitry Andric   if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
11150b57cec5SDimitry Andric     return true;
11165ffd83dbSDimitry Andric   return hasMixedPendingEvents(T);
11170b57cec5SDimitry Andric }
11180b57cec5SDimitry Andric 
// Register the pass and its analysis dependencies with the pass registry.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

// Pass identifier; the legacy pass manager keys on this object's address,
// not its value.
char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

/// Factory used when building the AMDGPU codegen pass pipeline.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
11330b57cec5SDimitry Andric 
1134bdd1243dSDimitry Andric static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1135bdd1243dSDimitry Andric                                      unsigned NewEnc) {
1136bdd1243dSDimitry Andric   int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1137bdd1243dSDimitry Andric   assert(OpIdx >= 0);
1138bdd1243dSDimitry Andric 
1139bdd1243dSDimitry Andric   MachineOperand &MO = MI.getOperand(OpIdx);
1140bdd1243dSDimitry Andric 
1141bdd1243dSDimitry Andric   if (NewEnc == MO.getImm())
1142bdd1243dSDimitry Andric     return false;
1143bdd1243dSDimitry Andric 
1144bdd1243dSDimitry Andric   MO.setImm(NewEnc);
1145bdd1243dSDimitry Andric   return true;
1146bdd1243dSDimitry Andric }
1147bdd1243dSDimitry Andric 
11487a6dacacSDimitry Andric /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
11497a6dacacSDimitry Andric /// and if so, which counter it is waiting on.
11507a6dacacSDimitry Andric static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
11517a6dacacSDimitry Andric   switch (Opcode) {
11527a6dacacSDimitry Andric   case AMDGPU::S_WAIT_LOADCNT:
11537a6dacacSDimitry Andric     return LOAD_CNT;
11547a6dacacSDimitry Andric   case AMDGPU::S_WAIT_EXPCNT:
11557a6dacacSDimitry Andric     return EXP_CNT;
11567a6dacacSDimitry Andric   case AMDGPU::S_WAIT_STORECNT:
11577a6dacacSDimitry Andric     return STORE_CNT;
11587a6dacacSDimitry Andric   case AMDGPU::S_WAIT_SAMPLECNT:
11597a6dacacSDimitry Andric     return SAMPLE_CNT;
11607a6dacacSDimitry Andric   case AMDGPU::S_WAIT_BVHCNT:
11617a6dacacSDimitry Andric     return BVH_CNT;
11627a6dacacSDimitry Andric   case AMDGPU::S_WAIT_DSCNT:
11637a6dacacSDimitry Andric     return DS_CNT;
11647a6dacacSDimitry Andric   case AMDGPU::S_WAIT_KMCNT:
11657a6dacacSDimitry Andric     return KM_CNT;
11667a6dacacSDimitry Andric   default:
11677a6dacacSDimitry Andric     return {};
11687a6dacacSDimitry Andric   }
11697a6dacacSDimitry Andric }
11707a6dacacSDimitry Andric 
11717a6dacacSDimitry Andric bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
11727a6dacacSDimitry Andric   unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
11737a6dacacSDimitry Andric   if (Opcode == Waitcnt->getOpcode())
11745f757f3fSDimitry Andric     return false;
11755f757f3fSDimitry Andric 
11767a6dacacSDimitry Andric   Waitcnt->setDesc(TII->get(Opcode));
11775f757f3fSDimitry Andric   return true;
11785f757f3fSDimitry Andric }
11795f757f3fSDimitry Andric 
/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
///
/// At most one S_WAITCNT and one S_WAITCNT_VSCNT are kept; all other wait
/// instructions in the range are folded into \p Wait and erased. The kept
/// instructions are then updated with the combined counts, promoted from
/// "soft" to real waitcnts if needed, and their effect is applied to
/// \p ScoreBrackets. \returns true if any instruction was changed or erased.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // The single wait instruction of each kind that we keep, if any.
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  // Early-inc range: instructions may be erased while iterating.
  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction())
      continue;

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    // Only soft waitcnts may be simplified/removed, and only when not at -O0.
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  if (WaitcntInstr) {
    // Re-encode the combined VM/EXP/LGKM counts into the kept S_WAITCNT and
    // record their effect, then mark those components as already handled so
    // the caller does not emit them again.
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    // Likewise for the kept S_WAITCNT_VSCNT and the store count.
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1276fe6060f1SDimitry Andric 
12777a6dacacSDimitry Andric /// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
12787a6dacacSDimitry Andric /// required counters in \p Wait
12797a6dacacSDimitry Andric bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
12807a6dacacSDimitry Andric     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
12817a6dacacSDimitry Andric     AMDGPU::Waitcnt Wait) {
12827a6dacacSDimitry Andric   assert(ST);
12837a6dacacSDimitry Andric   assert(isNormalMode(MaxCounter));
12847a6dacacSDimitry Andric 
12857a6dacacSDimitry Andric   bool Modified = false;
12867a6dacacSDimitry Andric   const DebugLoc &DL = Block.findDebugLoc(It);
12877a6dacacSDimitry Andric 
12887a6dacacSDimitry Andric   // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
12897a6dacacSDimitry Andric   // single instruction while VScnt has its own instruction.
12907a6dacacSDimitry Andric   if (Wait.hasWaitExceptStoreCnt()) {
12917a6dacacSDimitry Andric     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
12927a6dacacSDimitry Andric     [[maybe_unused]] auto SWaitInst =
12937a6dacacSDimitry Andric         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
12947a6dacacSDimitry Andric     Modified = true;
12957a6dacacSDimitry Andric 
12967a6dacacSDimitry Andric     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
12977a6dacacSDimitry Andric                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
12987a6dacacSDimitry Andric                dbgs() << "New Instr: " << *SWaitInst << '\n');
12997a6dacacSDimitry Andric   }
13007a6dacacSDimitry Andric 
13017a6dacacSDimitry Andric   if (Wait.hasWaitStoreCnt()) {
13027a6dacacSDimitry Andric     assert(ST->hasVscnt());
13037a6dacacSDimitry Andric 
13047a6dacacSDimitry Andric     [[maybe_unused]] auto SWaitInst =
13057a6dacacSDimitry Andric         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
13067a6dacacSDimitry Andric             .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
13077a6dacacSDimitry Andric             .addImm(Wait.StoreCnt);
13087a6dacacSDimitry Andric     Modified = true;
13097a6dacacSDimitry Andric 
13107a6dacacSDimitry Andric     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
13117a6dacacSDimitry Andric                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
13127a6dacacSDimitry Andric                dbgs() << "New Instr: " << *SWaitInst << '\n');
13137a6dacacSDimitry Andric   }
13147a6dacacSDimitry Andric 
13157a6dacacSDimitry Andric   return Modified;
13167a6dacacSDimitry Andric }
13177a6dacacSDimitry Andric 
/// Build a Waitcnt that waits for everything tracked pre-gfx12. VScnt is
/// only zeroed when \p IncludeVSCnt is set and the subtarget has VScnt;
/// otherwise it stays at ~0u ("no wait").
/// NOTE(review): assumes this 4-arg Waitcnt ctor takes the VScnt-style
/// count last -- confirm against the AMDGPU::Waitcnt declaration.
AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
}
13220fca6ea1SDimitry Andric 
/// Build a Waitcnt that waits for everything tracked on gfx12+. The
/// store-count component is only zeroed when \p IncludeVSCnt is set,
/// otherwise it stays at ~0u ("no wait").
/// NOTE(review): assumes the 7-arg Waitcnt ctor places the store count
/// fourth -- confirm against the AMDGPU::Waitcnt declaration.
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
}
13270fca6ea1SDimitry Andric 
13287a6dacacSDimitry Andric /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
13297a6dacacSDimitry Andric /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
13307a6dacacSDimitry Andric /// were added by previous passes. Currently this pass conservatively
13317a6dacacSDimitry Andric /// assumes that these preexisting waits are required for correctness.
13327a6dacacSDimitry Andric bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
13337a6dacacSDimitry Andric     WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
13347a6dacacSDimitry Andric     AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
13357a6dacacSDimitry Andric   assert(ST);
13367a6dacacSDimitry Andric   assert(!isNormalMode(MaxCounter));
13377a6dacacSDimitry Andric 
13387a6dacacSDimitry Andric   bool Modified = false;
13397a6dacacSDimitry Andric   MachineInstr *CombinedLoadDsCntInstr = nullptr;
13407a6dacacSDimitry Andric   MachineInstr *CombinedStoreDsCntInstr = nullptr;
13417a6dacacSDimitry Andric   MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
13427a6dacacSDimitry Andric 
13437a6dacacSDimitry Andric   for (auto &II :
13447a6dacacSDimitry Andric        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
13457a6dacacSDimitry Andric     if (II.isMetaInstruction())
13467a6dacacSDimitry Andric       continue;
13477a6dacacSDimitry Andric 
13487a6dacacSDimitry Andric     MachineInstr **UpdatableInstr;
13497a6dacacSDimitry Andric 
13507a6dacacSDimitry Andric     // Update required wait count. If this is a soft waitcnt (= it was added
13517a6dacacSDimitry Andric     // by an earlier pass), it may be entirely removed.
13527a6dacacSDimitry Andric 
13537a6dacacSDimitry Andric     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
13540fca6ea1SDimitry Andric     bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
13550fca6ea1SDimitry Andric 
13560fca6ea1SDimitry Andric     // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
13570fca6ea1SDimitry Andric     // attempt to do more than that either.
13580fca6ea1SDimitry Andric     if (Opcode == AMDGPU::S_WAITCNT)
13590fca6ea1SDimitry Andric       continue;
13607a6dacacSDimitry Andric 
13617a6dacacSDimitry Andric     if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
13627a6dacacSDimitry Andric       unsigned OldEnc =
13637a6dacacSDimitry Andric           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
13647a6dacacSDimitry Andric       AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
13650fca6ea1SDimitry Andric       if (TrySimplify)
13667a6dacacSDimitry Andric         ScoreBrackets.simplifyWaitcnt(OldWait);
13677a6dacacSDimitry Andric       Wait = Wait.combined(OldWait);
13687a6dacacSDimitry Andric       UpdatableInstr = &CombinedLoadDsCntInstr;
13697a6dacacSDimitry Andric     } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
13707a6dacacSDimitry Andric       unsigned OldEnc =
13717a6dacacSDimitry Andric           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
13727a6dacacSDimitry Andric       AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
13730fca6ea1SDimitry Andric       if (TrySimplify)
13747a6dacacSDimitry Andric         ScoreBrackets.simplifyWaitcnt(OldWait);
13757a6dacacSDimitry Andric       Wait = Wait.combined(OldWait);
13767a6dacacSDimitry Andric       UpdatableInstr = &CombinedStoreDsCntInstr;
13777a6dacacSDimitry Andric     } else {
13787a6dacacSDimitry Andric       std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
13797a6dacacSDimitry Andric       assert(CT.has_value());
13807a6dacacSDimitry Andric       unsigned OldCnt =
13817a6dacacSDimitry Andric           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
13820fca6ea1SDimitry Andric       if (TrySimplify)
13837a6dacacSDimitry Andric         ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
13847a6dacacSDimitry Andric       addWait(Wait, CT.value(), OldCnt);
13857a6dacacSDimitry Andric       UpdatableInstr = &WaitInstrs[CT.value()];
13867a6dacacSDimitry Andric     }
13877a6dacacSDimitry Andric 
13887a6dacacSDimitry Andric     // Merge consecutive waitcnt of the same type by erasing multiples.
13897a6dacacSDimitry Andric     if (!*UpdatableInstr) {
13907a6dacacSDimitry Andric       *UpdatableInstr = &II;
13917a6dacacSDimitry Andric     } else {
13927a6dacacSDimitry Andric       II.eraseFromParent();
13937a6dacacSDimitry Andric       Modified = true;
13947a6dacacSDimitry Andric     }
13957a6dacacSDimitry Andric   }
13967a6dacacSDimitry Andric 
13977a6dacacSDimitry Andric   if (CombinedLoadDsCntInstr) {
13987a6dacacSDimitry Andric     // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
13997a6dacacSDimitry Andric     // to be waited for. Otherwise, let the instruction be deleted so
14007a6dacacSDimitry Andric     // the appropriate single counter wait instruction can be inserted
14017a6dacacSDimitry Andric     // instead, when new S_WAIT_*CNT instructions are inserted by
14027a6dacacSDimitry Andric     // createNewWaitcnt(). As a side effect, resetting the wait counts will
14037a6dacacSDimitry Andric     // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
14047a6dacacSDimitry Andric     // the loop below that deals with single counter instructions.
14057a6dacacSDimitry Andric     if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
14067a6dacacSDimitry Andric       unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
14077a6dacacSDimitry Andric       Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
14087a6dacacSDimitry Andric                                            AMDGPU::OpName::simm16, NewEnc);
14097a6dacacSDimitry Andric       Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
14107a6dacacSDimitry Andric       ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
14117a6dacacSDimitry Andric       ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
14127a6dacacSDimitry Andric       Wait.LoadCnt = ~0u;
14137a6dacacSDimitry Andric       Wait.DsCnt = ~0u;
14147a6dacacSDimitry Andric 
14157a6dacacSDimitry Andric       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
14167a6dacacSDimitry Andric                      ? dbgs() << "applyPreexistingWaitcnt\n"
14177a6dacacSDimitry Andric                               << "New Instr at block end: "
14187a6dacacSDimitry Andric                               << *CombinedLoadDsCntInstr << '\n'
14197a6dacacSDimitry Andric                      : dbgs() << "applyPreexistingWaitcnt\n"
14207a6dacacSDimitry Andric                               << "Old Instr: " << *It << "New Instr: "
14217a6dacacSDimitry Andric                               << *CombinedLoadDsCntInstr << '\n');
14227a6dacacSDimitry Andric     } else {
14237a6dacacSDimitry Andric       CombinedLoadDsCntInstr->eraseFromParent();
14247a6dacacSDimitry Andric       Modified = true;
14257a6dacacSDimitry Andric     }
14267a6dacacSDimitry Andric   }
14277a6dacacSDimitry Andric 
14287a6dacacSDimitry Andric   if (CombinedStoreDsCntInstr) {
14297a6dacacSDimitry Andric     // Similarly for S_WAIT_STORECNT_DSCNT.
14307a6dacacSDimitry Andric     if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
14317a6dacacSDimitry Andric       unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
14327a6dacacSDimitry Andric       Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
14337a6dacacSDimitry Andric                                            AMDGPU::OpName::simm16, NewEnc);
14347a6dacacSDimitry Andric       Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
14357a6dacacSDimitry Andric       ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
14367a6dacacSDimitry Andric       ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
14377a6dacacSDimitry Andric       Wait.StoreCnt = ~0u;
14387a6dacacSDimitry Andric       Wait.DsCnt = ~0u;
14397a6dacacSDimitry Andric 
14407a6dacacSDimitry Andric       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
14417a6dacacSDimitry Andric                      ? dbgs() << "applyPreexistingWaitcnt\n"
14427a6dacacSDimitry Andric                               << "New Instr at block end: "
14437a6dacacSDimitry Andric                               << *CombinedStoreDsCntInstr << '\n'
14447a6dacacSDimitry Andric                      : dbgs() << "applyPreexistingWaitcnt\n"
14457a6dacacSDimitry Andric                               << "Old Instr: " << *It << "New Instr: "
14467a6dacacSDimitry Andric                               << *CombinedStoreDsCntInstr << '\n');
14477a6dacacSDimitry Andric     } else {
14487a6dacacSDimitry Andric       CombinedStoreDsCntInstr->eraseFromParent();
14497a6dacacSDimitry Andric       Modified = true;
14507a6dacacSDimitry Andric     }
14517a6dacacSDimitry Andric   }
14527a6dacacSDimitry Andric 
14537a6dacacSDimitry Andric   // Look for an opportunity to convert existing S_WAIT_LOADCNT,
14547a6dacacSDimitry Andric   // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
14557a6dacacSDimitry Andric   // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
14567a6dacacSDimitry Andric   // instructions so that createNewWaitcnt() will create new combined
14577a6dacacSDimitry Andric   // instructions to replace them.
14587a6dacacSDimitry Andric 
14597a6dacacSDimitry Andric   if (Wait.DsCnt != ~0u) {
14607a6dacacSDimitry Andric     // This is a vector of addresses in WaitInstrs pointing to instructions
14617a6dacacSDimitry Andric     // that should be removed if they are present.
14627a6dacacSDimitry Andric     SmallVector<MachineInstr **, 2> WaitsToErase;
14637a6dacacSDimitry Andric 
14647a6dacacSDimitry Andric     // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
14657a6dacacSDimitry Andric     // both) need to be waited for, ensure that there are no existing
14667a6dacacSDimitry Andric     // individual wait count instructions for these.
14677a6dacacSDimitry Andric 
14687a6dacacSDimitry Andric     if (Wait.LoadCnt != ~0u) {
14697a6dacacSDimitry Andric       WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
14707a6dacacSDimitry Andric       WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
14717a6dacacSDimitry Andric     } else if (Wait.StoreCnt != ~0u) {
14727a6dacacSDimitry Andric       WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
14737a6dacacSDimitry Andric       WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
14747a6dacacSDimitry Andric     }
14757a6dacacSDimitry Andric 
14767a6dacacSDimitry Andric     for (MachineInstr **WI : WaitsToErase) {
14777a6dacacSDimitry Andric       if (!*WI)
14787a6dacacSDimitry Andric         continue;
14797a6dacacSDimitry Andric 
14807a6dacacSDimitry Andric       (*WI)->eraseFromParent();
14817a6dacacSDimitry Andric       *WI = nullptr;
14827a6dacacSDimitry Andric       Modified = true;
14837a6dacacSDimitry Andric     }
14847a6dacacSDimitry Andric   }
14857a6dacacSDimitry Andric 
14867a6dacacSDimitry Andric   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
14877a6dacacSDimitry Andric     if (!WaitInstrs[CT])
14887a6dacacSDimitry Andric       continue;
14897a6dacacSDimitry Andric 
14907a6dacacSDimitry Andric     unsigned NewCnt = getWait(Wait, CT);
14917a6dacacSDimitry Andric     if (NewCnt != ~0u) {
14927a6dacacSDimitry Andric       Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
14937a6dacacSDimitry Andric                                            AMDGPU::OpName::simm16, NewCnt);
14947a6dacacSDimitry Andric       Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
14957a6dacacSDimitry Andric 
14967a6dacacSDimitry Andric       ScoreBrackets.applyWaitcnt(CT, NewCnt);
14977a6dacacSDimitry Andric       setNoWait(Wait, CT);
14987a6dacacSDimitry Andric 
14997a6dacacSDimitry Andric       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
15007a6dacacSDimitry Andric                      ? dbgs() << "applyPreexistingWaitcnt\n"
15017a6dacacSDimitry Andric                               << "New Instr at block end: " << *WaitInstrs[CT]
15027a6dacacSDimitry Andric                               << '\n'
15037a6dacacSDimitry Andric                      : dbgs() << "applyPreexistingWaitcnt\n"
15047a6dacacSDimitry Andric                               << "Old Instr: " << *It
15057a6dacacSDimitry Andric                               << "New Instr: " << *WaitInstrs[CT] << '\n');
15067a6dacacSDimitry Andric     } else {
15077a6dacacSDimitry Andric       WaitInstrs[CT]->eraseFromParent();
15087a6dacacSDimitry Andric       Modified = true;
15097a6dacacSDimitry Andric     }
15107a6dacacSDimitry Andric   }
15117a6dacacSDimitry Andric 
15127a6dacacSDimitry Andric   return Modified;
15137a6dacacSDimitry Andric }
15147a6dacacSDimitry Andric 
15157a6dacacSDimitry Andric /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
15167a6dacacSDimitry Andric bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
15177a6dacacSDimitry Andric     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
15187a6dacacSDimitry Andric     AMDGPU::Waitcnt Wait) {
15197a6dacacSDimitry Andric   assert(ST);
15207a6dacacSDimitry Andric   assert(!isNormalMode(MaxCounter));
15217a6dacacSDimitry Andric 
15227a6dacacSDimitry Andric   bool Modified = false;
15237a6dacacSDimitry Andric   const DebugLoc &DL = Block.findDebugLoc(It);
15247a6dacacSDimitry Andric 
15257a6dacacSDimitry Andric   // Check for opportunities to use combined wait instructions.
15267a6dacacSDimitry Andric   if (Wait.DsCnt != ~0u) {
15277a6dacacSDimitry Andric     MachineInstr *SWaitInst = nullptr;
15287a6dacacSDimitry Andric 
15297a6dacacSDimitry Andric     if (Wait.LoadCnt != ~0u) {
15307a6dacacSDimitry Andric       unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
15317a6dacacSDimitry Andric 
15327a6dacacSDimitry Andric       SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
15337a6dacacSDimitry Andric                       .addImm(Enc);
15347a6dacacSDimitry Andric 
15357a6dacacSDimitry Andric       Wait.LoadCnt = ~0u;
15367a6dacacSDimitry Andric       Wait.DsCnt = ~0u;
15377a6dacacSDimitry Andric     } else if (Wait.StoreCnt != ~0u) {
15387a6dacacSDimitry Andric       unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
15397a6dacacSDimitry Andric 
15407a6dacacSDimitry Andric       SWaitInst =
15417a6dacacSDimitry Andric           BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
15427a6dacacSDimitry Andric               .addImm(Enc);
15437a6dacacSDimitry Andric 
15447a6dacacSDimitry Andric       Wait.StoreCnt = ~0u;
15457a6dacacSDimitry Andric       Wait.DsCnt = ~0u;
15467a6dacacSDimitry Andric     }
15477a6dacacSDimitry Andric 
15487a6dacacSDimitry Andric     if (SWaitInst) {
15497a6dacacSDimitry Andric       Modified = true;
15507a6dacacSDimitry Andric 
15517a6dacacSDimitry Andric       LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
15527a6dacacSDimitry Andric                  if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
15537a6dacacSDimitry Andric                  dbgs() << "New Instr: " << *SWaitInst << '\n');
15547a6dacacSDimitry Andric     }
15557a6dacacSDimitry Andric   }
15567a6dacacSDimitry Andric 
15577a6dacacSDimitry Andric   // Generate an instruction for any remaining counter that needs
15587a6dacacSDimitry Andric   // waiting for.
15597a6dacacSDimitry Andric 
15607a6dacacSDimitry Andric   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
15617a6dacacSDimitry Andric     unsigned Count = getWait(Wait, CT);
15627a6dacacSDimitry Andric     if (Count == ~0u)
15637a6dacacSDimitry Andric       continue;
15647a6dacacSDimitry Andric 
15657a6dacacSDimitry Andric     [[maybe_unused]] auto SWaitInst =
15667a6dacacSDimitry Andric         BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
15677a6dacacSDimitry Andric             .addImm(Count);
15687a6dacacSDimitry Andric 
15697a6dacacSDimitry Andric     Modified = true;
15707a6dacacSDimitry Andric 
15717a6dacacSDimitry Andric     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
15727a6dacacSDimitry Andric                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
15737a6dacacSDimitry Andric                dbgs() << "New Instr: " << *SWaitInst << '\n');
15747a6dacacSDimitry Andric   }
15757a6dacacSDimitry Andric 
15767a6dacacSDimitry Andric   return Modified;
15777a6dacacSDimitry Andric }
15787a6dacacSDimitry Andric 
15790b57cec5SDimitry Andric static bool readsVCCZ(const MachineInstr &MI) {
15800b57cec5SDimitry Andric   unsigned Opc = MI.getOpcode();
15810b57cec5SDimitry Andric   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
15820b57cec5SDimitry Andric          !MI.getOperand(1).isUndef();
15830b57cec5SDimitry Andric }
15840b57cec5SDimitry Andric 
15850b57cec5SDimitry Andric /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
15860b57cec5SDimitry Andric static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
15870b57cec5SDimitry Andric   // Currently all conventions wait, but this may not always be the case.
15880b57cec5SDimitry Andric   //
15890b57cec5SDimitry Andric   // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
15900b57cec5SDimitry Andric   // senses to omit the wait and do it in the caller.
15910b57cec5SDimitry Andric   return true;
15920b57cec5SDimitry Andric }
15930b57cec5SDimitry Andric 
15940b57cec5SDimitry Andric /// \returns true if the callee is expected to wait for any outstanding waits
15950b57cec5SDimitry Andric /// before returning.
15960b57cec5SDimitry Andric static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
15970b57cec5SDimitry Andric   return true;
15980b57cec5SDimitry Andric }
15990b57cec5SDimitry Andric 
16000b57cec5SDimitry Andric ///  Generate s_waitcnt instruction to be placed before cur_Inst.
16010b57cec5SDimitry Andric ///  Instructions of a given type are returned in order,
16020b57cec5SDimitry Andric ///  but instructions of different types can complete out of order.
16030b57cec5SDimitry Andric ///  We rely on this in-order completion
16040b57cec5SDimitry Andric ///  and simply assign a score to the memory access instructions.
16050b57cec5SDimitry Andric ///  We keep track of the active "score bracket" to determine
16060b57cec5SDimitry Andric ///  if an access of a memory read requires an s_waitcnt
16070b57cec5SDimitry Andric ///  and if so what the value of each counter is.
16080b57cec5SDimitry Andric ///  The "score bracket" is bound by the lower bound and upper bound
16090b57cec5SDimitry Andric ///  scores (*_score_LB and *_score_ub respectively).
161081ad6265SDimitry Andric ///  If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
161181ad6265SDimitry Andric ///  flush the vmcnt counter here.
161281ad6265SDimitry Andric bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
161381ad6265SDimitry Andric                                                  WaitcntBrackets &ScoreBrackets,
161481ad6265SDimitry Andric                                                  MachineInstr *OldWaitcntInstr,
161581ad6265SDimitry Andric                                                  bool FlushVmCnt) {
16160b57cec5SDimitry Andric   setForceEmitWaitcnt();
16170b57cec5SDimitry Andric 
1618e8d8bef9SDimitry Andric   if (MI.isMetaInstruction())
16190b57cec5SDimitry Andric     return false;
16200b57cec5SDimitry Andric 
16210b57cec5SDimitry Andric   AMDGPU::Waitcnt Wait;
16220b57cec5SDimitry Andric 
1623fe6060f1SDimitry Andric   // FIXME: This should have already been handled by the memory legalizer.
1624fe6060f1SDimitry Andric   // Removing this currently doesn't affect any lit tests, but we need to
1625fe6060f1SDimitry Andric   // verify that nothing was relying on this. The number of buffer invalidates
1626fe6060f1SDimitry Andric   // being handled here should not be expanded.
16270b57cec5SDimitry Andric   if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
16280b57cec5SDimitry Andric       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
16290b57cec5SDimitry Andric       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
16300b57cec5SDimitry Andric       MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
16310b57cec5SDimitry Andric       MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
16327a6dacacSDimitry Andric     Wait.LoadCnt = 0;
16330b57cec5SDimitry Andric   }
16340b57cec5SDimitry Andric 
16350b57cec5SDimitry Andric   // All waits must be resolved at call return.
16360b57cec5SDimitry Andric   // NOTE: this could be improved with knowledge of all call sites or
16370b57cec5SDimitry Andric   //   with knowledge of the called routines.
16380b57cec5SDimitry Andric   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
163981ad6265SDimitry Andric       MI.getOpcode() == AMDGPU::SI_RETURN ||
16400b57cec5SDimitry Andric       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
16410b57cec5SDimitry Andric       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
16420fca6ea1SDimitry Andric     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
164306c3fb27SDimitry Andric   }
164406c3fb27SDimitry Andric   // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
164506c3fb27SDimitry Andric   // stores. In this case it can be useful to send a message to explicitly
164606c3fb27SDimitry Andric   // release all VGPRs before the stores have completed, but it is only safe to
16477a6dacacSDimitry Andric   // do this if:
16487a6dacacSDimitry Andric   // * there are no outstanding scratch stores
16497a6dacacSDimitry Andric   // * we are not in Dynamic VGPR mode
165006c3fb27SDimitry Andric   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
165106c3fb27SDimitry Andric            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
16520fca6ea1SDimitry Andric     if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
16537a6dacacSDimitry Andric         ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
165406c3fb27SDimitry Andric         !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
165506c3fb27SDimitry Andric       ReleaseVGPRInsts.insert(&MI);
16560b57cec5SDimitry Andric   }
16570b57cec5SDimitry Andric   // Resolve vm waits before gs-done.
16580b57cec5SDimitry Andric   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
16590b57cec5SDimitry Andric             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
166081ad6265SDimitry Andric            ST->hasLegacyGeometry() &&
166181ad6265SDimitry Andric            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
166281ad6265SDimitry Andric             AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
16637a6dacacSDimitry Andric     Wait.LoadCnt = 0;
16640b57cec5SDimitry Andric   }
16650b57cec5SDimitry Andric 
16660b57cec5SDimitry Andric   // Export & GDS instructions do not read the EXEC mask until after the export
16670b57cec5SDimitry Andric   // is granted (which can occur well after the instruction is issued).
16680b57cec5SDimitry Andric   // The shader program must flush all EXP operations on the export-count
16690b57cec5SDimitry Andric   // before overwriting the EXEC mask.
16700b57cec5SDimitry Andric   else {
16710b57cec5SDimitry Andric     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
16720b57cec5SDimitry Andric       // Export and GDS are tracked individually, either may trigger a waitcnt
16730b57cec5SDimitry Andric       // for EXEC.
16740b57cec5SDimitry Andric       if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
16750b57cec5SDimitry Andric           ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
16760b57cec5SDimitry Andric           ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
16770b57cec5SDimitry Andric           ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
16780b57cec5SDimitry Andric         Wait.ExpCnt = 0;
16790b57cec5SDimitry Andric       }
16800b57cec5SDimitry Andric     }
16810b57cec5SDimitry Andric 
16820b57cec5SDimitry Andric     if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1683480093f4SDimitry Andric       // The function is going to insert a wait on everything in its prolog.
1684480093f4SDimitry Andric       // This still needs to be careful if the call target is a load (e.g. a GOT
168581ad6265SDimitry Andric       // load). We also need to check WAW dependency with saved PC.
16860b57cec5SDimitry Andric       Wait = AMDGPU::Waitcnt();
16870b57cec5SDimitry Andric 
16880b57cec5SDimitry Andric       int CallAddrOpIdx =
16890b57cec5SDimitry Andric           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1690e8d8bef9SDimitry Andric 
1691e8d8bef9SDimitry Andric       if (MI.getOperand(CallAddrOpIdx).isReg()) {
16925ffd83dbSDimitry Andric         RegInterval CallAddrOpInterval =
1693cb14a3feSDimitry Andric             ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1694480093f4SDimitry Andric 
16955ffd83dbSDimitry Andric         for (int RegNo = CallAddrOpInterval.first;
1696480093f4SDimitry Andric              RegNo < CallAddrOpInterval.second; ++RegNo)
16977a6dacacSDimitry Andric           ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1698480093f4SDimitry Andric 
1699480093f4SDimitry Andric         int RtnAddrOpIdx =
1700480093f4SDimitry Andric           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1701480093f4SDimitry Andric         if (RtnAddrOpIdx != -1) {
17025ffd83dbSDimitry Andric           RegInterval RtnAddrOpInterval =
1703cb14a3feSDimitry Andric               ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1704480093f4SDimitry Andric 
17055ffd83dbSDimitry Andric           for (int RegNo = RtnAddrOpInterval.first;
1706480093f4SDimitry Andric                RegNo < RtnAddrOpInterval.second; ++RegNo)
17077a6dacacSDimitry Andric             ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
17080b57cec5SDimitry Andric         }
1709e8d8bef9SDimitry Andric       }
17100b57cec5SDimitry Andric     } else {
17110b57cec5SDimitry Andric       // FIXME: Should not be relying on memoperands.
17120b57cec5SDimitry Andric       // Look at the source operands of every instruction to see if
17130b57cec5SDimitry Andric       // any of them results from a previous memory operation that affects
17140b57cec5SDimitry Andric       // its current usage. If so, an s_waitcnt instruction needs to be
17150b57cec5SDimitry Andric       // emitted.
17160b57cec5SDimitry Andric       // If the source operand was defined by a load, add the s_waitcnt
17170b57cec5SDimitry Andric       // instruction.
17185ffd83dbSDimitry Andric       //
17190b57cec5SDimitry Andric       // Two cases are handled for destination operands:
17200b57cec5SDimitry Andric       // 1) If the destination operand was defined by a load, add the s_waitcnt
17210b57cec5SDimitry Andric       // instruction to guarantee the right WAW order.
17220b57cec5SDimitry Andric       // 2) If a destination operand that was used by a recent export/store ins,
17230b57cec5SDimitry Andric       // add s_waitcnt on exp_cnt to guarantee the WAR order.
17247a6dacacSDimitry Andric 
17250b57cec5SDimitry Andric       for (const MachineMemOperand *Memop : MI.memoperands()) {
1726480093f4SDimitry Andric         const Value *Ptr = Memop->getValue();
17275ffd83dbSDimitry Andric         if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
17287a6dacacSDimitry Andric           addWait(Wait, SmemAccessCounter, 0);
17295ffd83dbSDimitry Andric           if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1730480093f4SDimitry Andric             SLoadAddresses.erase(Ptr);
1731480093f4SDimitry Andric         }
17320b57cec5SDimitry Andric         unsigned AS = Memop->getAddrSpace();
173381ad6265SDimitry Andric         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
173481ad6265SDimitry Andric           continue;
173581ad6265SDimitry Andric         // No need to wait before load from VMEM to LDS.
17365f757f3fSDimitry Andric         if (TII->mayWriteLDSThroughDMA(MI))
17370b57cec5SDimitry Andric           continue;
17387a6dacacSDimitry Andric 
17397a6dacacSDimitry Andric         // LOAD_CNT is only relevant to vgpr or LDS.
17400b57cec5SDimitry Andric         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
17417a6dacacSDimitry Andric         bool FoundAliasingStore = false;
17427a6dacacSDimitry Andric         // Only objects with alias scope info were added to LDSDMAScopes array.
17437a6dacacSDimitry Andric         // In the absense of the scope info we will not be able to disambiguate
17447a6dacacSDimitry Andric         // aliasing here. There is no need to try searching for a corresponding
17457a6dacacSDimitry Andric         // store slot. This is conservatively correct because in that case we
17467a6dacacSDimitry Andric         // will produce a wait using the first (general) LDS DMA wait slot which
17477a6dacacSDimitry Andric         // will wait on all of them anyway.
17487a6dacacSDimitry Andric         if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
17497a6dacacSDimitry Andric           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17507a6dacacSDimitry Andric           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
17517a6dacacSDimitry Andric             if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
17527a6dacacSDimitry Andric               FoundAliasingStore = true;
17537a6dacacSDimitry Andric               ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
17547a6dacacSDimitry Andric             }
17557a6dacacSDimitry Andric           }
17567a6dacacSDimitry Andric         }
17577a6dacacSDimitry Andric         if (!FoundAliasingStore)
17587a6dacacSDimitry Andric           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
17595ffd83dbSDimitry Andric         if (Memop->isStore()) {
1760bdd1243dSDimitry Andric           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17610b57cec5SDimitry Andric         }
17620b57cec5SDimitry Andric       }
17635ffd83dbSDimitry Andric 
17645ffd83dbSDimitry Andric       // Loop over use and def operands.
17650b57cec5SDimitry Andric       for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
17665ffd83dbSDimitry Andric         MachineOperand &Op = MI.getOperand(I);
17675ffd83dbSDimitry Andric         if (!Op.isReg())
17685ffd83dbSDimitry Andric           continue;
1769bdd1243dSDimitry Andric 
1770bdd1243dSDimitry Andric         // If the instruction does not read tied source, skip the operand.
1771bdd1243dSDimitry Andric         if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1772bdd1243dSDimitry Andric           continue;
1773bdd1243dSDimitry Andric 
1774cb14a3feSDimitry Andric         RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
1775e8d8bef9SDimitry Andric 
1776fe6060f1SDimitry Andric         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
17775ffd83dbSDimitry Andric         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1778e8d8bef9SDimitry Andric           if (IsVGPR) {
17795ffd83dbSDimitry Andric             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
17805ffd83dbSDimitry Andric             // previous write and this write are the same type of VMEM
1781*6c4b055cSDimitry Andric             // instruction, in which case they are (in some architectures)
1782*6c4b055cSDimitry Andric             // guaranteed to write their results in order anyway.
1783bdd1243dSDimitry Andric             if (Op.isUse() || !updateVMCntOnly(MI) ||
17845ffd83dbSDimitry Andric                 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1785*6c4b055cSDimitry Andric                                                        getVmemType(MI)) ||
1786*6c4b055cSDimitry Andric                 !ST->hasVmemWriteVgprInOrder()) {
17877a6dacacSDimitry Andric               ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
17887a6dacacSDimitry Andric               ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
17897a6dacacSDimitry Andric               ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
17905ffd83dbSDimitry Andric               ScoreBrackets.clearVgprVmemTypes(RegNo);
17915ffd83dbSDimitry Andric             }
179281ad6265SDimitry Andric             if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1793bdd1243dSDimitry Andric               ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17940b57cec5SDimitry Andric             }
17957a6dacacSDimitry Andric             ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
17967a6dacacSDimitry Andric           } else {
17977a6dacacSDimitry Andric             ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
17985ffd83dbSDimitry Andric           }
17990b57cec5SDimitry Andric         }
18005ffd83dbSDimitry Andric       }
18010b57cec5SDimitry Andric     }
18020b57cec5SDimitry Andric   }
18030b57cec5SDimitry Andric 
1804bdd1243dSDimitry Andric   // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1805bdd1243dSDimitry Andric   // not, we need to ensure the subtarget is capable of backing off barrier
1806bdd1243dSDimitry Andric   // instructions in case there are any outstanding memory operations that may
1807bdd1243dSDimitry Andric   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
18083a079333SDimitry Andric   if (TII->isBarrierStart(MI.getOpcode()) &&
1809bdd1243dSDimitry Andric       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
18100fca6ea1SDimitry Andric     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
18110b57cec5SDimitry Andric   }
18120b57cec5SDimitry Andric 
18130b57cec5SDimitry Andric   // TODO: Remove this work-around, enable the assert for Bug 457939
18140b57cec5SDimitry Andric   //       after fixing the scheduler. Also, the Shader Compiler code is
18150b57cec5SDimitry Andric   //       independent of target.
18160b57cec5SDimitry Andric   if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1817bdd1243dSDimitry Andric     if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
18187a6dacacSDimitry Andric       Wait.DsCnt = 0;
18190b57cec5SDimitry Andric     }
18200b57cec5SDimitry Andric   }
18210b57cec5SDimitry Andric 
1822fe6060f1SDimitry Andric   // Verify that the wait is actually needed.
1823fe6060f1SDimitry Andric   ScoreBrackets.simplifyWaitcnt(Wait);
18240b57cec5SDimitry Andric 
18250b57cec5SDimitry Andric   if (ForceEmitZeroWaitcnts)
18260fca6ea1SDimitry Andric     Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
18270b57cec5SDimitry Andric 
18287a6dacacSDimitry Andric   if (ForceEmitWaitcnt[LOAD_CNT])
18297a6dacacSDimitry Andric     Wait.LoadCnt = 0;
18300b57cec5SDimitry Andric   if (ForceEmitWaitcnt[EXP_CNT])
18310b57cec5SDimitry Andric     Wait.ExpCnt = 0;
18327a6dacacSDimitry Andric   if (ForceEmitWaitcnt[DS_CNT])
18337a6dacacSDimitry Andric     Wait.DsCnt = 0;
18347a6dacacSDimitry Andric   if (ForceEmitWaitcnt[SAMPLE_CNT])
18357a6dacacSDimitry Andric     Wait.SampleCnt = 0;
18367a6dacacSDimitry Andric   if (ForceEmitWaitcnt[BVH_CNT])
18377a6dacacSDimitry Andric     Wait.BvhCnt = 0;
18387a6dacacSDimitry Andric   if (ForceEmitWaitcnt[KM_CNT])
18397a6dacacSDimitry Andric     Wait.KmCnt = 0;
18400b57cec5SDimitry Andric 
184181ad6265SDimitry Andric   if (FlushVmCnt) {
18427a6dacacSDimitry Andric     if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
18437a6dacacSDimitry Andric       Wait.LoadCnt = 0;
18447a6dacacSDimitry Andric     if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
18457a6dacacSDimitry Andric       Wait.SampleCnt = 0;
18467a6dacacSDimitry Andric     if (ScoreBrackets.hasPendingEvent(BVH_CNT))
18477a6dacacSDimitry Andric       Wait.BvhCnt = 0;
184881ad6265SDimitry Andric   }
184981ad6265SDimitry Andric 
185081ad6265SDimitry Andric   return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
185181ad6265SDimitry Andric                          OldWaitcntInstr);
185281ad6265SDimitry Andric }
185381ad6265SDimitry Andric 
185481ad6265SDimitry Andric bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
185581ad6265SDimitry Andric                                        MachineBasicBlock::instr_iterator It,
185681ad6265SDimitry Andric                                        MachineBasicBlock &Block,
185781ad6265SDimitry Andric                                        WaitcntBrackets &ScoreBrackets,
185881ad6265SDimitry Andric                                        MachineInstr *OldWaitcntInstr) {
185981ad6265SDimitry Andric   bool Modified = false;
186081ad6265SDimitry Andric 
186181ad6265SDimitry Andric   if (OldWaitcntInstr)
1862fe6060f1SDimitry Andric     // Try to merge the required wait with preexisting waitcnt instructions.
1863fe6060f1SDimitry Andric     // Also erase redundant waitcnt.
1864fe6060f1SDimitry Andric     Modified =
18657a6dacacSDimitry Andric         WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
18667a6dacacSDimitry Andric 
18677a6dacacSDimitry Andric   // Any counts that could have been applied to any existing waitcnt
18687a6dacacSDimitry Andric   // instructions will have been done so, now deal with any remaining.
1869fe6060f1SDimitry Andric   ScoreBrackets.applyWaitcnt(Wait);
187081ad6265SDimitry Andric 
187181ad6265SDimitry Andric   // ExpCnt can be merged into VINTERP.
187281ad6265SDimitry Andric   if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
187381ad6265SDimitry Andric       SIInstrInfo::isVINTERP(*It)) {
187481ad6265SDimitry Andric     MachineOperand *WaitExp =
187581ad6265SDimitry Andric         TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
187681ad6265SDimitry Andric     if (Wait.ExpCnt < WaitExp->getImm()) {
187781ad6265SDimitry Andric       WaitExp->setImm(Wait.ExpCnt);
187881ad6265SDimitry Andric       Modified = true;
187981ad6265SDimitry Andric     }
188081ad6265SDimitry Andric     Wait.ExpCnt = ~0u;
188181ad6265SDimitry Andric 
18827a6dacacSDimitry Andric     LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
188381ad6265SDimitry Andric                       << "Update Instr: " << *It);
18840b57cec5SDimitry Andric   }
18850b57cec5SDimitry Andric 
18867a6dacacSDimitry Andric   if (WCG->createNewWaitcnt(Block, It, Wait))
18870b57cec5SDimitry Andric     Modified = true;
18880b57cec5SDimitry Andric 
18890b57cec5SDimitry Andric   return Modified;
18900b57cec5SDimitry Andric }
18910b57cec5SDimitry Andric 
1892e8d8bef9SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens other
1893e8d8bef9SDimitry Andric // than LDS. Other address spaces supported by flat memory operations involve
1894e8d8bef9SDimitry Andric // global memory.
1895e8d8bef9SDimitry Andric bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1896e8d8bef9SDimitry Andric   assert(TII->isFLAT(MI));
1897e8d8bef9SDimitry Andric 
1898e8d8bef9SDimitry Andric   // All flat instructions use the VMEM counter.
1899e8d8bef9SDimitry Andric   assert(TII->usesVM_CNT(MI));
1900e8d8bef9SDimitry Andric 
1901e8d8bef9SDimitry Andric   // If there are no memory operands then conservatively assume the flat
1902e8d8bef9SDimitry Andric   // operation may access VMEM.
19030b57cec5SDimitry Andric   if (MI.memoperands_empty())
19040b57cec5SDimitry Andric     return true;
19050b57cec5SDimitry Andric 
1906e8d8bef9SDimitry Andric   // See if any memory operand specifies an address space that involves VMEM.
1907e8d8bef9SDimitry Andric   // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1908e8d8bef9SDimitry Andric   // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1909e8d8bef9SDimitry Andric   // (GDS) address space is not supported by flat operations. Therefore, simply
1910e8d8bef9SDimitry Andric   // return true unless only the LDS address space is found.
1911e8d8bef9SDimitry Andric   for (const MachineMemOperand *Memop : MI.memoperands()) {
1912e8d8bef9SDimitry Andric     unsigned AS = Memop->getAddrSpace();
1913e8d8bef9SDimitry Andric     assert(AS != AMDGPUAS::REGION_ADDRESS);
1914e8d8bef9SDimitry Andric     if (AS != AMDGPUAS::LOCAL_ADDRESS)
1915e8d8bef9SDimitry Andric       return true;
1916e8d8bef9SDimitry Andric   }
1917e8d8bef9SDimitry Andric 
1918e8d8bef9SDimitry Andric   return false;
1919e8d8bef9SDimitry Andric }
1920e8d8bef9SDimitry Andric 
1921e8d8bef9SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens for
1922e8d8bef9SDimitry Andric // either LDS or FLAT.
1923e8d8bef9SDimitry Andric bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1924e8d8bef9SDimitry Andric   assert(TII->isFLAT(MI));
1925e8d8bef9SDimitry Andric 
1926e8d8bef9SDimitry Andric   // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1927e8d8bef9SDimitry Andric   if (!TII->usesLGKM_CNT(MI))
1928e8d8bef9SDimitry Andric     return false;
1929e8d8bef9SDimitry Andric 
1930fe6060f1SDimitry Andric   // If in tgsplit mode then there can be no use of LDS.
1931fe6060f1SDimitry Andric   if (ST->isTgSplitEnabled())
1932fe6060f1SDimitry Andric     return false;
1933fe6060f1SDimitry Andric 
1934e8d8bef9SDimitry Andric   // If there are no memory operands then conservatively assume the flat
1935e8d8bef9SDimitry Andric   // operation may access LDS.
1936e8d8bef9SDimitry Andric   if (MI.memoperands_empty())
1937e8d8bef9SDimitry Andric     return true;
1938e8d8bef9SDimitry Andric 
1939e8d8bef9SDimitry Andric   // See if any memory operand specifies an address space that involves LDS.
19400b57cec5SDimitry Andric   for (const MachineMemOperand *Memop : MI.memoperands()) {
19410b57cec5SDimitry Andric     unsigned AS = Memop->getAddrSpace();
19420b57cec5SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
19430b57cec5SDimitry Andric       return true;
19440b57cec5SDimitry Andric   }
19450b57cec5SDimitry Andric 
19460b57cec5SDimitry Andric   return false;
19470b57cec5SDimitry Andric }
19480b57cec5SDimitry Andric 
194906c3fb27SDimitry Andric // This is a flat memory operation. Check to see if it has memory tokens for
195006c3fb27SDimitry Andric // either scratch or FLAT.
195106c3fb27SDimitry Andric bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
195206c3fb27SDimitry Andric     const MachineInstr &MI) const {
195306c3fb27SDimitry Andric   assert(TII->isFLAT(MI));
195406c3fb27SDimitry Andric 
195506c3fb27SDimitry Andric   // SCRATCH instructions always access scratch.
195606c3fb27SDimitry Andric   if (TII->isFLATScratch(MI))
195706c3fb27SDimitry Andric     return true;
195806c3fb27SDimitry Andric 
195906c3fb27SDimitry Andric   // GLOBAL instructions never access scratch.
196006c3fb27SDimitry Andric   if (TII->isFLATGlobal(MI))
196106c3fb27SDimitry Andric     return false;
196206c3fb27SDimitry Andric 
196306c3fb27SDimitry Andric   // If there are no memory operands then conservatively assume the flat
196406c3fb27SDimitry Andric   // operation may access scratch.
196506c3fb27SDimitry Andric   if (MI.memoperands_empty())
196606c3fb27SDimitry Andric     return true;
196706c3fb27SDimitry Andric 
196806c3fb27SDimitry Andric   // See if any memory operand specifies an address space that involves scratch.
196906c3fb27SDimitry Andric   return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
197006c3fb27SDimitry Andric     unsigned AS = Memop->getAddrSpace();
197106c3fb27SDimitry Andric     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
197206c3fb27SDimitry Andric   });
197306c3fb27SDimitry Andric }
197406c3fb27SDimitry Andric 
19751db9f3b2SDimitry Andric static bool isCacheInvOrWBInst(MachineInstr &Inst) {
19761db9f3b2SDimitry Andric   auto Opc = Inst.getOpcode();
19771db9f3b2SDimitry Andric   return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
19781db9f3b2SDimitry Andric          Opc == AMDGPU::GLOBAL_WBINV;
19791db9f3b2SDimitry Andric }
19801db9f3b2SDimitry Andric 
// Record in \p ScoreBrackets any counter-relevant event produced by \p Inst
// (memory access, export, message, call, ...), so that subsequent
// instructions know which outstanding operations they may need to wait on.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    // DS instructions: distinguish GDS from LDS accesses, since GDS also
    // locks the GPRs used by the access.
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    // Cache invalidate/write-back pseudos are not tracked yet.
    // TODO: Track this properly.
    if (isCacheInvOrWBInst(Inst))
      return;

    assert(Inst.mayLoadOrStore());

    // Count how many address spaces this flat operation may touch; a generic
    // flat access can hit both VMEM and LDS.
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that access both VMEM and LDS, so note it
    // - it will require that both the VM and LGKM be flushed to zero if it is
    // pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                 Inst);

    // On targets where VMEM writes also count against expcnt, record the
    // GPR-lock event for stores and returning atomics.
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(
          WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    // VINTERP carries an explicit waitexp immediate; apply it directly.
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify the export by its target: parameter, position, or other
    // (e.g. color/MRT), which use distinct event kinds.
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      // These scalar instructions are tracked via the SMEM counter.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}
20780b57cec5SDimitry Andric 
20795ffd83dbSDimitry Andric bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
20805ffd83dbSDimitry Andric                                  unsigned OtherScore) {
20815ffd83dbSDimitry Andric   unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
20825ffd83dbSDimitry Andric   unsigned OtherShifted =
20830b57cec5SDimitry Andric       OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
20840b57cec5SDimitry Andric   Score = std::max(MyShifted, OtherShifted);
20850b57cec5SDimitry Andric   return OtherShifted > MyShifted;
20860b57cec5SDimitry Andric }
20870b57cec5SDimitry Andric 
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // The merged state must cover the registers tracked by either side.
  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types(MaxCounter)) {
    // Merge event flags for this counter
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    // An event pending in Other but not here means tighter waits are needed.
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    // Keep our lower bound and extend the upper bound so the merged range can
    // hold the larger of the two pending spans.
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    // The shifts rebase each side's scores so that both upper bounds land on
    // NewUB before mergeScore takes the per-entry maximum.
    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

    // SGPR scores are only tracked for the counter that covers SMEM accesses.
    if (T == SmemAccessCounter) {
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
    }
  }

  // Union the per-VGPR VMEM access-type masks; any newly-set bit means the
  // merged state is strictly stronger.
  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
21410b57cec5SDimitry Andric 
2142bdd1243dSDimitry Andric static bool isWaitInstr(MachineInstr &Inst) {
21437a6dacacSDimitry Andric   unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
21447a6dacacSDimitry Andric   return Opcode == AMDGPU::S_WAITCNT ||
21457a6dacacSDimitry Andric          (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
21467a6dacacSDimitry Andric           Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
21477a6dacacSDimitry Andric          Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
21487a6dacacSDimitry Andric          Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
21497a6dacacSDimitry Andric          counterTypeForInstr(Opcode).has_value();
2150bdd1243dSDimitry Andric }
2151bdd1243dSDimitry Andric 
// Generate s_waitcnt instructions where needed.
//
// Walks \p Block once, inserting waits before instructions that need them,
// tracking counter state in \p ScoreBrackets, maintaining vccz correctness on
// targets with vccz hazards, and optionally flushing vmcnt-style counters at
// the end of loop preheaders. Returns true if the block was modified.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst)) {
      // Only remember the first of a run of waits; the generator will combine
      // or erase the whole run when it next emits.
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // If this is the terminator of a loop preheader we decided to flush,
    // request a vmcnt flush at this point.
    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
          Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    // Record the events this instruction produces for later waits.
    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // In precise-memory mode, wait for each memory operation immediately
    // after issuing it (stores that don't return a value only need the
    // store-side counters).
    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
      ScoreBrackets.simplifyWaitcnt(Wait);
      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
    }

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
  // needed.
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets)) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  return Modified;
}
22980b57cec5SDimitry Andric 
229981ad6265SDimitry Andric // Return true if the given machine basic block is a preheader of a loop in
230081ad6265SDimitry Andric // which we want to flush the vmcnt counter, and false otherwise.
230181ad6265SDimitry Andric bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
230281ad6265SDimitry Andric                                           WaitcntBrackets &ScoreBrackets) {
23035f757f3fSDimitry Andric   auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
23045f757f3fSDimitry Andric   if (!IsInserted)
23055f757f3fSDimitry Andric     return Iterator->second;
230681ad6265SDimitry Andric 
230781ad6265SDimitry Andric   MachineBasicBlock *Succ = MBB.getSingleSuccessor();
230881ad6265SDimitry Andric   if (!Succ)
23095f757f3fSDimitry Andric     return false;
231081ad6265SDimitry Andric 
231181ad6265SDimitry Andric   MachineLoop *Loop = MLI->getLoopFor(Succ);
231281ad6265SDimitry Andric   if (!Loop)
23135f757f3fSDimitry Andric     return false;
231481ad6265SDimitry Andric 
23155f757f3fSDimitry Andric   if (Loop->getLoopPreheader() == &MBB &&
23165f757f3fSDimitry Andric       shouldFlushVmCnt(Loop, ScoreBrackets)) {
23175f757f3fSDimitry Andric     Iterator->second = true;
23185f757f3fSDimitry Andric     return true;
23195f757f3fSDimitry Andric   }
232081ad6265SDimitry Andric 
23215f757f3fSDimitry Andric   return false;
232281ad6265SDimitry Andric }
232381ad6265SDimitry Andric 
232406c3fb27SDimitry Andric bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
232506c3fb27SDimitry Andric   return SIInstrInfo::isVMEM(MI) ||
232606c3fb27SDimitry Andric          (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
232706c3fb27SDimitry Andric }
232806c3fb27SDimitry Andric 
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  // VGPRs read in the loop / VGPRs defined by in-loop VMEM loads. A register
  // appearing in both sets means an in-loop load feeds an in-loop use, which
  // defeats both flush criteria.
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        // Only vector registers participate in the vmcnt analysis.
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(RegNo))
              return false;
            VgprUse.insert(RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(RegNo, LOAD_CNT) >
                    Brackets.getScoreLB(LOAD_CNT) ||
                Brackets.getRegScore(RegNo, SAMPLE_CNT) >
                    Brackets.getScoreLB(SAMPLE_CNT) ||
                Brackets.getRegScore(RegNo, BVH_CNT) >
                    Brackets.getScoreLB(BVH_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
      }
    }
  }
  // Criterion 1: store-only loops, only worthwhile without a vscnt counter.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Criterion 2: loads whose results are unused in the loop, provided the
  // target retires VMEM vgpr writes in order.
  return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
239581ad6265SDimitry Andric 
23960b57cec5SDimitry Andric bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
23970b57cec5SDimitry Andric   ST = &MF.getSubtarget<GCNSubtarget>();
23980b57cec5SDimitry Andric   TII = ST->getInstrInfo();
23990b57cec5SDimitry Andric   TRI = &TII->getRegisterInfo();
24000b57cec5SDimitry Andric   MRI = &MF.getRegInfo();
24010b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
24020fca6ea1SDimitry Andric   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
24030fca6ea1SDimitry Andric   PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
24047a6dacacSDimitry Andric   if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
24057a6dacacSDimitry Andric     AA = &AAR->getAAResults();
24067a6dacacSDimitry Andric 
24077a6dacacSDimitry Andric   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
24087a6dacacSDimitry Andric 
24097a6dacacSDimitry Andric   if (ST->hasExtendedWaitCounts()) {
24107a6dacacSDimitry Andric     MaxCounter = NUM_EXTENDED_INST_CNTS;
24110fca6ea1SDimitry Andric     WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
24127a6dacacSDimitry Andric     WCG = &WCGGFX12Plus;
24137a6dacacSDimitry Andric   } else {
24147a6dacacSDimitry Andric     MaxCounter = NUM_NORMAL_INST_CNTS;
24150fca6ea1SDimitry Andric     WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
24167a6dacacSDimitry Andric     WCG = &WCGPreGFX12;
24177a6dacacSDimitry Andric   }
24180b57cec5SDimitry Andric 
24190b57cec5SDimitry Andric   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
24200b57cec5SDimitry Andric   for (auto T : inst_counter_types())
24210b57cec5SDimitry Andric     ForceEmitWaitcnt[T] = false;
24220b57cec5SDimitry Andric 
24237a6dacacSDimitry Andric   const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
24247a6dacacSDimitry Andric 
24257a6dacacSDimitry Andric   SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
24267a6dacacSDimitry Andric 
24270eae32dcSDimitry Andric   HardwareLimits Limits = {};
24287a6dacacSDimitry Andric   if (ST->hasExtendedWaitCounts()) {
24297a6dacacSDimitry Andric     Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
24307a6dacacSDimitry Andric     Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
24317a6dacacSDimitry Andric   } else {
24327a6dacacSDimitry Andric     Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
24337a6dacacSDimitry Andric     Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
24347a6dacacSDimitry Andric   }
24350eae32dcSDimitry Andric   Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
24367a6dacacSDimitry Andric   Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
24377a6dacacSDimitry Andric   Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
24387a6dacacSDimitry Andric   Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
24397a6dacacSDimitry Andric   Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
24400b57cec5SDimitry Andric 
24415ffd83dbSDimitry Andric   unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
24425ffd83dbSDimitry Andric   unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
24435ffd83dbSDimitry Andric   assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
24445ffd83dbSDimitry Andric   assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
24450b57cec5SDimitry Andric 
24460eae32dcSDimitry Andric   RegisterEncoding Encoding = {};
24475f757f3fSDimitry Andric   Encoding.VGPR0 =
24485f757f3fSDimitry Andric       TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
24490eae32dcSDimitry Andric   Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
24505f757f3fSDimitry Andric   Encoding.SGPR0 =
24515f757f3fSDimitry Andric       TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
24520eae32dcSDimitry Andric   Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
24530b57cec5SDimitry Andric 
24540b57cec5SDimitry Andric   BlockInfos.clear();
2455fe6060f1SDimitry Andric   bool Modified = false;
2456fe6060f1SDimitry Andric 
24577a6dacacSDimitry Andric   MachineBasicBlock &EntryBB = MF.front();
24587a6dacacSDimitry Andric   MachineBasicBlock::iterator I = EntryBB.begin();
24597a6dacacSDimitry Andric 
2460fe6060f1SDimitry Andric   if (!MFI->isEntryFunction()) {
2461fe6060f1SDimitry Andric     // Wait for any outstanding memory operations that the input registers may
2462fe6060f1SDimitry Andric     // depend on. We can't track them and it's better to do the wait after the
2463fe6060f1SDimitry Andric     // costly call sequence.
2464fe6060f1SDimitry Andric 
2465fe6060f1SDimitry Andric     // TODO: Could insert earlier and schedule more liberally with operations
2466fe6060f1SDimitry Andric     // that only use caller preserved registers.
2467fe6060f1SDimitry Andric     for (MachineBasicBlock::iterator E = EntryBB.end();
2468fe6060f1SDimitry Andric          I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2469fe6060f1SDimitry Andric       ;
2470fe6060f1SDimitry Andric 
24717a6dacacSDimitry Andric     if (ST->hasExtendedWaitCounts()) {
24727a6dacacSDimitry Andric       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
24737a6dacacSDimitry Andric           .addImm(0);
24747a6dacacSDimitry Andric       for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
24757a6dacacSDimitry Andric         if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
24767a6dacacSDimitry Andric           continue;
24777a6dacacSDimitry Andric 
24787a6dacacSDimitry Andric         BuildMI(EntryBB, I, DebugLoc(),
24797a6dacacSDimitry Andric                 TII->get(instrsForExtendedCounterTypes[CT]))
24807a6dacacSDimitry Andric             .addImm(0);
24817a6dacacSDimitry Andric       }
24827a6dacacSDimitry Andric     } else {
24837a6dacacSDimitry Andric       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
24847a6dacacSDimitry Andric     }
24857a6dacacSDimitry Andric 
24867a6dacacSDimitry Andric     auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
24877a6dacacSDimitry Andric         ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
24887a6dacacSDimitry Andric         SmemAccessCounter);
2489297eecfbSDimitry Andric     NonKernelInitialState->setStateOnFunctionEntryOrReturn();
24905f757f3fSDimitry Andric     BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
24915f757f3fSDimitry Andric 
2492fe6060f1SDimitry Andric     Modified = true;
2493fe6060f1SDimitry Andric   }
24940b57cec5SDimitry Andric 
24950b57cec5SDimitry Andric   // Keep iterating over the blocks in reverse post order, inserting and
24960b57cec5SDimitry Andric   // updating s_waitcnt where needed, until a fix point is reached.
24975ffd83dbSDimitry Andric   for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
249806c3fb27SDimitry Andric     BlockInfos.insert({MBB, BlockInfo()});
24990b57cec5SDimitry Andric 
25000b57cec5SDimitry Andric   std::unique_ptr<WaitcntBrackets> Brackets;
25010b57cec5SDimitry Andric   bool Repeat;
25020b57cec5SDimitry Andric   do {
25030b57cec5SDimitry Andric     Repeat = false;
25040b57cec5SDimitry Andric 
25055ffd83dbSDimitry Andric     for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
25065ffd83dbSDimitry Andric          ++BII) {
250706c3fb27SDimitry Andric       MachineBasicBlock *MBB = BII->first;
25085ffd83dbSDimitry Andric       BlockInfo &BI = BII->second;
25090b57cec5SDimitry Andric       if (!BI.Dirty)
25100b57cec5SDimitry Andric         continue;
25110b57cec5SDimitry Andric 
25120b57cec5SDimitry Andric       if (BI.Incoming) {
25130b57cec5SDimitry Andric         if (!Brackets)
25148bcb0991SDimitry Andric           Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
25150b57cec5SDimitry Andric         else
25160b57cec5SDimitry Andric           *Brackets = *BI.Incoming;
25170b57cec5SDimitry Andric       } else {
25180b57cec5SDimitry Andric         if (!Brackets)
25197a6dacacSDimitry Andric           Brackets = std::make_unique<WaitcntBrackets>(
25207a6dacacSDimitry Andric               ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
25217a6dacacSDimitry Andric               SmemAccessCounter);
25220b57cec5SDimitry Andric         else
25237a6dacacSDimitry Andric           *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
25247a6dacacSDimitry Andric                                       WaitEventMaskForInst, SmemAccessCounter);
25250b57cec5SDimitry Andric       }
25260b57cec5SDimitry Andric 
252706c3fb27SDimitry Andric       Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
25280b57cec5SDimitry Andric       BI.Dirty = false;
25290b57cec5SDimitry Andric 
2530bdd1243dSDimitry Andric       if (Brackets->hasPendingEvent()) {
25310b57cec5SDimitry Andric         BlockInfo *MoveBracketsToSucc = nullptr;
253206c3fb27SDimitry Andric         for (MachineBasicBlock *Succ : MBB->successors()) {
25335ffd83dbSDimitry Andric           auto SuccBII = BlockInfos.find(Succ);
25345ffd83dbSDimitry Andric           BlockInfo &SuccBI = SuccBII->second;
25350b57cec5SDimitry Andric           if (!SuccBI.Incoming) {
25360b57cec5SDimitry Andric             SuccBI.Dirty = true;
25375ffd83dbSDimitry Andric             if (SuccBII <= BII)
25380b57cec5SDimitry Andric               Repeat = true;
25390b57cec5SDimitry Andric             if (!MoveBracketsToSucc) {
25400b57cec5SDimitry Andric               MoveBracketsToSucc = &SuccBI;
25410b57cec5SDimitry Andric             } else {
25428bcb0991SDimitry Andric               SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
25430b57cec5SDimitry Andric             }
25440b57cec5SDimitry Andric           } else if (SuccBI.Incoming->merge(*Brackets)) {
25450b57cec5SDimitry Andric             SuccBI.Dirty = true;
25465ffd83dbSDimitry Andric             if (SuccBII <= BII)
25470b57cec5SDimitry Andric               Repeat = true;
25480b57cec5SDimitry Andric           }
25490b57cec5SDimitry Andric         }
25500b57cec5SDimitry Andric         if (MoveBracketsToSucc)
25510b57cec5SDimitry Andric           MoveBracketsToSucc->Incoming = std::move(Brackets);
25520b57cec5SDimitry Andric       }
25530b57cec5SDimitry Andric     }
25540b57cec5SDimitry Andric   } while (Repeat);
25550b57cec5SDimitry Andric 
25560eae32dcSDimitry Andric   if (ST->hasScalarStores()) {
25570b57cec5SDimitry Andric     SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
25580b57cec5SDimitry Andric     bool HaveScalarStores = false;
25590b57cec5SDimitry Andric 
2560349cc55cSDimitry Andric     for (MachineBasicBlock &MBB : MF) {
2561349cc55cSDimitry Andric       for (MachineInstr &MI : MBB) {
2562349cc55cSDimitry Andric         if (!HaveScalarStores && TII->isScalarStore(MI))
25630b57cec5SDimitry Andric           HaveScalarStores = true;
25640b57cec5SDimitry Andric 
2565349cc55cSDimitry Andric         if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2566349cc55cSDimitry Andric             MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
25670b57cec5SDimitry Andric           EndPgmBlocks.push_back(&MBB);
25680b57cec5SDimitry Andric       }
25690b57cec5SDimitry Andric     }
25700b57cec5SDimitry Andric 
25710b57cec5SDimitry Andric     if (HaveScalarStores) {
25720b57cec5SDimitry Andric       // If scalar writes are used, the cache must be flushed or else the next
25730b57cec5SDimitry Andric       // wave to reuse the same scratch memory can be clobbered.
25740b57cec5SDimitry Andric       //
25750b57cec5SDimitry Andric       // Insert s_dcache_wb at wave termination points if there were any scalar
25760eae32dcSDimitry Andric       // stores, and only if the cache hasn't already been flushed. This could
25770eae32dcSDimitry Andric       // be improved by looking across blocks for flushes in postdominating
25780eae32dcSDimitry Andric       // blocks from the stores but an explicitly requested flush is probably
25790eae32dcSDimitry Andric       // very rare.
25800b57cec5SDimitry Andric       for (MachineBasicBlock *MBB : EndPgmBlocks) {
25810b57cec5SDimitry Andric         bool SeenDCacheWB = false;
25820b57cec5SDimitry Andric 
25830eae32dcSDimitry Andric         for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
25840eae32dcSDimitry Andric              I != E; ++I) {
25850b57cec5SDimitry Andric           if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
25860b57cec5SDimitry Andric             SeenDCacheWB = true;
25870b57cec5SDimitry Andric           else if (TII->isScalarStore(*I))
25880b57cec5SDimitry Andric             SeenDCacheWB = false;
25890b57cec5SDimitry Andric 
25900b57cec5SDimitry Andric           // FIXME: It would be better to insert this before a waitcnt if any.
25910b57cec5SDimitry Andric           if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
25920b57cec5SDimitry Andric                I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
25930b57cec5SDimitry Andric               !SeenDCacheWB) {
25940b57cec5SDimitry Andric             Modified = true;
25950b57cec5SDimitry Andric             BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
25960b57cec5SDimitry Andric           }
25970b57cec5SDimitry Andric         }
25980b57cec5SDimitry Andric       }
25990b57cec5SDimitry Andric     }
26000eae32dcSDimitry Andric   }
26010b57cec5SDimitry Andric 
260206c3fb27SDimitry Andric   // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
260306c3fb27SDimitry Andric   // instructions.
260406c3fb27SDimitry Andric   for (MachineInstr *MI : ReleaseVGPRInsts) {
260506c3fb27SDimitry Andric     if (ST->requiresNopBeforeDeallocVGPRs()) {
26060fca6ea1SDimitry Andric       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
260706c3fb27SDimitry Andric           .addImm(0);
260806c3fb27SDimitry Andric     }
26090fca6ea1SDimitry Andric     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
26100fca6ea1SDimitry Andric             TII->get(AMDGPU::S_SENDMSG))
261106c3fb27SDimitry Andric         .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
261206c3fb27SDimitry Andric     Modified = true;
261306c3fb27SDimitry Andric   }
261406c3fb27SDimitry Andric   ReleaseVGPRInsts.clear();
261506c3fb27SDimitry Andric 
26160b57cec5SDimitry Andric   return Modified;
26170b57cec5SDimitry Andric }
2618