Lines Matching +full:wait +full:- +full:on +full:- +full:write
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Insert wait instructions for memory reads and writes.
17 /// fine-grained approach that keeps one timeline per event type could
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
24 //===----------------------------------------------------------------------===//
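
// Illustrative sketch (standalone, hypothetical names): what the single
// shared-timeline model above means for lgkmcnt. With only LDS in flight the
// pass could wait with lgkmcnt(N) for the LDS op that has N newer LDS ops
// behind it; once SMEM shares the counter, the current one-timeline-per-counter
// model falls back to the conservative lgkmcnt(0).
struct LgkmTimeline {
  unsigned PendingLds = 0;  // outstanding LDS ops on the shared lgkm counter
  unsigned PendingSmem = 0; // outstanding SMEM ops on the same counter

  // Count to wait for before using the LDS op with N newer LDS ops behind it.
  unsigned ldsWait(unsigned N) const {
    return PendingSmem == 0 ? N : 0; // mixed event types => flush to zero
  }
};
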
42 #define DEBUG_TYPE "si-insert-waitcnts"
44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
52 "amdgpu-waitcnt-forcezero",
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
115 LDS_ACCESS, // lds read & write
116 GDS_ACCESS, // gds read & write
118 SMEM_ACCESS, // scalar-memory read & write
119 EXP_GPR_LOCK, // export holding on its data src
120 GDS_GPR_LOCK, // GDS holding on its data and addr src
121 EXP_POS_ACCESS, // write to export position
122 EXP_PARAM_ACCESS, // write to export parameter
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
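
// Illustrative sketch of the flat scoreboard index space described above; the
// constant values are placeholders, not necessarily the ones used by the pass.
constexpr unsigned SQ_MAX_PGM_VGPRS = 512; // placeholder sizes
constexpr unsigned NUM_EXTRA_VGPRS = 9;
constexpr unsigned NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS;

// VGPR n -> n, extra VGPR-like slot k -> SQ_MAX_PGM_VGPRS + k,
// SGPR m -> NUM_ALL_VGPRS + m.
constexpr unsigned vgprSlot(unsigned N) { return N; }
constexpr unsigned extraSlot(unsigned K) { return SQ_MAX_PGM_VGPRS + K; }
constexpr unsigned sgprSlot(unsigned M) { return NUM_ALL_VGPRS + M; }
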
148 // Enumerate different types of result-returning VMEM operations. Although
150 // s_waitcnt only instructions of the same VmemType are guaranteed to write
151 // their results in order -- so there is no need to insert an s_waitcnt between
152 // two instructions of the same type that write the same vgpr.
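
// Illustrative sketch of the rule above (names hypothetical): a wait is only
// needed between two VMEM writes of the same vgpr when their VmemTypes differ.
enum VmemType { VMEM_NOSAMPLER, VMEM_SAMPLER, VMEM_BVH };

bool needsWaitBetweenWrites(VmemType Prev, VmemType Cur, bool SameVgpr) {
  // Same-type VMEM results land in issue order, so a same-vgpr WAW hazard
  // only exists across different types.
  return SameVgpr && Prev != Cur;
}
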
163 // Maps values of InstCounterType to the instruction that waits on that
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
193 return BaseInfo->BVH ? VMEM_BVH
194 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
198 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
201 return Wait.LoadCnt;
203 return Wait.ExpCnt;
205 return Wait.DsCnt;
207 return Wait.StoreCnt;
209 return Wait.SampleCnt;
211 return Wait.BvhCnt;
213 return Wait.KmCnt;
219 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
220 unsigned &WC = getCounterRef(Wait, T);
224 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 getCounterRef(Wait, T) = ~0u;
228 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
229 return getCounterRef(Wait, T);
241 // This object maintains the current score brackets of each wait counter, and
242 // a per-register scoreboard for each wait counter.
247 // wait count may get decreased out of order, therefore we need to put in
292 return getScoreUB(T) - getScoreLB(T);
300 return SgprScores[GprNo - NUM_ALL_VGPRS];
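
// Illustrative sketch of one score bracket as described above (simplified,
// standalone names): each new event pushes the upper bound, the registers it
// writes record that score, and the pending-event count is UB - LB.
#include <array>

struct Bracket {
  unsigned LB = 0, UB = 0;
  std::array<unsigned, 16> RegScore{}; // score of the last write to each reg

  void recordEvent(const unsigned *DefRegs, unsigned NumDefs) {
    ++UB;                        // one more outstanding event
    for (unsigned I = 0; I < NumDefs; ++I)
      RegScore[DefRegs[I]] = UB; // reg now depends on this event
  }
  unsigned pending() const { return UB - LB; }
};
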
310 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
312 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
313 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
332 return Events & (Events - 1);
394 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
403 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
404 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
425 int VgprUB = -1;
426 int SgprUB = -1;
428 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
429 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
432 // write to each vgpr.
436 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
441 // done because the set of counters and instructions for waiting on them
456 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
457 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
465 // Edits an existing sequence of wait count instructions according
467 // any new wait count instructions which may need to be generated by
472 // delete instructions if the incoming Wait value indicates they are not
473 // needed. It may also remove existing instructions for which a wait
475 // instructions later, as can happen on gfx12.
478 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
484 // Generates new wait count instructions according to the value of
485 // Wait, returning true if any new instructions were created.
488 AMDGPU::Waitcnt Wait) = 0;
500 // Create a mask value from the initializer list of wait event types.
519 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
524 AMDGPU::Waitcnt Wait) override;
555 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
560 AMDGPU::Waitcnt Wait) override;
604 // because of amdgpu-waitcnt-forcezero flag
609 // generator objects, which must have been re-initialised before use
638 return "SI insert wait instructions";
658 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
698 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
700 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
710 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
722 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
738 const MachineOperand &Op = MI->getOperand(OpNo);
739 if (!TRI->isInAllocatableClass(Op.getReg()))
740 return {-1, -1};
743 // A partial write is not a WAW.
748 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
751 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
753 Result.first = Reg - Encoding.VGPR0;
754 if (TRI->isAGPR(*MRI, Op.getReg()))
757 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
759 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
764 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
766 return {-1, -1};
768 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
769 unsigned Size = TRI->getRegSizeInBits(*RC);
781 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
799 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
804 // Put score on the source vgprs. If this is a store, just use those
806 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
811 if (AddrOpIdx != -1) {
835 TRI->isVectorRegister(*MRI, Op.getReg())) {
840 } else if (TII->isFLAT(Inst)) {
852 } else if (TII->isMIMG(Inst)) {
861 } else if (TII->isMTBUF(Inst)) {
865 } else if (TII->isMUBUF(Inst)) {
874 } else if (TII->isLDSDIR(Inst)) {
881 if (TII->isEXP(Inst)) {
889 TRI->isVGPR(*MRI, DefMO.getReg())) {
891 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
899 TRI->isVectorRegister(*MRI, MO.getReg())) {
918 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
929 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
930 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data
931 // written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
934 if (!MemOp->isStore() ||
935 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
939 auto AAI = MemOp->getAAInfo();
950 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
951 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
957 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
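
// Illustrative sketch of the LDS-DMA slot lookup above (standalone names, with
// an opaque tag standing in for the AA metadata): slot 0 is the general slot
// covering all LDS DMA stores, later slots are assigned per tracked store.
#include <vector>

struct LdsDmaSlots {
  static constexpr unsigned MaxTracked = 8; // NUM_EXTRA_VGPRS - 1, placeholder
  std::vector<const void *> Stores;         // tag of each tracked store

  void addStore(const void *Tag) {
    if (Stores.size() < MaxTracked)
      Stores.push_back(Tag); // otherwise it is covered by the general slot
  }
  // Returns 0 for the general slot, or the 1-based index of a matching store.
  unsigned slotFor(const void *Tag) const {
    for (unsigned I = 0, E = Stores.size(); I != E; ++I)
      if (Stores[I] == Tag)
        return I + 1;
    return 0;
  }
};
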
977 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
981 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
988 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1013 unsigned RelScore = RegScore - LB - 1;
1026 unsigned RelScore = RegScore - LB - 1;
1038 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1039 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1040 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1041 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1042 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1043 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1044 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1045 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1051 // as (UB - LB). If the current Count is greater than or equal to the number
1052 // of outstanding events, then the wait for this counter is redundant.
1058 AMDGPU::Waitcnt &Wait) const {
1067 !ST->hasFlatLgkmVMemCountInOrder()) {
1071 addWait(Wait, T, 0);
1073 // Counter can get decremented out-of-order when there
1076 addWait(Wait, T, 0);
1079 // MAX(CounterType) - 1 instead.
1080 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1081 addWait(Wait, T, NeededWait);
1086 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1087 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1088 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1089 applyWaitcnt(DS_CNT, Wait.DsCnt);
1090 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1091 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1092 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1093 applyWaitcnt(KM_CNT, Wait.KmCnt);
1103 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
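
// Illustrative sketch of the wait arithmetic above (standalone names): a
// register with score S needs the counter to drop to UB - S, clamped to the
// hardware maximum minus one, or to 0 when the counter can be decremented out
// of order; applying a wait of N raises LB so at most N events stay pending.
#include <algorithm>

struct CounterBracket {
  unsigned LB = 0, UB = 0;

  unsigned neededWait(unsigned ScoreToWait, unsigned HwMax,
                      bool OutOfOrder) const {
    if (ScoreToWait <= LB)
      return ~0u;                  // already complete: no wait needed
    if (OutOfOrder)
      return 0;                    // only a full flush is safe
    return std::min(UB - ScoreToWait, HwMax - 1);
  }
  void applyWait(unsigned Count) {
    LB = std::max(LB, UB - Count); // at most Count events remain pending
  }
};
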
1148 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1149 /// and if so, which counter it is waiting on.
1172 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1173 if (Opcode == Waitcnt->getOpcode())
1176 Waitcnt->setDesc(TII->get(Opcode));
1182 /// from \p Wait that were added by previous passes. Currently this pass
1187 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1203 // Update required wait count. If this is a soft waitcnt (= it was added
1210 Wait = Wait.combined(OldWait);
1213 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1223 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1226 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1228 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1238 AMDGPU::encodeWaitcnt(IV, Wait));
1241 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1242 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1243 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1244 Wait.LoadCnt = ~0u;
1245 Wait.ExpCnt = ~0u;
1246 Wait.DsCnt = ~0u;
1248 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1259 AMDGPU::OpName::simm16, Wait.StoreCnt);
1262 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1263 Wait.StoreCnt = ~0u;
1265 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1278 /// required counters in \p Wait
1281 AMDGPU::Waitcnt Wait) {
1290 if (Wait.hasWaitExceptStoreCnt()) {
1291 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1293 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1301 if (Wait.hasWaitStoreCnt()) {
1302 assert(ST->hasVscnt());
1305 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1307 .addImm(Wait.StoreCnt);
1320 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
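
// Illustrative sketch of the Waitcnt value convention used throughout this
// pass (a standalone struct, not AMDGPU::Waitcnt itself): ~0u in a field means
// "no wait on that counter", 0 means "drain it completely", and combining two
// required waits keeps the stricter (smaller) count per field.
#include <algorithm>

struct SimpleWaitcnt {
  unsigned LoadCnt = ~0u, ExpCnt = ~0u, DsCnt = ~0u, StoreCnt = ~0u;

  SimpleWaitcnt combined(const SimpleWaitcnt &O) const {
    return {std::min(LoadCnt, O.LoadCnt), std::min(ExpCnt, O.ExpCnt),
            std::min(DsCnt, O.DsCnt), std::min(StoreCnt, O.StoreCnt)};
  }
  bool hasWait() const {
    return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u || StoreCnt != ~0u;
  }
};
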
1329 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1334 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1350 // Update required wait count. If this is a soft waitcnt (= it was added
1363 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1367 Wait = Wait.combined(OldWait);
1371 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1375 Wait = Wait.combined(OldWait);
1381 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1384 addWait(Wait, CT.value(), OldCnt);
1400 // the appropriate single counter wait instruction can be inserted
1402 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1405 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1406 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1410 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1411 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1412 Wait.LoadCnt = ~0u;
1413 Wait.DsCnt = ~0u;
1415 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1423 CombinedLoadDsCntInstr->eraseFromParent();
1430 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1431 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1435 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1436 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1437 Wait.StoreCnt = ~0u;
1438 Wait.DsCnt = ~0u;
1440 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1448 CombinedStoreDsCntInstr->eraseFromParent();
1459 if (Wait.DsCnt != ~0u) {
1466 // individual wait count instructions for these.
1468 if (Wait.LoadCnt != ~0u) {
1471 } else if (Wait.StoreCnt != ~0u) {
1480 (*WI)->eraseFromParent();
1490 unsigned NewCnt = getWait(Wait, CT);
1497 setNoWait(Wait, CT);
1499 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1507 WaitInstrs[CT]->eraseFromParent();
1515 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1518 AMDGPU::Waitcnt Wait) {
1525 // Check for opportunities to use combined wait instructions.
1526 if (Wait.DsCnt != ~0u) {
1529 if (Wait.LoadCnt != ~0u) {
1530 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1532 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1535 Wait.LoadCnt = ~0u;
1536 Wait.DsCnt = ~0u;
1537 } else if (Wait.StoreCnt != ~0u) {
1538 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1541 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1544 Wait.StoreCnt = ~0u;
1545 Wait.DsCnt = ~0u;
1561 unsigned Count = getWait(Wait, CT);
1566 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
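
// Illustrative sketch of the gfx12 selection above (plain strings stand in for
// the real opcodes): prefer a combined LOADCNT_DSCNT or STORECNT_DSCNT wait
// when both halves are required, then fall back to individual S_WAIT_*CNT
// instructions for whatever remains.
#include <string>
#include <vector>

std::vector<std::string> selectWaits(unsigned LoadCnt, unsigned DsCnt,
                                     unsigned StoreCnt) {
  const unsigned NoWait = ~0u;
  std::vector<std::string> Waits;
  if (DsCnt != NoWait && LoadCnt != NoWait) {
    Waits.push_back("s_wait_loadcnt_dscnt");
    LoadCnt = DsCnt = NoWait;
  } else if (DsCnt != NoWait && StoreCnt != NoWait) {
    Waits.push_back("s_wait_storecnt_dscnt");
    StoreCnt = DsCnt = NoWait;
  }
  if (LoadCnt != NoWait)  Waits.push_back("s_wait_loadcnt");
  if (DsCnt != NoWait)    Waits.push_back("s_wait_dscnt");
  if (StoreCnt != NoWait) Waits.push_back("s_wait_storecnt");
  return Waits;
}
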
1585 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1587 // Currently all conventions wait, but this may not always be the case.
1590 // sense to omit the wait and do it in the caller.
1594 /// \returns true if the callee is expected to wait for any outstanding waits
1603 /// We rely on this in-order completion
1621 AMDGPU::Waitcnt Wait;
1625 // verify that nothing was relying on this. The number of buffer invalidates
1632 Wait.LoadCnt = 0;
1642 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1644 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1652 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1657 // Resolve vm waits before gs-done.
1660 ST->hasLegacyGeometry() &&
1663 Wait.LoadCnt = 0;
1668 // The shader program must flush all EXP operations on the export-count
1678 Wait.ExpCnt = 0;
1683 // The function is going to insert a wait on everything in its prolog.
1686 Wait = AMDGPU::Waitcnt();
1697 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1701 if (RtnAddrOpIdx != -1) {
1707 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1711 // FIXME: Should not be relying on memoperands.
1723 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1726 const Value *Ptr = Memop->getValue();
1727 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1728 addWait(Wait, SmemAccessCounter, 0);
1729 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1732 unsigned AS = Memop->getAddrSpace();
1735 // No need to wait before load from VMEM to LDS.
1736 if (TII->mayWriteLDSThroughDMA(MI))
1746 // will produce a wait using the first (general) LDS DMA wait slot which
1747 // will wait on all of them anyway.
1748 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1753 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1758 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1759 if (Memop->isStore()) {
1760 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1771 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1776 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1780 // previous write and this write are the same type of VMEM
1782 // guaranteed to write their results in order anyway.
1786 !ST->hasVmemWriteVgprInOrder()) {
1787 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1788 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1789 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1793 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1795 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1797 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1808 if (TII->isBarrierStart(MI.getOpcode()) &&
1809 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1810 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1813 // TODO: Remove this work-around, enable the assert for Bug 457939
1816 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1818 Wait.DsCnt = 0;
1822 // Verify that the wait is actually needed.
1823 ScoreBrackets.simplifyWaitcnt(Wait);
1826 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1829 Wait.LoadCnt = 0;
1831 Wait.ExpCnt = 0;
1833 Wait.DsCnt = 0;
1835 Wait.SampleCnt = 0;
1837 Wait.BvhCnt = 0;
1839 Wait.KmCnt = 0;
1843 Wait.LoadCnt = 0;
1845 Wait.SampleCnt = 0;
1847 Wait.BvhCnt = 0;
1850 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1854 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1862 // Try to merge the required wait with preexisting waitcnt instructions.
1865 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1869 ScoreBrackets.applyWaitcnt(Wait);
1872 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1875 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1876 if (Wait.ExpCnt < WaitExp->getImm()) {
1877 WaitExp->setImm(Wait.ExpCnt);
1880 Wait.ExpCnt = ~0u;
1886 if (WCG->createNewWaitcnt(Block, It, Wait))
1896 assert(TII->isFLAT(MI));
1899 assert(TII->usesVM_CNT(MI));
1912 unsigned AS = Memop->getAddrSpace();
1924 assert(TII->isFLAT(MI));
1927 if (!TII->usesLGKM_CNT(MI))
1931 if (ST->isTgSplitEnabled())
1941 unsigned AS = Memop->getAddrSpace();
1953 assert(TII->isFLAT(MI));
1956 if (TII->isFLATScratch(MI))
1960 if (TII->isFLATGlobal(MI))
1970 unsigned AS = Memop->getAddrSpace();
1984 // instruction, update the upper-bound of the appropriate counter's
1988 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1989 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1990 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1991 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1992 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1994 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1996 } else if (TII->isFLAT(Inst)) {
2007 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2013 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2020 // - it will require that both the VM and LGKM be flushed to zero if it is
2023 ScoreBrackets->setPendingFlat();
2026 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2029 if (ST->vmemWriteNeedsExpWaitcnt() &&
2031 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2033 } else if (TII->isSMRD(Inst)) {
2034 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2037 // Act as a wait on everything
2038 ScoreBrackets->applyWaitcnt(
2039 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2040 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2042 // May need to wait for anything.
2043 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2046 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2047 } else if (TII->isVINTERP(Inst)) {
2048 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2049 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2051 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2053 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2057 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2064 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2073 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2108 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2109 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2117 M.MyShift = NewUB - ScoreUBs[T];
2118 M.OtherShift = NewUB - Other.ScoreUBs[T];
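
// Illustrative sketch of the merge rebasing above (standalone names, one
// plausible formulation): each side keeps its pending window (UB - LB), the
// merged UB makes room for the larger window, and each side's scores are
// shifted by NewUB minus its old UB.
#include <algorithm>

struct MergeShifts {
  unsigned NewUB, MyShift, OtherShift;
};

MergeShifts computeShifts(unsigned MyLB, unsigned MyUB, unsigned OtherLB,
                          unsigned OtherUB) {
  unsigned MyPending = MyUB - MyLB;
  unsigned OtherPending = OtherUB - OtherLB;
  // Keep this side's lower bound and make room for the larger pending window.
  unsigned NewUB = MyLB + std::max(MyPending, OtherPending);
  return {NewUB, NewUB - MyUB, NewUB - OtherUB};
}
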
2164 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2165 // ST->partialVCCWritesUpdateVCCZ().
2167 if (ST->hasReadVCCZBug()) {
2171 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2185 // Track pre-existing waitcnts that were added in earlier iterations or by
2206 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2210 if (!ST->partialVCCWritesUpdateVCCZ())
2213 // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
2216 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2220 if (ST->hasReadVCCZBug() &&
2232 if (TII->isSMRD(Inst)) {
2236 if (!Memop->isInvariant()) {
2237 const Value *Ptr = Memop->getValue();
2241 if (ST->hasReadVCCZBug()) {
2249 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2250 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2252 ScoreBrackets.simplifyWaitcnt(Wait);
2253 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2262 // TODO: Remove this work-around after fixing the scheduler and enable the
2269 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2270 TRI->getVCC())
2271 .addReg(TRI->getVCC());
2281 AMDGPU::Waitcnt Wait;
2285 Wait.LoadCnt = 0;
2287 Wait.SampleCnt = 0;
2289 Wait.BvhCnt = 0;
2293 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2305 return Iterator->second;
2311 MachineLoop *Loop = MLI->getLoopFor(Succ);
2315 if (Loop->getLoopPreheader() == &MBB &&
2317 Iterator->second = true;
2332 // vgpr containing a value that is loaded outside of the loop. (Only on
2345 for (MachineBasicBlock *MBB : ML->blocks()) {
2355 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2391 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2393 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2398 TII = ST->getInstrInfo();
2399 TRI = &TII->getRegisterInfo();
2405 AA = &AAR->getAAResults();
2407 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2409 if (ST->hasExtendedWaitCounts()) {
2423 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2428 if (ST->hasExtendedWaitCounts()) {
2441 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2442 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2448 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2449 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2451 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2452 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2460 if (!MFI->isEntryFunction()) {
2461 // Wait for any outstanding memory operations that the input registers may
2462 // depend on. We can't track them and it's better to do the wait after the
2468 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2471 if (ST->hasExtendedWaitCounts()) {
2472 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2479 TII->get(instrsForExtendedCounterTypes[CT]))
2483 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2489 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2507 MachineBasicBlock *MBB = BII->first;
2508 BlockInfo &BI = BII->second;
2530 if (Brackets->hasPendingEvent()) {
2532 for (MachineBasicBlock *Succ : MBB->successors()) {
2534 BlockInfo &SuccBI = SuccBII->second;
2544 } else if (SuccBI.Incoming->merge(*Brackets)) {
2551 MoveBracketsToSucc->Incoming = std::move(Brackets);
2556 if (ST->hasScalarStores()) {
2562 if (!HaveScalarStores && TII->isScalarStore(MI))
2583 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2585 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2587 else if (TII->isScalarStore(*I))
2591 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2592 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2595 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2605 if (ST->requiresNopBeforeDeallocVGPRs()) {
2606 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
2609 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2610 TII->get(AMDGPU::S_SENDMSG))