Lines Matching full:wait
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
10 /// Insert wait instructions for memory reads and writes.
19 /// example, when both SMEM and LDS are in flight and we need to wait for
198 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
201 return Wait.LoadCnt;
203 return Wait.ExpCnt;
205 return Wait.DsCnt;
207 return Wait.StoreCnt;
209 return Wait.SampleCnt;
211 return Wait.BvhCnt;
213 return Wait.KmCnt;
219 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
220 unsigned &WC = getCounterRef(Wait, T);
224 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 getCounterRef(Wait, T) = ~0u;
228 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
229 return getCounterRef(Wait, T);
241 // This object maintains the current score brackets of each wait counter, and
242 // a per-register scoreboard for each wait counter.
247 // wait count may get decreased out of order, therefore we need to put in
310 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
312 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
313 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
428 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
465 // Edits an existing sequence of wait count instructions according
467 // any new wait count instructions which may need to be generated by
472 // delete instructions if the incoming Wait value indicates they are not
473 // needed. It may also remove existing instructions for which a wait
478 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
484 // Generates new wait count instructions according to the value of
485 // Wait, returning true if any new instructions were created.
488 AMDGPU::Waitcnt Wait) = 0;
500 // Create a mask value from the initializer list of wait event types.
519 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
524 AMDGPU::Waitcnt Wait) override;
555 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
560 AMDGPU::Waitcnt Wait) override;
638 return "SI insert wait instructions";
722 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
930 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data
931 // written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
1038 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1039 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1040 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1041 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1042 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1043 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1044 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1045 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1052 // of outstanding events, then the wait for this counter is redundant.
1058 AMDGPU::Waitcnt &Wait) const {
1071 addWait(Wait, T, 0);
1076 addWait(Wait, T, 0);
1081 addWait(Wait, T, NeededWait);
1086 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1087 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1088 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1089 applyWaitcnt(DS_CNT, Wait.DsCnt);
1090 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1091 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1092 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1093 applyWaitcnt(KM_CNT, Wait.KmCnt);
1182 /// from \p Wait that were added by previous passes. Currently this pass
1187 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1203 // Update required wait count. If this is a soft waitcnt (= it was added
1210 Wait = Wait.combined(OldWait);
1213 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1226 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1228 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1238 AMDGPU::encodeWaitcnt(IV, Wait));
1241 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1242 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1243 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1244 Wait.LoadCnt = ~0u;
1245 Wait.ExpCnt = ~0u;
1246 Wait.DsCnt = ~0u;
1259 AMDGPU::OpName::simm16, Wait.StoreCnt);
1262 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1263 Wait.StoreCnt = ~0u;
1278 /// required counters in \p Wait
1281 AMDGPU::Waitcnt Wait) {
1290 if (Wait.hasWaitExceptStoreCnt()) {
1291 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1301 if (Wait.hasWaitStoreCnt()) {
1307 .addImm(Wait.StoreCnt);
1329 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1334 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1350 // Update required wait count. If this is a soft waitcnt (= it was added
1367 Wait = Wait.combined(OldWait);
1375 Wait = Wait.combined(OldWait);
1384 addWait(Wait, CT.value(), OldCnt);
1400 // the appropriate single counter wait instruction can be inserted
1402 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1405 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1406 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1410 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1411 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1412 Wait.LoadCnt = ~0u;
1413 Wait.DsCnt = ~0u;
1430 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1431 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1435 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1436 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1437 Wait.StoreCnt = ~0u;
1438 Wait.DsCnt = ~0u;
1459 if (Wait.DsCnt != ~0u) {
1466 // individual wait count instructions for these.
1468 if (Wait.LoadCnt != ~0u) {
1471 } else if (Wait.StoreCnt != ~0u) {
1490 unsigned NewCnt = getWait(Wait, CT);
1497 setNoWait(Wait, CT);
1515 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1518 AMDGPU::Waitcnt Wait) {
1525 // Check for opportunities to use combined wait instructions.
1526 if (Wait.DsCnt != ~0u) {
1529 if (Wait.LoadCnt != ~0u) {
1530 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1535 Wait.LoadCnt = ~0u;
1536 Wait.DsCnt = ~0u;
1537 } else if (Wait.StoreCnt != ~0u) {
1538 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1544 Wait.StoreCnt = ~0u;
1545 Wait.DsCnt = ~0u;
1561 unsigned Count = getWait(Wait, CT);
1587 // Currently all conventions wait, but this may not always be the case.
1590 // senses to omit the wait and do it in the caller.
1594 /// \returns true if the callee is expected to wait for any outstanding waits
1621 AMDGPU::Waitcnt Wait;
1632 Wait.LoadCnt = 0;
1642 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1644 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1663 Wait.LoadCnt = 0;
1678 Wait.ExpCnt = 0;
1683 // The function is going to insert a wait on everything in its prolog.
1686 Wait = AMDGPU::Waitcnt();
1697 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1707 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1728 addWait(Wait, SmemAccessCounter, 0);
1735 // No need to wait before load from VMEM to LDS.
1746 // will produce a wait using the first (general) LDS DMA wait slot which
1747 // will wait on all of them anyway.
1753 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1758 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1760 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1787 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1788 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1789 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1793 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1795 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1797 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1810 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1818 Wait.DsCnt = 0;
1822 // Verify that the wait is actually needed.
1823 ScoreBrackets.simplifyWaitcnt(Wait);
1826 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1829 Wait.LoadCnt = 0;
1831 Wait.ExpCnt = 0;
1833 Wait.DsCnt = 0;
1835 Wait.SampleCnt = 0;
1837 Wait.BvhCnt = 0;
1839 Wait.KmCnt = 0;
1843 Wait.LoadCnt = 0;
1845 Wait.SampleCnt = 0;
1847 Wait.BvhCnt = 0;
1850 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1854 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1862 // Try to merge the required wait with preexisting waitcnt instructions.
1865 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1869 ScoreBrackets.applyWaitcnt(Wait);
1872 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1876 if (Wait.ExpCnt < WaitExp->getImm()) {
1877 WaitExp->setImm(Wait.ExpCnt);
1880 Wait.ExpCnt = ~0u;
1886 if (WCG->createNewWaitcnt(Block, It, Wait))
2037 // Act as a wait on everything
2042 // May need to wait for anything.
2216 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2250 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2252 ScoreBrackets.simplifyWaitcnt(Wait);
2253 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2281 AMDGPU::Waitcnt Wait;
2285 Wait.LoadCnt = 0;
2287 Wait.SampleCnt = 0;
2289 Wait.BvhCnt = 0;
2293 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2461 // Wait for any outstanding memory operations that the input registers may
2462 // depend on. We can't track them and it's better to do the wait after the