xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//
1381ad6265SDimitry Andric 
1481ad6265SDimitry Andric #include "AMDGPU.h"
1581ad6265SDimitry Andric #include "GCNSubtarget.h"
1681ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1781ad6265SDimitry Andric #include "SIInstrInfo.h"
1881ad6265SDimitry Andric #include "llvm/ADT/SetVector.h"
1981ad6265SDimitry Andric 
2081ad6265SDimitry Andric using namespace llvm;
2181ad6265SDimitry Andric 
2281ad6265SDimitry Andric #define DEBUG_TYPE "amdgpu-insert-delay-alu"
2381ad6265SDimitry Andric 
2481ad6265SDimitry Andric namespace {
2581ad6265SDimitry Andric 
2681ad6265SDimitry Andric class AMDGPUInsertDelayAlu : public MachineFunctionPass {
2781ad6265SDimitry Andric public:
2881ad6265SDimitry Andric   static char ID;
2981ad6265SDimitry Andric 
3081ad6265SDimitry Andric   const SIInstrInfo *SII;
3181ad6265SDimitry Andric   const TargetRegisterInfo *TRI;
3281ad6265SDimitry Andric 
3381ad6265SDimitry Andric   TargetSchedModel SchedModel;
3481ad6265SDimitry Andric 
AMDGPUInsertDelayAlu()3581ad6265SDimitry Andric   AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
3681ad6265SDimitry Andric 
getAnalysisUsage(AnalysisUsage & AU) const3781ad6265SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
3881ad6265SDimitry Andric     AU.setPreservesCFG();
3981ad6265SDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
4081ad6265SDimitry Andric   }
4181ad6265SDimitry Andric 
4281ad6265SDimitry Andric   // Return true if MI waits for all outstanding VALU instructions to complete.
instructionWaitsForVALU(const MachineInstr & MI)4381ad6265SDimitry Andric   static bool instructionWaitsForVALU(const MachineInstr &MI) {
4481ad6265SDimitry Andric     // These instruction types wait for VA_VDST==0 before issuing.
4581ad6265SDimitry Andric     const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
4681ad6265SDimitry Andric                                SIInstrFlags::FLAT | SIInstrFlags::MIMG |
4781ad6265SDimitry Andric                                SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
4881ad6265SDimitry Andric     if (MI.getDesc().TSFlags & VA_VDST_0)
4981ad6265SDimitry Andric       return true;
5081ad6265SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
5181ad6265SDimitry Andric         MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
5281ad6265SDimitry Andric       return true;
5381ad6265SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54*06c3fb27SDimitry Andric         AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
5581ad6265SDimitry Andric       return true;
5681ad6265SDimitry Andric     return false;
5781ad6265SDimitry Andric   }
5881ad6265SDimitry Andric 
5981ad6265SDimitry Andric   // Types of delay that can be encoded in an s_delay_alu instruction.
6081ad6265SDimitry Andric   enum DelayType { VALU, TRANS, SALU, OTHER };
6181ad6265SDimitry Andric 
6281ad6265SDimitry Andric   // Get the delay type for an instruction with the specified TSFlags.
getDelayType(uint64_t TSFlags)6381ad6265SDimitry Andric   static DelayType getDelayType(uint64_t TSFlags) {
6481ad6265SDimitry Andric     if (TSFlags & SIInstrFlags::TRANS)
6581ad6265SDimitry Andric       return TRANS;
6681ad6265SDimitry Andric     if (TSFlags & SIInstrFlags::VALU)
6781ad6265SDimitry Andric       return VALU;
6881ad6265SDimitry Andric     if (TSFlags & SIInstrFlags::SALU)
6981ad6265SDimitry Andric       return SALU;
7081ad6265SDimitry Andric     return OTHER;
7181ad6265SDimitry Andric   }
7281ad6265SDimitry Andric 
7381ad6265SDimitry Andric   // Information about the last instruction(s) that wrote to a particular
7481ad6265SDimitry Andric   // regunit. In straight-line code there will only be one such instruction, but
7581ad6265SDimitry Andric   // when control flow converges we merge the delay information from each path
7681ad6265SDimitry Andric   // to represent the union of the worst-case delays of each type.
7781ad6265SDimitry Andric   struct DelayInfo {
7881ad6265SDimitry Andric     // One larger than the maximum number of (non-TRANS) VALU instructions we
7981ad6265SDimitry Andric     // can encode in an s_delay_alu instruction.
80*06c3fb27SDimitry Andric     static constexpr unsigned VALU_MAX = 5;
8181ad6265SDimitry Andric 
8281ad6265SDimitry Andric     // One larger than the maximum number of TRANS instructions we can encode in
8381ad6265SDimitry Andric     // an s_delay_alu instruction.
84*06c3fb27SDimitry Andric     static constexpr unsigned TRANS_MAX = 4;
85*06c3fb27SDimitry Andric 
86*06c3fb27SDimitry Andric     // One larger than the maximum number of SALU cycles we can encode in an
87*06c3fb27SDimitry Andric     // s_delay_alu instruction.
88*06c3fb27SDimitry Andric     static constexpr unsigned SALU_CYCLES_MAX = 4;
8981ad6265SDimitry Andric 
9081ad6265SDimitry Andric     // If it was written by a (non-TRANS) VALU, remember how many clock cycles
9181ad6265SDimitry Andric     // are left until it completes, and how many other (non-TRANS) VALU we have
9281ad6265SDimitry Andric     // seen since it was issued.
9381ad6265SDimitry Andric     uint8_t VALUCycles = 0;
9481ad6265SDimitry Andric     uint8_t VALUNum = VALU_MAX;
9581ad6265SDimitry Andric 
9681ad6265SDimitry Andric     // If it was written by a TRANS, remember how many clock cycles are left
9781ad6265SDimitry Andric     // until it completes, and how many other TRANS we have seen since it was
9881ad6265SDimitry Andric     // issued.
9981ad6265SDimitry Andric     uint8_t TRANSCycles = 0;
10081ad6265SDimitry Andric     uint8_t TRANSNum = TRANS_MAX;
10181ad6265SDimitry Andric     // Also remember how many other (non-TRANS) VALU we have seen since it was
10281ad6265SDimitry Andric     // issued. When an instruction depends on both a prior TRANS and a prior
10381ad6265SDimitry Andric     // non-TRANS VALU, this is used to decide whether to encode a wait for just
10481ad6265SDimitry Andric     // one or both of them.
10581ad6265SDimitry Andric     uint8_t TRANSNumVALU = VALU_MAX;
10681ad6265SDimitry Andric 
10781ad6265SDimitry Andric     // If it was written by an SALU, remember how many clock cycles are left
10881ad6265SDimitry Andric     // until it completes.
10981ad6265SDimitry Andric     uint8_t SALUCycles = 0;
11081ad6265SDimitry Andric 
11181ad6265SDimitry Andric     DelayInfo() = default;
11281ad6265SDimitry Andric 
DelayInfo__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo11381ad6265SDimitry Andric     DelayInfo(DelayType Type, unsigned Cycles) {
11481ad6265SDimitry Andric       switch (Type) {
11581ad6265SDimitry Andric       default:
11681ad6265SDimitry Andric         llvm_unreachable("unexpected type");
11781ad6265SDimitry Andric       case VALU:
11881ad6265SDimitry Andric         VALUCycles = Cycles;
11981ad6265SDimitry Andric         VALUNum = 0;
12081ad6265SDimitry Andric         break;
12181ad6265SDimitry Andric       case TRANS:
12281ad6265SDimitry Andric         TRANSCycles = Cycles;
12381ad6265SDimitry Andric         TRANSNum = 0;
12481ad6265SDimitry Andric         TRANSNumVALU = 0;
12581ad6265SDimitry Andric         break;
12681ad6265SDimitry Andric       case SALU:
127*06c3fb27SDimitry Andric         // Guard against pseudo-instructions like SI_CALL which are marked as
128*06c3fb27SDimitry Andric         // SALU but with a very high latency.
129*06c3fb27SDimitry Andric         SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
13081ad6265SDimitry Andric         break;
13181ad6265SDimitry Andric       }
13281ad6265SDimitry Andric     }
13381ad6265SDimitry Andric 
operator ==__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo13481ad6265SDimitry Andric     bool operator==(const DelayInfo &RHS) const {
13581ad6265SDimitry Andric       return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
13681ad6265SDimitry Andric              TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
13781ad6265SDimitry Andric              TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
13881ad6265SDimitry Andric     }
13981ad6265SDimitry Andric 
operator !=__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo14081ad6265SDimitry Andric     bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
14181ad6265SDimitry Andric 
14281ad6265SDimitry Andric     // Merge another DelayInfo into this one, to represent the union of the
14381ad6265SDimitry Andric     // worst-case delays of each type.
merge__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo14481ad6265SDimitry Andric     void merge(const DelayInfo &RHS) {
14581ad6265SDimitry Andric       VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
14681ad6265SDimitry Andric       VALUNum = std::min(VALUNum, RHS.VALUNum);
14781ad6265SDimitry Andric       TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
14881ad6265SDimitry Andric       TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
14981ad6265SDimitry Andric       TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
15081ad6265SDimitry Andric       SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
15181ad6265SDimitry Andric     }
15281ad6265SDimitry Andric 
15381ad6265SDimitry Andric     // Update this DelayInfo after issuing an instruction. IsVALU should be 1
15481ad6265SDimitry Andric     // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
15581ad6265SDimitry Andric     // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
15681ad6265SDimitry Andric     // instruction.  Return true if there is no longer any useful delay info.
advance__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo15781ad6265SDimitry Andric     bool advance(DelayType Type, unsigned Cycles) {
15881ad6265SDimitry Andric       bool Erase = true;
15981ad6265SDimitry Andric 
16081ad6265SDimitry Andric       VALUNum += (Type == VALU);
16181ad6265SDimitry Andric       if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
16281ad6265SDimitry Andric         // Forget about the VALU instruction. It was too far back or has
16381ad6265SDimitry Andric         // definitely completed by now.
16481ad6265SDimitry Andric         VALUNum = VALU_MAX;
16581ad6265SDimitry Andric         VALUCycles = 0;
16681ad6265SDimitry Andric       } else {
16781ad6265SDimitry Andric         VALUCycles -= Cycles;
16881ad6265SDimitry Andric         Erase = false;
16981ad6265SDimitry Andric       }
17081ad6265SDimitry Andric 
17181ad6265SDimitry Andric       TRANSNum += (Type == TRANS);
17281ad6265SDimitry Andric       TRANSNumVALU += (Type == VALU);
17381ad6265SDimitry Andric       if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
17481ad6265SDimitry Andric         // Forget about any TRANS instruction. It was too far back or has
17581ad6265SDimitry Andric         // definitely completed by now.
17681ad6265SDimitry Andric         TRANSNum = TRANS_MAX;
17781ad6265SDimitry Andric         TRANSNumVALU = VALU_MAX;
17881ad6265SDimitry Andric         TRANSCycles = 0;
17981ad6265SDimitry Andric       } else {
18081ad6265SDimitry Andric         TRANSCycles -= Cycles;
18181ad6265SDimitry Andric         Erase = false;
18281ad6265SDimitry Andric       }
18381ad6265SDimitry Andric 
18481ad6265SDimitry Andric       if (SALUCycles <= Cycles) {
18581ad6265SDimitry Andric         // Forget about any SALU instruction. It has definitely completed by
18681ad6265SDimitry Andric         // now.
18781ad6265SDimitry Andric         SALUCycles = 0;
18881ad6265SDimitry Andric       } else {
18981ad6265SDimitry Andric         SALUCycles -= Cycles;
19081ad6265SDimitry Andric         Erase = false;
19181ad6265SDimitry Andric       }
19281ad6265SDimitry Andric 
19381ad6265SDimitry Andric       return Erase;
19481ad6265SDimitry Andric     }
19581ad6265SDimitry Andric 
19681ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo19781ad6265SDimitry Andric     void dump() const {
19881ad6265SDimitry Andric       if (VALUCycles)
19981ad6265SDimitry Andric         dbgs() << " VALUCycles=" << (int)VALUCycles;
20081ad6265SDimitry Andric       if (VALUNum < VALU_MAX)
20181ad6265SDimitry Andric         dbgs() << " VALUNum=" << (int)VALUNum;
20281ad6265SDimitry Andric       if (TRANSCycles)
20381ad6265SDimitry Andric         dbgs() << " TRANSCycles=" << (int)TRANSCycles;
20481ad6265SDimitry Andric       if (TRANSNum < TRANS_MAX)
20581ad6265SDimitry Andric         dbgs() << " TRANSNum=" << (int)TRANSNum;
20681ad6265SDimitry Andric       if (TRANSNumVALU < VALU_MAX)
20781ad6265SDimitry Andric         dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
20881ad6265SDimitry Andric       if (SALUCycles)
20981ad6265SDimitry Andric         dbgs() << " SALUCycles=" << (int)SALUCycles;
21081ad6265SDimitry Andric     }
21181ad6265SDimitry Andric #endif
21281ad6265SDimitry Andric   };
21381ad6265SDimitry Andric 
21481ad6265SDimitry Andric   // A map from regunits to the delay info for that regunit.
21581ad6265SDimitry Andric   struct DelayState : DenseMap<unsigned, DelayInfo> {
21681ad6265SDimitry Andric     // Merge another DelayState into this one by merging the delay info for each
21781ad6265SDimitry Andric     // regunit.
merge__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayState21881ad6265SDimitry Andric     void merge(const DelayState &RHS) {
21981ad6265SDimitry Andric       for (const auto &KV : RHS) {
22081ad6265SDimitry Andric         iterator It;
22181ad6265SDimitry Andric         bool Inserted;
22281ad6265SDimitry Andric         std::tie(It, Inserted) = insert(KV);
22381ad6265SDimitry Andric         if (!Inserted)
22481ad6265SDimitry Andric           It->second.merge(KV.second);
22581ad6265SDimitry Andric       }
22681ad6265SDimitry Andric     }
22781ad6265SDimitry Andric 
22881ad6265SDimitry Andric     // Advance the delay info for each regunit, erasing any that are no longer
22981ad6265SDimitry Andric     // useful.
advance__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayState23081ad6265SDimitry Andric     void advance(DelayType Type, unsigned Cycles) {
23181ad6265SDimitry Andric       iterator Next;
23281ad6265SDimitry Andric       for (auto I = begin(), E = end(); I != E; I = Next) {
23381ad6265SDimitry Andric         Next = std::next(I);
23481ad6265SDimitry Andric         if (I->second.advance(Type, Cycles))
23581ad6265SDimitry Andric           erase(I);
23681ad6265SDimitry Andric       }
23781ad6265SDimitry Andric     }
23881ad6265SDimitry Andric 
23981ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayState24081ad6265SDimitry Andric     void dump(const TargetRegisterInfo *TRI) const {
24181ad6265SDimitry Andric       if (empty()) {
24281ad6265SDimitry Andric         dbgs() << "    empty\n";
24381ad6265SDimitry Andric         return;
24481ad6265SDimitry Andric       }
24581ad6265SDimitry Andric 
24681ad6265SDimitry Andric       // Dump DelayInfo for each RegUnit in numerical order.
24781ad6265SDimitry Andric       SmallVector<const_iterator, 8> Order;
24881ad6265SDimitry Andric       Order.reserve(size());
24981ad6265SDimitry Andric       for (const_iterator I = begin(), E = end(); I != E; ++I)
25081ad6265SDimitry Andric         Order.push_back(I);
25181ad6265SDimitry Andric       llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
25281ad6265SDimitry Andric         return A->first < B->first;
25381ad6265SDimitry Andric       });
25481ad6265SDimitry Andric       for (const_iterator I : Order) {
25581ad6265SDimitry Andric         dbgs() << "    " << printRegUnit(I->first, TRI);
25681ad6265SDimitry Andric         I->second.dump();
25781ad6265SDimitry Andric         dbgs() << "\n";
25881ad6265SDimitry Andric       }
25981ad6265SDimitry Andric     }
26081ad6265SDimitry Andric #endif
26181ad6265SDimitry Andric   };
26281ad6265SDimitry Andric 
26381ad6265SDimitry Andric   // The saved delay state at the end of each basic block.
26481ad6265SDimitry Andric   DenseMap<MachineBasicBlock *, DelayState> BlockState;
26581ad6265SDimitry Andric 
26681ad6265SDimitry Andric   // Emit an s_delay_alu instruction if necessary before MI.
emitDelayAlu(MachineInstr & MI,DelayInfo Delay,MachineInstr * LastDelayAlu)26781ad6265SDimitry Andric   MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
26881ad6265SDimitry Andric                              MachineInstr *LastDelayAlu) {
26981ad6265SDimitry Andric     unsigned Imm = 0;
27081ad6265SDimitry Andric 
27181ad6265SDimitry Andric     // Wait for a TRANS instruction.
27281ad6265SDimitry Andric     if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
27381ad6265SDimitry Andric       Imm |= 4 + Delay.TRANSNum;
27481ad6265SDimitry Andric 
27581ad6265SDimitry Andric     // Wait for a VALU instruction (if it's more recent than any TRANS
27681ad6265SDimitry Andric     // instruction that we're also waiting for).
27781ad6265SDimitry Andric     if (Delay.VALUNum < DelayInfo::VALU_MAX &&
27881ad6265SDimitry Andric         Delay.VALUNum <= Delay.TRANSNumVALU) {
27981ad6265SDimitry Andric       if (Imm & 0xf)
28081ad6265SDimitry Andric         Imm |= Delay.VALUNum << 7;
28181ad6265SDimitry Andric       else
28281ad6265SDimitry Andric         Imm |= Delay.VALUNum;
28381ad6265SDimitry Andric     }
28481ad6265SDimitry Andric 
28581ad6265SDimitry Andric     // Wait for an SALU instruction.
28681ad6265SDimitry Andric     if (Delay.SALUCycles) {
287*06c3fb27SDimitry Andric       assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
28881ad6265SDimitry Andric       if (Imm & 0x780) {
28981ad6265SDimitry Andric         // We have already encoded a VALU and a TRANS delay. There's no room in
29081ad6265SDimitry Andric         // the encoding for an SALU delay as well, so just drop it.
29181ad6265SDimitry Andric       } else if (Imm & 0xf) {
29281ad6265SDimitry Andric         Imm |= (Delay.SALUCycles + 8) << 7;
29381ad6265SDimitry Andric       } else {
29481ad6265SDimitry Andric         Imm |= Delay.SALUCycles + 8;
29581ad6265SDimitry Andric       }
29681ad6265SDimitry Andric     }
29781ad6265SDimitry Andric 
29881ad6265SDimitry Andric     // Don't emit the s_delay_alu instruction if there's nothing to wait for.
29981ad6265SDimitry Andric     if (!Imm)
30081ad6265SDimitry Andric       return LastDelayAlu;
30181ad6265SDimitry Andric 
30281ad6265SDimitry Andric     // If we only need to wait for one instruction, try encoding it in the last
30381ad6265SDimitry Andric     // s_delay_alu that we emitted.
30481ad6265SDimitry Andric     if (!(Imm & 0x780) && LastDelayAlu) {
30581ad6265SDimitry Andric       unsigned Skip = 0;
30681ad6265SDimitry Andric       for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
30781ad6265SDimitry Andric                 E = MachineBasicBlock::instr_iterator(MI);
30881ad6265SDimitry Andric            ++I != E;) {
30981ad6265SDimitry Andric         if (!I->isBundle() && !I->isMetaInstruction())
31081ad6265SDimitry Andric           ++Skip;
31181ad6265SDimitry Andric       }
31281ad6265SDimitry Andric       if (Skip < 6) {
31381ad6265SDimitry Andric         MachineOperand &Op = LastDelayAlu->getOperand(0);
31481ad6265SDimitry Andric         unsigned LastImm = Op.getImm();
31581ad6265SDimitry Andric         assert((LastImm & ~0xf) == 0 &&
31681ad6265SDimitry Andric                "Remembered an s_delay_alu with no room for another delay!");
31781ad6265SDimitry Andric         LastImm |= Imm << 7 | Skip << 4;
31881ad6265SDimitry Andric         Op.setImm(LastImm);
31981ad6265SDimitry Andric         return nullptr;
32081ad6265SDimitry Andric       }
32181ad6265SDimitry Andric     }
32281ad6265SDimitry Andric 
32381ad6265SDimitry Andric     auto &MBB = *MI.getParent();
32481ad6265SDimitry Andric     MachineInstr *DelayAlu =
32581ad6265SDimitry Andric         BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
32681ad6265SDimitry Andric     // Remember the s_delay_alu for next time if there is still room in it to
32781ad6265SDimitry Andric     // encode another delay.
32881ad6265SDimitry Andric     return (Imm & 0x780) ? nullptr : DelayAlu;
32981ad6265SDimitry Andric   }
33081ad6265SDimitry Andric 
runOnMachineBasicBlock(MachineBasicBlock & MBB,bool Emit)33181ad6265SDimitry Andric   bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
33281ad6265SDimitry Andric     DelayState State;
33381ad6265SDimitry Andric     for (auto *Pred : MBB.predecessors())
33481ad6265SDimitry Andric       State.merge(BlockState[Pred]);
33581ad6265SDimitry Andric 
33681ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
33781ad6265SDimitry Andric                       << "\n";
33881ad6265SDimitry Andric                State.dump(TRI););
33981ad6265SDimitry Andric 
34081ad6265SDimitry Andric     bool Changed = false;
34181ad6265SDimitry Andric     MachineInstr *LastDelayAlu = nullptr;
34281ad6265SDimitry Andric 
34381ad6265SDimitry Andric     // Iterate over the contents of bundles, but don't emit any instructions
34481ad6265SDimitry Andric     // inside a bundle.
34581ad6265SDimitry Andric     for (auto &MI : MBB.instrs()) {
34681ad6265SDimitry Andric       if (MI.isBundle() || MI.isMetaInstruction())
34781ad6265SDimitry Andric         continue;
34881ad6265SDimitry Andric 
34981ad6265SDimitry Andric       // Ignore some more instructions that do not generate any code.
35081ad6265SDimitry Andric       switch (MI.getOpcode()) {
35181ad6265SDimitry Andric       case AMDGPU::SI_RETURN_TO_EPILOG:
35281ad6265SDimitry Andric         continue;
35381ad6265SDimitry Andric       }
35481ad6265SDimitry Andric 
35581ad6265SDimitry Andric       DelayType Type = getDelayType(MI.getDesc().TSFlags);
35681ad6265SDimitry Andric 
35781ad6265SDimitry Andric       if (instructionWaitsForVALU(MI)) {
35881ad6265SDimitry Andric         // Forget about all outstanding VALU delays.
359*06c3fb27SDimitry Andric         // TODO: This is overkill since it also forgets about SALU delays.
36081ad6265SDimitry Andric         State = DelayState();
36181ad6265SDimitry Andric       } else if (Type != OTHER) {
36281ad6265SDimitry Andric         DelayInfo Delay;
36381ad6265SDimitry Andric         // TODO: Scan implicit uses too?
36481ad6265SDimitry Andric         for (const auto &Op : MI.explicit_uses()) {
36581ad6265SDimitry Andric           if (Op.isReg()) {
36681ad6265SDimitry Andric             // One of the operands of the writelane is also the output operand.
36781ad6265SDimitry Andric             // This creates the insertion of redundant delays. Hence, we have to
36881ad6265SDimitry Andric             // ignore this operand.
36981ad6265SDimitry Andric             if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
37081ad6265SDimitry Andric               continue;
371*06c3fb27SDimitry Andric             for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
372*06c3fb27SDimitry Andric               auto It = State.find(Unit);
37381ad6265SDimitry Andric               if (It != State.end()) {
37481ad6265SDimitry Andric                 Delay.merge(It->second);
375*06c3fb27SDimitry Andric                 State.erase(Unit);
37681ad6265SDimitry Andric               }
37781ad6265SDimitry Andric             }
37881ad6265SDimitry Andric           }
37981ad6265SDimitry Andric         }
38081ad6265SDimitry Andric         if (Emit && !MI.isBundledWithPred()) {
38181ad6265SDimitry Andric           // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
38281ad6265SDimitry Andric           // just ignore them?
38381ad6265SDimitry Andric           LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
38481ad6265SDimitry Andric         }
38581ad6265SDimitry Andric       }
38681ad6265SDimitry Andric 
38781ad6265SDimitry Andric       if (Type != OTHER) {
38881ad6265SDimitry Andric         // TODO: Scan implicit defs too?
38981ad6265SDimitry Andric         for (const auto &Op : MI.defs()) {
39081ad6265SDimitry Andric           unsigned Latency = SchedModel.computeOperandLatency(
391*06c3fb27SDimitry Andric               &MI, Op.getOperandNo(), nullptr, 0);
392*06c3fb27SDimitry Andric           for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
393*06c3fb27SDimitry Andric             State[Unit] = DelayInfo(Type, Latency);
39481ad6265SDimitry Andric         }
39581ad6265SDimitry Andric       }
39681ad6265SDimitry Andric 
39781ad6265SDimitry Andric       // Advance by the number of cycles it takes to issue this instruction.
39881ad6265SDimitry Andric       // TODO: Use a more advanced model that accounts for instructions that
39981ad6265SDimitry Andric       // take multiple cycles to issue on a particular pipeline.
40081ad6265SDimitry Andric       unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
40181ad6265SDimitry Andric       // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
40281ad6265SDimitry Andric       // instructions on the assumption that they will usually have to be issued
40381ad6265SDimitry Andric       // twice?
40481ad6265SDimitry Andric       State.advance(Type, Cycles);
40581ad6265SDimitry Andric 
40681ad6265SDimitry Andric       LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
40781ad6265SDimitry Andric     }
40881ad6265SDimitry Andric 
40981ad6265SDimitry Andric     if (Emit) {
41081ad6265SDimitry Andric       assert(State == BlockState[&MBB] &&
41181ad6265SDimitry Andric              "Basic block state should not have changed on final pass!");
41281ad6265SDimitry Andric     } else if (State != BlockState[&MBB]) {
41381ad6265SDimitry Andric       BlockState[&MBB] = std::move(State);
41481ad6265SDimitry Andric       Changed = true;
41581ad6265SDimitry Andric     }
41681ad6265SDimitry Andric     return Changed;
41781ad6265SDimitry Andric   }
41881ad6265SDimitry Andric 
runOnMachineFunction(MachineFunction & MF)41981ad6265SDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override {
42081ad6265SDimitry Andric     if (skipFunction(MF.getFunction()))
42181ad6265SDimitry Andric       return false;
42281ad6265SDimitry Andric 
42381ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
42481ad6265SDimitry Andric                       << "\n");
42581ad6265SDimitry Andric 
42681ad6265SDimitry Andric     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
42781ad6265SDimitry Andric     if (!ST.hasDelayAlu())
42881ad6265SDimitry Andric       return false;
42981ad6265SDimitry Andric 
43081ad6265SDimitry Andric     SII = ST.getInstrInfo();
43181ad6265SDimitry Andric     TRI = ST.getRegisterInfo();
43281ad6265SDimitry Andric 
43381ad6265SDimitry Andric     SchedModel.init(&ST);
43481ad6265SDimitry Andric 
43581ad6265SDimitry Andric     // Calculate the delay state for each basic block, iterating until we reach
43681ad6265SDimitry Andric     // a fixed point.
43781ad6265SDimitry Andric     SetVector<MachineBasicBlock *> WorkList;
43881ad6265SDimitry Andric     for (auto &MBB : reverse(MF))
43981ad6265SDimitry Andric       WorkList.insert(&MBB);
44081ad6265SDimitry Andric     while (!WorkList.empty()) {
44181ad6265SDimitry Andric       auto &MBB = *WorkList.pop_back_val();
44281ad6265SDimitry Andric       bool Changed = runOnMachineBasicBlock(MBB, false);
44381ad6265SDimitry Andric       if (Changed)
44481ad6265SDimitry Andric         WorkList.insert(MBB.succ_begin(), MBB.succ_end());
44581ad6265SDimitry Andric     }
44681ad6265SDimitry Andric 
44781ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
44881ad6265SDimitry Andric 
44981ad6265SDimitry Andric     // Make one last pass over all basic blocks to emit s_delay_alu
45081ad6265SDimitry Andric     // instructions.
45181ad6265SDimitry Andric     bool Changed = false;
45281ad6265SDimitry Andric     for (auto &MBB : MF)
45381ad6265SDimitry Andric       Changed |= runOnMachineBasicBlock(MBB, true);
45481ad6265SDimitry Andric     return Changed;
45581ad6265SDimitry Andric   }
45681ad6265SDimitry Andric };
45781ad6265SDimitry Andric 
45881ad6265SDimitry Andric } // namespace
45981ad6265SDimitry Andric 
46081ad6265SDimitry Andric char AMDGPUInsertDelayAlu::ID = 0;
46181ad6265SDimitry Andric 
46281ad6265SDimitry Andric char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
46381ad6265SDimitry Andric 
46481ad6265SDimitry Andric INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
46581ad6265SDimitry Andric                 false, false)
466