xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//
13*81ad6265SDimitry Andric 
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include <algorithm>
19*81ad6265SDimitry Andric 
20*81ad6265SDimitry Andric using namespace llvm;
21*81ad6265SDimitry Andric 
22*81ad6265SDimitry Andric #define DEBUG_TYPE "amdgpu-insert-delay-alu"
23*81ad6265SDimitry Andric 
24*81ad6265SDimitry Andric namespace {
25*81ad6265SDimitry Andric 
26*81ad6265SDimitry Andric class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27*81ad6265SDimitry Andric public:
28*81ad6265SDimitry Andric   static char ID;
29*81ad6265SDimitry Andric 
30*81ad6265SDimitry Andric   const SIInstrInfo *SII;
31*81ad6265SDimitry Andric   const TargetRegisterInfo *TRI;
32*81ad6265SDimitry Andric 
33*81ad6265SDimitry Andric   TargetSchedModel SchedModel;
34*81ad6265SDimitry Andric 
35*81ad6265SDimitry Andric   AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
36*81ad6265SDimitry Andric 
37*81ad6265SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
38*81ad6265SDimitry Andric     AU.setPreservesCFG();
39*81ad6265SDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
40*81ad6265SDimitry Andric   }
41*81ad6265SDimitry Andric 
42*81ad6265SDimitry Andric   // Return true if MI waits for all outstanding VALU instructions to complete.
43*81ad6265SDimitry Andric   static bool instructionWaitsForVALU(const MachineInstr &MI) {
44*81ad6265SDimitry Andric     // These instruction types wait for VA_VDST==0 before issuing.
45*81ad6265SDimitry Andric     const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
46*81ad6265SDimitry Andric                                SIInstrFlags::FLAT | SIInstrFlags::MIMG |
47*81ad6265SDimitry Andric                                SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
48*81ad6265SDimitry Andric     if (MI.getDesc().TSFlags & VA_VDST_0)
49*81ad6265SDimitry Andric       return true;
50*81ad6265SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51*81ad6265SDimitry Andric         MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52*81ad6265SDimitry Andric       return true;
53*81ad6265SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54*81ad6265SDimitry Andric         (MI.getOperand(0).getImm() & 0xf000) == 0)
55*81ad6265SDimitry Andric       return true;
56*81ad6265SDimitry Andric     return false;
57*81ad6265SDimitry Andric   }
58*81ad6265SDimitry Andric 
59*81ad6265SDimitry Andric   // Types of delay that can be encoded in an s_delay_alu instruction.
60*81ad6265SDimitry Andric   enum DelayType { VALU, TRANS, SALU, OTHER };
61*81ad6265SDimitry Andric 
62*81ad6265SDimitry Andric   // Get the delay type for an instruction with the specified TSFlags.
63*81ad6265SDimitry Andric   static DelayType getDelayType(uint64_t TSFlags) {
64*81ad6265SDimitry Andric     if (TSFlags & SIInstrFlags::TRANS)
65*81ad6265SDimitry Andric       return TRANS;
66*81ad6265SDimitry Andric     if (TSFlags & SIInstrFlags::VALU)
67*81ad6265SDimitry Andric       return VALU;
68*81ad6265SDimitry Andric     if (TSFlags & SIInstrFlags::SALU)
69*81ad6265SDimitry Andric       return SALU;
70*81ad6265SDimitry Andric     return OTHER;
71*81ad6265SDimitry Andric   }
72*81ad6265SDimitry Andric 
73*81ad6265SDimitry Andric   // Information about the last instruction(s) that wrote to a particular
74*81ad6265SDimitry Andric   // regunit. In straight-line code there will only be one such instruction, but
75*81ad6265SDimitry Andric   // when control flow converges we merge the delay information from each path
76*81ad6265SDimitry Andric   // to represent the union of the worst-case delays of each type.
77*81ad6265SDimitry Andric   struct DelayInfo {
78*81ad6265SDimitry Andric     // One larger than the maximum number of (non-TRANS) VALU instructions we
79*81ad6265SDimitry Andric     // can encode in an s_delay_alu instruction.
80*81ad6265SDimitry Andric     static const unsigned VALU_MAX = 5;
81*81ad6265SDimitry Andric 
82*81ad6265SDimitry Andric     // One larger than the maximum number of TRANS instructions we can encode in
83*81ad6265SDimitry Andric     // an s_delay_alu instruction.
84*81ad6265SDimitry Andric     static const unsigned TRANS_MAX = 4;
85*81ad6265SDimitry Andric 
86*81ad6265SDimitry Andric     // If it was written by a (non-TRANS) VALU, remember how many clock cycles
87*81ad6265SDimitry Andric     // are left until it completes, and how many other (non-TRANS) VALU we have
88*81ad6265SDimitry Andric     // seen since it was issued.
89*81ad6265SDimitry Andric     uint8_t VALUCycles = 0;
90*81ad6265SDimitry Andric     uint8_t VALUNum = VALU_MAX;
91*81ad6265SDimitry Andric 
92*81ad6265SDimitry Andric     // If it was written by a TRANS, remember how many clock cycles are left
93*81ad6265SDimitry Andric     // until it completes, and how many other TRANS we have seen since it was
94*81ad6265SDimitry Andric     // issued.
95*81ad6265SDimitry Andric     uint8_t TRANSCycles = 0;
96*81ad6265SDimitry Andric     uint8_t TRANSNum = TRANS_MAX;
97*81ad6265SDimitry Andric     // Also remember how many other (non-TRANS) VALU we have seen since it was
98*81ad6265SDimitry Andric     // issued. When an instruction depends on both a prior TRANS and a prior
99*81ad6265SDimitry Andric     // non-TRANS VALU, this is used to decide whether to encode a wait for just
100*81ad6265SDimitry Andric     // one or both of them.
101*81ad6265SDimitry Andric     uint8_t TRANSNumVALU = VALU_MAX;
102*81ad6265SDimitry Andric 
103*81ad6265SDimitry Andric     // If it was written by an SALU, remember how many clock cycles are left
104*81ad6265SDimitry Andric     // until it completes.
105*81ad6265SDimitry Andric     uint8_t SALUCycles = 0;
106*81ad6265SDimitry Andric 
107*81ad6265SDimitry Andric     DelayInfo() = default;
108*81ad6265SDimitry Andric 
109*81ad6265SDimitry Andric     DelayInfo(DelayType Type, unsigned Cycles) {
110*81ad6265SDimitry Andric       switch (Type) {
111*81ad6265SDimitry Andric       default:
112*81ad6265SDimitry Andric         llvm_unreachable("unexpected type");
113*81ad6265SDimitry Andric       case VALU:
114*81ad6265SDimitry Andric         VALUCycles = Cycles;
115*81ad6265SDimitry Andric         VALUNum = 0;
116*81ad6265SDimitry Andric         break;
117*81ad6265SDimitry Andric       case TRANS:
118*81ad6265SDimitry Andric         TRANSCycles = Cycles;
119*81ad6265SDimitry Andric         TRANSNum = 0;
120*81ad6265SDimitry Andric         TRANSNumVALU = 0;
121*81ad6265SDimitry Andric         break;
122*81ad6265SDimitry Andric       case SALU:
123*81ad6265SDimitry Andric         SALUCycles = Cycles;
124*81ad6265SDimitry Andric         break;
125*81ad6265SDimitry Andric       }
126*81ad6265SDimitry Andric     }
127*81ad6265SDimitry Andric 
128*81ad6265SDimitry Andric     bool operator==(const DelayInfo &RHS) const {
129*81ad6265SDimitry Andric       return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
130*81ad6265SDimitry Andric              TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
131*81ad6265SDimitry Andric              TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
132*81ad6265SDimitry Andric     }
133*81ad6265SDimitry Andric 
134*81ad6265SDimitry Andric     bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
135*81ad6265SDimitry Andric 
136*81ad6265SDimitry Andric     // Merge another DelayInfo into this one, to represent the union of the
137*81ad6265SDimitry Andric     // worst-case delays of each type.
138*81ad6265SDimitry Andric     void merge(const DelayInfo &RHS) {
139*81ad6265SDimitry Andric       VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
140*81ad6265SDimitry Andric       VALUNum = std::min(VALUNum, RHS.VALUNum);
141*81ad6265SDimitry Andric       TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
142*81ad6265SDimitry Andric       TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
143*81ad6265SDimitry Andric       TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
144*81ad6265SDimitry Andric       SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
145*81ad6265SDimitry Andric     }
146*81ad6265SDimitry Andric 
147*81ad6265SDimitry Andric     // Update this DelayInfo after issuing an instruction. IsVALU should be 1
148*81ad6265SDimitry Andric     // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
149*81ad6265SDimitry Andric     // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
150*81ad6265SDimitry Andric     // instruction.  Return true if there is no longer any useful delay info.
151*81ad6265SDimitry Andric     bool advance(DelayType Type, unsigned Cycles) {
152*81ad6265SDimitry Andric       bool Erase = true;
153*81ad6265SDimitry Andric 
154*81ad6265SDimitry Andric       VALUNum += (Type == VALU);
155*81ad6265SDimitry Andric       if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
156*81ad6265SDimitry Andric         // Forget about the VALU instruction. It was too far back or has
157*81ad6265SDimitry Andric         // definitely completed by now.
158*81ad6265SDimitry Andric         VALUNum = VALU_MAX;
159*81ad6265SDimitry Andric         VALUCycles = 0;
160*81ad6265SDimitry Andric       } else {
161*81ad6265SDimitry Andric         VALUCycles -= Cycles;
162*81ad6265SDimitry Andric         Erase = false;
163*81ad6265SDimitry Andric       }
164*81ad6265SDimitry Andric 
165*81ad6265SDimitry Andric       TRANSNum += (Type == TRANS);
166*81ad6265SDimitry Andric       TRANSNumVALU += (Type == VALU);
167*81ad6265SDimitry Andric       if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
168*81ad6265SDimitry Andric         // Forget about any TRANS instruction. It was too far back or has
169*81ad6265SDimitry Andric         // definitely completed by now.
170*81ad6265SDimitry Andric         TRANSNum = TRANS_MAX;
171*81ad6265SDimitry Andric         TRANSNumVALU = VALU_MAX;
172*81ad6265SDimitry Andric         TRANSCycles = 0;
173*81ad6265SDimitry Andric       } else {
174*81ad6265SDimitry Andric         TRANSCycles -= Cycles;
175*81ad6265SDimitry Andric         Erase = false;
176*81ad6265SDimitry Andric       }
177*81ad6265SDimitry Andric 
178*81ad6265SDimitry Andric       if (SALUCycles <= Cycles) {
179*81ad6265SDimitry Andric         // Forget about any SALU instruction. It has definitely completed by
180*81ad6265SDimitry Andric         // now.
181*81ad6265SDimitry Andric         SALUCycles = 0;
182*81ad6265SDimitry Andric       } else {
183*81ad6265SDimitry Andric         SALUCycles -= Cycles;
184*81ad6265SDimitry Andric         Erase = false;
185*81ad6265SDimitry Andric       }
186*81ad6265SDimitry Andric 
187*81ad6265SDimitry Andric       return Erase;
188*81ad6265SDimitry Andric     }
189*81ad6265SDimitry Andric 
190*81ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
191*81ad6265SDimitry Andric     void dump() const {
192*81ad6265SDimitry Andric       if (VALUCycles)
193*81ad6265SDimitry Andric         dbgs() << " VALUCycles=" << (int)VALUCycles;
194*81ad6265SDimitry Andric       if (VALUNum < VALU_MAX)
195*81ad6265SDimitry Andric         dbgs() << " VALUNum=" << (int)VALUNum;
196*81ad6265SDimitry Andric       if (TRANSCycles)
197*81ad6265SDimitry Andric         dbgs() << " TRANSCycles=" << (int)TRANSCycles;
198*81ad6265SDimitry Andric       if (TRANSNum < TRANS_MAX)
199*81ad6265SDimitry Andric         dbgs() << " TRANSNum=" << (int)TRANSNum;
200*81ad6265SDimitry Andric       if (TRANSNumVALU < VALU_MAX)
201*81ad6265SDimitry Andric         dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
202*81ad6265SDimitry Andric       if (SALUCycles)
203*81ad6265SDimitry Andric         dbgs() << " SALUCycles=" << (int)SALUCycles;
204*81ad6265SDimitry Andric     }
205*81ad6265SDimitry Andric #endif
206*81ad6265SDimitry Andric   };
207*81ad6265SDimitry Andric 
208*81ad6265SDimitry Andric   // A map from regunits to the delay info for that regunit.
209*81ad6265SDimitry Andric   struct DelayState : DenseMap<unsigned, DelayInfo> {
210*81ad6265SDimitry Andric     // Merge another DelayState into this one by merging the delay info for each
211*81ad6265SDimitry Andric     // regunit.
212*81ad6265SDimitry Andric     void merge(const DelayState &RHS) {
213*81ad6265SDimitry Andric       for (const auto &KV : RHS) {
214*81ad6265SDimitry Andric         iterator It;
215*81ad6265SDimitry Andric         bool Inserted;
216*81ad6265SDimitry Andric         std::tie(It, Inserted) = insert(KV);
217*81ad6265SDimitry Andric         if (!Inserted)
218*81ad6265SDimitry Andric           It->second.merge(KV.second);
219*81ad6265SDimitry Andric       }
220*81ad6265SDimitry Andric     }
221*81ad6265SDimitry Andric 
222*81ad6265SDimitry Andric     // Advance the delay info for each regunit, erasing any that are no longer
223*81ad6265SDimitry Andric     // useful.
224*81ad6265SDimitry Andric     void advance(DelayType Type, unsigned Cycles) {
225*81ad6265SDimitry Andric       iterator Next;
226*81ad6265SDimitry Andric       for (auto I = begin(), E = end(); I != E; I = Next) {
227*81ad6265SDimitry Andric         Next = std::next(I);
228*81ad6265SDimitry Andric         if (I->second.advance(Type, Cycles))
229*81ad6265SDimitry Andric           erase(I);
230*81ad6265SDimitry Andric       }
231*81ad6265SDimitry Andric     }
232*81ad6265SDimitry Andric 
233*81ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
234*81ad6265SDimitry Andric     void dump(const TargetRegisterInfo *TRI) const {
235*81ad6265SDimitry Andric       if (empty()) {
236*81ad6265SDimitry Andric         dbgs() << "    empty\n";
237*81ad6265SDimitry Andric         return;
238*81ad6265SDimitry Andric       }
239*81ad6265SDimitry Andric 
240*81ad6265SDimitry Andric       // Dump DelayInfo for each RegUnit in numerical order.
241*81ad6265SDimitry Andric       SmallVector<const_iterator, 8> Order;
242*81ad6265SDimitry Andric       Order.reserve(size());
243*81ad6265SDimitry Andric       for (const_iterator I = begin(), E = end(); I != E; ++I)
244*81ad6265SDimitry Andric         Order.push_back(I);
245*81ad6265SDimitry Andric       llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
246*81ad6265SDimitry Andric         return A->first < B->first;
247*81ad6265SDimitry Andric       });
248*81ad6265SDimitry Andric       for (const_iterator I : Order) {
249*81ad6265SDimitry Andric         dbgs() << "    " << printRegUnit(I->first, TRI);
250*81ad6265SDimitry Andric         I->second.dump();
251*81ad6265SDimitry Andric         dbgs() << "\n";
252*81ad6265SDimitry Andric       }
253*81ad6265SDimitry Andric     }
254*81ad6265SDimitry Andric #endif
255*81ad6265SDimitry Andric   };
256*81ad6265SDimitry Andric 
257*81ad6265SDimitry Andric   // The saved delay state at the end of each basic block.
258*81ad6265SDimitry Andric   DenseMap<MachineBasicBlock *, DelayState> BlockState;
259*81ad6265SDimitry Andric 
260*81ad6265SDimitry Andric   // Emit an s_delay_alu instruction if necessary before MI.
261*81ad6265SDimitry Andric   MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
262*81ad6265SDimitry Andric                              MachineInstr *LastDelayAlu) {
263*81ad6265SDimitry Andric     unsigned Imm = 0;
264*81ad6265SDimitry Andric 
265*81ad6265SDimitry Andric     // Wait for a TRANS instruction.
266*81ad6265SDimitry Andric     if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
267*81ad6265SDimitry Andric       Imm |= 4 + Delay.TRANSNum;
268*81ad6265SDimitry Andric 
269*81ad6265SDimitry Andric     // Wait for a VALU instruction (if it's more recent than any TRANS
270*81ad6265SDimitry Andric     // instruction that we're also waiting for).
271*81ad6265SDimitry Andric     if (Delay.VALUNum < DelayInfo::VALU_MAX &&
272*81ad6265SDimitry Andric         Delay.VALUNum <= Delay.TRANSNumVALU) {
273*81ad6265SDimitry Andric       if (Imm & 0xf)
274*81ad6265SDimitry Andric         Imm |= Delay.VALUNum << 7;
275*81ad6265SDimitry Andric       else
276*81ad6265SDimitry Andric         Imm |= Delay.VALUNum;
277*81ad6265SDimitry Andric     }
278*81ad6265SDimitry Andric 
279*81ad6265SDimitry Andric     // Wait for an SALU instruction.
280*81ad6265SDimitry Andric     if (Delay.SALUCycles) {
281*81ad6265SDimitry Andric       if (Imm & 0x780) {
282*81ad6265SDimitry Andric         // We have already encoded a VALU and a TRANS delay. There's no room in
283*81ad6265SDimitry Andric         // the encoding for an SALU delay as well, so just drop it.
284*81ad6265SDimitry Andric       } else if (Imm & 0xf) {
285*81ad6265SDimitry Andric         Imm |= (Delay.SALUCycles + 8) << 7;
286*81ad6265SDimitry Andric       } else {
287*81ad6265SDimitry Andric         Imm |= Delay.SALUCycles + 8;
288*81ad6265SDimitry Andric       }
289*81ad6265SDimitry Andric     }
290*81ad6265SDimitry Andric 
291*81ad6265SDimitry Andric     // Don't emit the s_delay_alu instruction if there's nothing to wait for.
292*81ad6265SDimitry Andric     if (!Imm)
293*81ad6265SDimitry Andric       return LastDelayAlu;
294*81ad6265SDimitry Andric 
295*81ad6265SDimitry Andric     // If we only need to wait for one instruction, try encoding it in the last
296*81ad6265SDimitry Andric     // s_delay_alu that we emitted.
297*81ad6265SDimitry Andric     if (!(Imm & 0x780) && LastDelayAlu) {
298*81ad6265SDimitry Andric       unsigned Skip = 0;
299*81ad6265SDimitry Andric       for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
300*81ad6265SDimitry Andric                 E = MachineBasicBlock::instr_iterator(MI);
301*81ad6265SDimitry Andric            ++I != E;) {
302*81ad6265SDimitry Andric         if (!I->isBundle() && !I->isMetaInstruction())
303*81ad6265SDimitry Andric           ++Skip;
304*81ad6265SDimitry Andric       }
305*81ad6265SDimitry Andric       if (Skip < 6) {
306*81ad6265SDimitry Andric         MachineOperand &Op = LastDelayAlu->getOperand(0);
307*81ad6265SDimitry Andric         unsigned LastImm = Op.getImm();
308*81ad6265SDimitry Andric         assert((LastImm & ~0xf) == 0 &&
309*81ad6265SDimitry Andric                "Remembered an s_delay_alu with no room for another delay!");
310*81ad6265SDimitry Andric         LastImm |= Imm << 7 | Skip << 4;
311*81ad6265SDimitry Andric         Op.setImm(LastImm);
312*81ad6265SDimitry Andric         return nullptr;
313*81ad6265SDimitry Andric       }
314*81ad6265SDimitry Andric     }
315*81ad6265SDimitry Andric 
316*81ad6265SDimitry Andric     auto &MBB = *MI.getParent();
317*81ad6265SDimitry Andric     MachineInstr *DelayAlu =
318*81ad6265SDimitry Andric         BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
319*81ad6265SDimitry Andric     // Remember the s_delay_alu for next time if there is still room in it to
320*81ad6265SDimitry Andric     // encode another delay.
321*81ad6265SDimitry Andric     return (Imm & 0x780) ? nullptr : DelayAlu;
322*81ad6265SDimitry Andric   }
323*81ad6265SDimitry Andric 
324*81ad6265SDimitry Andric   bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
325*81ad6265SDimitry Andric     DelayState State;
326*81ad6265SDimitry Andric     for (auto *Pred : MBB.predecessors())
327*81ad6265SDimitry Andric       State.merge(BlockState[Pred]);
328*81ad6265SDimitry Andric 
329*81ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
330*81ad6265SDimitry Andric                       << "\n";
331*81ad6265SDimitry Andric                State.dump(TRI););
332*81ad6265SDimitry Andric 
333*81ad6265SDimitry Andric     bool Changed = false;
334*81ad6265SDimitry Andric     MachineInstr *LastDelayAlu = nullptr;
335*81ad6265SDimitry Andric 
336*81ad6265SDimitry Andric     // Iterate over the contents of bundles, but don't emit any instructions
337*81ad6265SDimitry Andric     // inside a bundle.
338*81ad6265SDimitry Andric     for (auto &MI : MBB.instrs()) {
339*81ad6265SDimitry Andric       if (MI.isBundle() || MI.isMetaInstruction())
340*81ad6265SDimitry Andric         continue;
341*81ad6265SDimitry Andric 
342*81ad6265SDimitry Andric       // Ignore some more instructions that do not generate any code.
343*81ad6265SDimitry Andric       switch (MI.getOpcode()) {
344*81ad6265SDimitry Andric       case AMDGPU::SI_RETURN_TO_EPILOG:
345*81ad6265SDimitry Andric         continue;
346*81ad6265SDimitry Andric       }
347*81ad6265SDimitry Andric 
348*81ad6265SDimitry Andric       DelayType Type = getDelayType(MI.getDesc().TSFlags);
349*81ad6265SDimitry Andric 
350*81ad6265SDimitry Andric       if (instructionWaitsForVALU(MI)) {
351*81ad6265SDimitry Andric         // Forget about all outstanding VALU delays.
352*81ad6265SDimitry Andric         State = DelayState();
353*81ad6265SDimitry Andric       } else if (Type != OTHER) {
354*81ad6265SDimitry Andric         DelayInfo Delay;
355*81ad6265SDimitry Andric         // TODO: Scan implicit uses too?
356*81ad6265SDimitry Andric         for (const auto &Op : MI.explicit_uses()) {
357*81ad6265SDimitry Andric           if (Op.isReg()) {
358*81ad6265SDimitry Andric             // One of the operands of the writelane is also the output operand.
359*81ad6265SDimitry Andric             // This creates the insertion of redundant delays. Hence, we have to
360*81ad6265SDimitry Andric             // ignore this operand.
361*81ad6265SDimitry Andric             if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
362*81ad6265SDimitry Andric               continue;
363*81ad6265SDimitry Andric             for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
364*81ad6265SDimitry Andric               auto It = State.find(*UI);
365*81ad6265SDimitry Andric               if (It != State.end()) {
366*81ad6265SDimitry Andric                 Delay.merge(It->second);
367*81ad6265SDimitry Andric                 State.erase(*UI);
368*81ad6265SDimitry Andric               }
369*81ad6265SDimitry Andric             }
370*81ad6265SDimitry Andric           }
371*81ad6265SDimitry Andric         }
372*81ad6265SDimitry Andric         if (Emit && !MI.isBundledWithPred()) {
373*81ad6265SDimitry Andric           // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
374*81ad6265SDimitry Andric           // just ignore them?
375*81ad6265SDimitry Andric           LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
376*81ad6265SDimitry Andric         }
377*81ad6265SDimitry Andric       }
378*81ad6265SDimitry Andric 
379*81ad6265SDimitry Andric       if (Type != OTHER) {
380*81ad6265SDimitry Andric         // TODO: Scan implicit defs too?
381*81ad6265SDimitry Andric         for (const auto &Op : MI.defs()) {
382*81ad6265SDimitry Andric           unsigned Latency = SchedModel.computeOperandLatency(
383*81ad6265SDimitry Andric               &MI, MI.getOperandNo(&Op), nullptr, 0);
384*81ad6265SDimitry Andric           for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
385*81ad6265SDimitry Andric             State[*UI] = DelayInfo(Type, Latency);
386*81ad6265SDimitry Andric         }
387*81ad6265SDimitry Andric       }
388*81ad6265SDimitry Andric 
389*81ad6265SDimitry Andric       // Advance by the number of cycles it takes to issue this instruction.
390*81ad6265SDimitry Andric       // TODO: Use a more advanced model that accounts for instructions that
391*81ad6265SDimitry Andric       // take multiple cycles to issue on a particular pipeline.
392*81ad6265SDimitry Andric       unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
393*81ad6265SDimitry Andric       // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
394*81ad6265SDimitry Andric       // instructions on the assumption that they will usually have to be issued
395*81ad6265SDimitry Andric       // twice?
396*81ad6265SDimitry Andric       State.advance(Type, Cycles);
397*81ad6265SDimitry Andric 
398*81ad6265SDimitry Andric       LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
399*81ad6265SDimitry Andric     }
400*81ad6265SDimitry Andric 
401*81ad6265SDimitry Andric     if (Emit) {
402*81ad6265SDimitry Andric       assert(State == BlockState[&MBB] &&
403*81ad6265SDimitry Andric              "Basic block state should not have changed on final pass!");
404*81ad6265SDimitry Andric     } else if (State != BlockState[&MBB]) {
405*81ad6265SDimitry Andric       BlockState[&MBB] = std::move(State);
406*81ad6265SDimitry Andric       Changed = true;
407*81ad6265SDimitry Andric     }
408*81ad6265SDimitry Andric     return Changed;
409*81ad6265SDimitry Andric   }
410*81ad6265SDimitry Andric 
411*81ad6265SDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override {
412*81ad6265SDimitry Andric     if (skipFunction(MF.getFunction()))
413*81ad6265SDimitry Andric       return false;
414*81ad6265SDimitry Andric 
415*81ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
416*81ad6265SDimitry Andric                       << "\n");
417*81ad6265SDimitry Andric 
418*81ad6265SDimitry Andric     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
419*81ad6265SDimitry Andric     if (!ST.hasDelayAlu())
420*81ad6265SDimitry Andric       return false;
421*81ad6265SDimitry Andric 
422*81ad6265SDimitry Andric     SII = ST.getInstrInfo();
423*81ad6265SDimitry Andric     TRI = ST.getRegisterInfo();
424*81ad6265SDimitry Andric 
425*81ad6265SDimitry Andric     SchedModel.init(&ST);
426*81ad6265SDimitry Andric 
427*81ad6265SDimitry Andric     // Calculate the delay state for each basic block, iterating until we reach
428*81ad6265SDimitry Andric     // a fixed point.
429*81ad6265SDimitry Andric     SetVector<MachineBasicBlock *> WorkList;
430*81ad6265SDimitry Andric     for (auto &MBB : reverse(MF))
431*81ad6265SDimitry Andric       WorkList.insert(&MBB);
432*81ad6265SDimitry Andric     while (!WorkList.empty()) {
433*81ad6265SDimitry Andric       auto &MBB = *WorkList.pop_back_val();
434*81ad6265SDimitry Andric       bool Changed = runOnMachineBasicBlock(MBB, false);
435*81ad6265SDimitry Andric       if (Changed)
436*81ad6265SDimitry Andric         WorkList.insert(MBB.succ_begin(), MBB.succ_end());
437*81ad6265SDimitry Andric     }
438*81ad6265SDimitry Andric 
439*81ad6265SDimitry Andric     LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
440*81ad6265SDimitry Andric 
441*81ad6265SDimitry Andric     // Make one last pass over all basic blocks to emit s_delay_alu
442*81ad6265SDimitry Andric     // instructions.
443*81ad6265SDimitry Andric     bool Changed = false;
444*81ad6265SDimitry Andric     for (auto &MBB : MF)
445*81ad6265SDimitry Andric       Changed |= runOnMachineBasicBlock(MBB, true);
446*81ad6265SDimitry Andric     return Changed;
447*81ad6265SDimitry Andric   }
448*81ad6265SDimitry Andric };
449*81ad6265SDimitry Andric 
450*81ad6265SDimitry Andric } // namespace
451*81ad6265SDimitry Andric 
452*81ad6265SDimitry Andric char AMDGPUInsertDelayAlu::ID = 0;
453*81ad6265SDimitry Andric 
454*81ad6265SDimitry Andric char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
455*81ad6265SDimitry Andric 
456*81ad6265SDimitry Andric INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
457*81ad6265SDimitry Andric                 false, false)
458