1*81ad6265SDimitry Andric //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// 2*81ad6265SDimitry Andric // 3*81ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*81ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*81ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*81ad6265SDimitry Andric // 7*81ad6265SDimitry Andric //===----------------------------------------------------------------------===// 8*81ad6265SDimitry Andric // 9*81ad6265SDimitry Andric /// \file 10*81ad6265SDimitry Andric /// Insert s_delay_alu instructions to avoid stalls on GFX11+. 11*81ad6265SDimitry Andric // 12*81ad6265SDimitry Andric //===----------------------------------------------------------------------===// 13*81ad6265SDimitry Andric 14*81ad6265SDimitry Andric #include "AMDGPU.h" 15*81ad6265SDimitry Andric #include "GCNSubtarget.h" 16*81ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 17*81ad6265SDimitry Andric #include "SIInstrInfo.h" 18*81ad6265SDimitry Andric #include "llvm/ADT/SetVector.h" 19*81ad6265SDimitry Andric 20*81ad6265SDimitry Andric using namespace llvm; 21*81ad6265SDimitry Andric 22*81ad6265SDimitry Andric #define DEBUG_TYPE "amdgpu-insert-delay-alu" 23*81ad6265SDimitry Andric 24*81ad6265SDimitry Andric namespace { 25*81ad6265SDimitry Andric 26*81ad6265SDimitry Andric class AMDGPUInsertDelayAlu : public MachineFunctionPass { 27*81ad6265SDimitry Andric public: 28*81ad6265SDimitry Andric static char ID; 29*81ad6265SDimitry Andric 30*81ad6265SDimitry Andric const SIInstrInfo *SII; 31*81ad6265SDimitry Andric const TargetRegisterInfo *TRI; 32*81ad6265SDimitry Andric 33*81ad6265SDimitry Andric TargetSchedModel SchedModel; 34*81ad6265SDimitry Andric 35*81ad6265SDimitry Andric AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} 36*81ad6265SDimitry Andric 37*81ad6265SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 38*81ad6265SDimitry Andric AU.setPreservesCFG(); 39*81ad6265SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 40*81ad6265SDimitry Andric } 41*81ad6265SDimitry Andric 42*81ad6265SDimitry Andric // Return true if MI waits for all outstanding VALU instructions to complete. 43*81ad6265SDimitry Andric static bool instructionWaitsForVALU(const MachineInstr &MI) { 44*81ad6265SDimitry Andric // These instruction types wait for VA_VDST==0 before issuing. 45*81ad6265SDimitry Andric const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | 46*81ad6265SDimitry Andric SIInstrFlags::FLAT | SIInstrFlags::MIMG | 47*81ad6265SDimitry Andric SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; 48*81ad6265SDimitry Andric if (MI.getDesc().TSFlags & VA_VDST_0) 49*81ad6265SDimitry Andric return true; 50*81ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || 51*81ad6265SDimitry Andric MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) 52*81ad6265SDimitry Andric return true; 53*81ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 54*81ad6265SDimitry Andric (MI.getOperand(0).getImm() & 0xf000) == 0) 55*81ad6265SDimitry Andric return true; 56*81ad6265SDimitry Andric return false; 57*81ad6265SDimitry Andric } 58*81ad6265SDimitry Andric 59*81ad6265SDimitry Andric // Types of delay that can be encoded in an s_delay_alu instruction. 60*81ad6265SDimitry Andric enum DelayType { VALU, TRANS, SALU, OTHER }; 61*81ad6265SDimitry Andric 62*81ad6265SDimitry Andric // Get the delay type for an instruction with the specified TSFlags. 63*81ad6265SDimitry Andric static DelayType getDelayType(uint64_t TSFlags) { 64*81ad6265SDimitry Andric if (TSFlags & SIInstrFlags::TRANS) 65*81ad6265SDimitry Andric return TRANS; 66*81ad6265SDimitry Andric if (TSFlags & SIInstrFlags::VALU) 67*81ad6265SDimitry Andric return VALU; 68*81ad6265SDimitry Andric if (TSFlags & SIInstrFlags::SALU) 69*81ad6265SDimitry Andric return SALU; 70*81ad6265SDimitry Andric return OTHER; 71*81ad6265SDimitry Andric } 72*81ad6265SDimitry Andric 73*81ad6265SDimitry Andric // Information about the last instruction(s) that wrote to a particular 74*81ad6265SDimitry Andric // regunit. In straight-line code there will only be one such instruction, but 75*81ad6265SDimitry Andric // when control flow converges we merge the delay information from each path 76*81ad6265SDimitry Andric // to represent the union of the worst-case delays of each type. 77*81ad6265SDimitry Andric struct DelayInfo { 78*81ad6265SDimitry Andric // One larger than the maximum number of (non-TRANS) VALU instructions we 79*81ad6265SDimitry Andric // can encode in an s_delay_alu instruction. 80*81ad6265SDimitry Andric static const unsigned VALU_MAX = 5; 81*81ad6265SDimitry Andric 82*81ad6265SDimitry Andric // One larger than the maximum number of TRANS instructions we can encode in 83*81ad6265SDimitry Andric // an s_delay_alu instruction. 84*81ad6265SDimitry Andric static const unsigned TRANS_MAX = 4; 85*81ad6265SDimitry Andric 86*81ad6265SDimitry Andric // If it was written by a (non-TRANS) VALU, remember how many clock cycles 87*81ad6265SDimitry Andric // are left until it completes, and how many other (non-TRANS) VALU we have 88*81ad6265SDimitry Andric // seen since it was issued. 89*81ad6265SDimitry Andric uint8_t VALUCycles = 0; 90*81ad6265SDimitry Andric uint8_t VALUNum = VALU_MAX; 91*81ad6265SDimitry Andric 92*81ad6265SDimitry Andric // If it was written by a TRANS, remember how many clock cycles are left 93*81ad6265SDimitry Andric // until it completes, and how many other TRANS we have seen since it was 94*81ad6265SDimitry Andric // issued. 95*81ad6265SDimitry Andric uint8_t TRANSCycles = 0; 96*81ad6265SDimitry Andric uint8_t TRANSNum = TRANS_MAX; 97*81ad6265SDimitry Andric // Also remember how many other (non-TRANS) VALU we have seen since it was 98*81ad6265SDimitry Andric // issued. When an instruction depends on both a prior TRANS and a prior 99*81ad6265SDimitry Andric // non-TRANS VALU, this is used to decide whether to encode a wait for just 100*81ad6265SDimitry Andric // one or both of them. 101*81ad6265SDimitry Andric uint8_t TRANSNumVALU = VALU_MAX; 102*81ad6265SDimitry Andric 103*81ad6265SDimitry Andric // If it was written by an SALU, remember how many clock cycles are left 104*81ad6265SDimitry Andric // until it completes. 105*81ad6265SDimitry Andric uint8_t SALUCycles = 0; 106*81ad6265SDimitry Andric 107*81ad6265SDimitry Andric DelayInfo() = default; 108*81ad6265SDimitry Andric 109*81ad6265SDimitry Andric DelayInfo(DelayType Type, unsigned Cycles) { 110*81ad6265SDimitry Andric switch (Type) { 111*81ad6265SDimitry Andric default: 112*81ad6265SDimitry Andric llvm_unreachable("unexpected type"); 113*81ad6265SDimitry Andric case VALU: 114*81ad6265SDimitry Andric VALUCycles = Cycles; 115*81ad6265SDimitry Andric VALUNum = 0; 116*81ad6265SDimitry Andric break; 117*81ad6265SDimitry Andric case TRANS: 118*81ad6265SDimitry Andric TRANSCycles = Cycles; 119*81ad6265SDimitry Andric TRANSNum = 0; 120*81ad6265SDimitry Andric TRANSNumVALU = 0; 121*81ad6265SDimitry Andric break; 122*81ad6265SDimitry Andric case SALU: 123*81ad6265SDimitry Andric SALUCycles = Cycles; 124*81ad6265SDimitry Andric break; 125*81ad6265SDimitry Andric } 126*81ad6265SDimitry Andric } 127*81ad6265SDimitry Andric 128*81ad6265SDimitry Andric bool operator==(const DelayInfo &RHS) const { 129*81ad6265SDimitry Andric return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && 130*81ad6265SDimitry Andric TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && 131*81ad6265SDimitry Andric TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; 132*81ad6265SDimitry Andric } 133*81ad6265SDimitry Andric 134*81ad6265SDimitry Andric bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } 135*81ad6265SDimitry Andric 136*81ad6265SDimitry Andric // Merge another DelayInfo into this one, to represent the union of the 137*81ad6265SDimitry Andric // worst-case delays of each type. 138*81ad6265SDimitry Andric void merge(const DelayInfo &RHS) { 139*81ad6265SDimitry Andric VALUCycles = std::max(VALUCycles, RHS.VALUCycles); 140*81ad6265SDimitry Andric VALUNum = std::min(VALUNum, RHS.VALUNum); 141*81ad6265SDimitry Andric TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles); 142*81ad6265SDimitry Andric TRANSNum = std::min(TRANSNum, RHS.TRANSNum); 143*81ad6265SDimitry Andric TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU); 144*81ad6265SDimitry Andric SALUCycles = std::max(SALUCycles, RHS.SALUCycles); 145*81ad6265SDimitry Andric } 146*81ad6265SDimitry Andric 147*81ad6265SDimitry Andric // Update this DelayInfo after issuing an instruction. IsVALU should be 1 148*81ad6265SDimitry Andric // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing 149*81ad6265SDimitry Andric // a TRANS, else 0. Cycles is the number of cycles it takes to issue the 150*81ad6265SDimitry Andric // instruction. Return true if there is no longer any useful delay info. 151*81ad6265SDimitry Andric bool advance(DelayType Type, unsigned Cycles) { 152*81ad6265SDimitry Andric bool Erase = true; 153*81ad6265SDimitry Andric 154*81ad6265SDimitry Andric VALUNum += (Type == VALU); 155*81ad6265SDimitry Andric if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { 156*81ad6265SDimitry Andric // Forget about the VALU instruction. It was too far back or has 157*81ad6265SDimitry Andric // definitely completed by now. 158*81ad6265SDimitry Andric VALUNum = VALU_MAX; 159*81ad6265SDimitry Andric VALUCycles = 0; 160*81ad6265SDimitry Andric } else { 161*81ad6265SDimitry Andric VALUCycles -= Cycles; 162*81ad6265SDimitry Andric Erase = false; 163*81ad6265SDimitry Andric } 164*81ad6265SDimitry Andric 165*81ad6265SDimitry Andric TRANSNum += (Type == TRANS); 166*81ad6265SDimitry Andric TRANSNumVALU += (Type == VALU); 167*81ad6265SDimitry Andric if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { 168*81ad6265SDimitry Andric // Forget about any TRANS instruction. It was too far back or has 169*81ad6265SDimitry Andric // definitely completed by now. 170*81ad6265SDimitry Andric TRANSNum = TRANS_MAX; 171*81ad6265SDimitry Andric TRANSNumVALU = VALU_MAX; 172*81ad6265SDimitry Andric TRANSCycles = 0; 173*81ad6265SDimitry Andric } else { 174*81ad6265SDimitry Andric TRANSCycles -= Cycles; 175*81ad6265SDimitry Andric Erase = false; 176*81ad6265SDimitry Andric } 177*81ad6265SDimitry Andric 178*81ad6265SDimitry Andric if (SALUCycles <= Cycles) { 179*81ad6265SDimitry Andric // Forget about any SALU instruction. It has definitely completed by 180*81ad6265SDimitry Andric // now. 181*81ad6265SDimitry Andric SALUCycles = 0; 182*81ad6265SDimitry Andric } else { 183*81ad6265SDimitry Andric SALUCycles -= Cycles; 184*81ad6265SDimitry Andric Erase = false; 185*81ad6265SDimitry Andric } 186*81ad6265SDimitry Andric 187*81ad6265SDimitry Andric return Erase; 188*81ad6265SDimitry Andric } 189*81ad6265SDimitry Andric 190*81ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 191*81ad6265SDimitry Andric void dump() const { 192*81ad6265SDimitry Andric if (VALUCycles) 193*81ad6265SDimitry Andric dbgs() << " VALUCycles=" << (int)VALUCycles; 194*81ad6265SDimitry Andric if (VALUNum < VALU_MAX) 195*81ad6265SDimitry Andric dbgs() << " VALUNum=" << (int)VALUNum; 196*81ad6265SDimitry Andric if (TRANSCycles) 197*81ad6265SDimitry Andric dbgs() << " TRANSCycles=" << (int)TRANSCycles; 198*81ad6265SDimitry Andric if (TRANSNum < TRANS_MAX) 199*81ad6265SDimitry Andric dbgs() << " TRANSNum=" << (int)TRANSNum; 200*81ad6265SDimitry Andric if (TRANSNumVALU < VALU_MAX) 201*81ad6265SDimitry Andric dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; 202*81ad6265SDimitry Andric if (SALUCycles) 203*81ad6265SDimitry Andric dbgs() << " SALUCycles=" << (int)SALUCycles; 204*81ad6265SDimitry Andric } 205*81ad6265SDimitry Andric #endif 206*81ad6265SDimitry Andric }; 207*81ad6265SDimitry Andric 208*81ad6265SDimitry Andric // A map from regunits to the delay info for that regunit. 209*81ad6265SDimitry Andric struct DelayState : DenseMap<unsigned, DelayInfo> { 210*81ad6265SDimitry Andric // Merge another DelayState into this one by merging the delay info for each 211*81ad6265SDimitry Andric // regunit. 212*81ad6265SDimitry Andric void merge(const DelayState &RHS) { 213*81ad6265SDimitry Andric for (const auto &KV : RHS) { 214*81ad6265SDimitry Andric iterator It; 215*81ad6265SDimitry Andric bool Inserted; 216*81ad6265SDimitry Andric std::tie(It, Inserted) = insert(KV); 217*81ad6265SDimitry Andric if (!Inserted) 218*81ad6265SDimitry Andric It->second.merge(KV.second); 219*81ad6265SDimitry Andric } 220*81ad6265SDimitry Andric } 221*81ad6265SDimitry Andric 222*81ad6265SDimitry Andric // Advance the delay info for each regunit, erasing any that are no longer 223*81ad6265SDimitry Andric // useful. 224*81ad6265SDimitry Andric void advance(DelayType Type, unsigned Cycles) { 225*81ad6265SDimitry Andric iterator Next; 226*81ad6265SDimitry Andric for (auto I = begin(), E = end(); I != E; I = Next) { 227*81ad6265SDimitry Andric Next = std::next(I); 228*81ad6265SDimitry Andric if (I->second.advance(Type, Cycles)) 229*81ad6265SDimitry Andric erase(I); 230*81ad6265SDimitry Andric } 231*81ad6265SDimitry Andric } 232*81ad6265SDimitry Andric 233*81ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 234*81ad6265SDimitry Andric void dump(const TargetRegisterInfo *TRI) const { 235*81ad6265SDimitry Andric if (empty()) { 236*81ad6265SDimitry Andric dbgs() << " empty\n"; 237*81ad6265SDimitry Andric return; 238*81ad6265SDimitry Andric } 239*81ad6265SDimitry Andric 240*81ad6265SDimitry Andric // Dump DelayInfo for each RegUnit in numerical order. 241*81ad6265SDimitry Andric SmallVector<const_iterator, 8> Order; 242*81ad6265SDimitry Andric Order.reserve(size()); 243*81ad6265SDimitry Andric for (const_iterator I = begin(), E = end(); I != E; ++I) 244*81ad6265SDimitry Andric Order.push_back(I); 245*81ad6265SDimitry Andric llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { 246*81ad6265SDimitry Andric return A->first < B->first; 247*81ad6265SDimitry Andric }); 248*81ad6265SDimitry Andric for (const_iterator I : Order) { 249*81ad6265SDimitry Andric dbgs() << " " << printRegUnit(I->first, TRI); 250*81ad6265SDimitry Andric I->second.dump(); 251*81ad6265SDimitry Andric dbgs() << "\n"; 252*81ad6265SDimitry Andric } 253*81ad6265SDimitry Andric } 254*81ad6265SDimitry Andric #endif 255*81ad6265SDimitry Andric }; 256*81ad6265SDimitry Andric 257*81ad6265SDimitry Andric // The saved delay state at the end of each basic block. 258*81ad6265SDimitry Andric DenseMap<MachineBasicBlock *, DelayState> BlockState; 259*81ad6265SDimitry Andric 260*81ad6265SDimitry Andric // Emit an s_delay_alu instruction if necessary before MI. 261*81ad6265SDimitry Andric MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, 262*81ad6265SDimitry Andric MachineInstr *LastDelayAlu) { 263*81ad6265SDimitry Andric unsigned Imm = 0; 264*81ad6265SDimitry Andric 265*81ad6265SDimitry Andric // Wait for a TRANS instruction. 266*81ad6265SDimitry Andric if (Delay.TRANSNum < DelayInfo::TRANS_MAX) 267*81ad6265SDimitry Andric Imm |= 4 + Delay.TRANSNum; 268*81ad6265SDimitry Andric 269*81ad6265SDimitry Andric // Wait for a VALU instruction (if it's more recent than any TRANS 270*81ad6265SDimitry Andric // instruction that we're also waiting for). 271*81ad6265SDimitry Andric if (Delay.VALUNum < DelayInfo::VALU_MAX && 272*81ad6265SDimitry Andric Delay.VALUNum <= Delay.TRANSNumVALU) { 273*81ad6265SDimitry Andric if (Imm & 0xf) 274*81ad6265SDimitry Andric Imm |= Delay.VALUNum << 7; 275*81ad6265SDimitry Andric else 276*81ad6265SDimitry Andric Imm |= Delay.VALUNum; 277*81ad6265SDimitry Andric } 278*81ad6265SDimitry Andric 279*81ad6265SDimitry Andric // Wait for an SALU instruction. 280*81ad6265SDimitry Andric if (Delay.SALUCycles) { 281*81ad6265SDimitry Andric if (Imm & 0x780) { 282*81ad6265SDimitry Andric // We have already encoded a VALU and a TRANS delay. There's no room in 283*81ad6265SDimitry Andric // the encoding for an SALU delay as well, so just drop it. 284*81ad6265SDimitry Andric } else if (Imm & 0xf) { 285*81ad6265SDimitry Andric Imm |= (Delay.SALUCycles + 8) << 7; 286*81ad6265SDimitry Andric } else { 287*81ad6265SDimitry Andric Imm |= Delay.SALUCycles + 8; 288*81ad6265SDimitry Andric } 289*81ad6265SDimitry Andric } 290*81ad6265SDimitry Andric 291*81ad6265SDimitry Andric // Don't emit the s_delay_alu instruction if there's nothing to wait for. 292*81ad6265SDimitry Andric if (!Imm) 293*81ad6265SDimitry Andric return LastDelayAlu; 294*81ad6265SDimitry Andric 295*81ad6265SDimitry Andric // If we only need to wait for one instruction, try encoding it in the last 296*81ad6265SDimitry Andric // s_delay_alu that we emitted. 297*81ad6265SDimitry Andric if (!(Imm & 0x780) && LastDelayAlu) { 298*81ad6265SDimitry Andric unsigned Skip = 0; 299*81ad6265SDimitry Andric for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), 300*81ad6265SDimitry Andric E = MachineBasicBlock::instr_iterator(MI); 301*81ad6265SDimitry Andric ++I != E;) { 302*81ad6265SDimitry Andric if (!I->isBundle() && !I->isMetaInstruction()) 303*81ad6265SDimitry Andric ++Skip; 304*81ad6265SDimitry Andric } 305*81ad6265SDimitry Andric if (Skip < 6) { 306*81ad6265SDimitry Andric MachineOperand &Op = LastDelayAlu->getOperand(0); 307*81ad6265SDimitry Andric unsigned LastImm = Op.getImm(); 308*81ad6265SDimitry Andric assert((LastImm & ~0xf) == 0 && 309*81ad6265SDimitry Andric "Remembered an s_delay_alu with no room for another delay!"); 310*81ad6265SDimitry Andric LastImm |= Imm << 7 | Skip << 4; 311*81ad6265SDimitry Andric Op.setImm(LastImm); 312*81ad6265SDimitry Andric return nullptr; 313*81ad6265SDimitry Andric } 314*81ad6265SDimitry Andric } 315*81ad6265SDimitry Andric 316*81ad6265SDimitry Andric auto &MBB = *MI.getParent(); 317*81ad6265SDimitry Andric MachineInstr *DelayAlu = 318*81ad6265SDimitry Andric BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); 319*81ad6265SDimitry Andric // Remember the s_delay_alu for next time if there is still room in it to 320*81ad6265SDimitry Andric // encode another delay. 321*81ad6265SDimitry Andric return (Imm & 0x780) ? nullptr : DelayAlu; 322*81ad6265SDimitry Andric } 323*81ad6265SDimitry Andric 324*81ad6265SDimitry Andric bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { 325*81ad6265SDimitry Andric DelayState State; 326*81ad6265SDimitry Andric for (auto *Pred : MBB.predecessors()) 327*81ad6265SDimitry Andric State.merge(BlockState[Pred]); 328*81ad6265SDimitry Andric 329*81ad6265SDimitry Andric LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) 330*81ad6265SDimitry Andric << "\n"; 331*81ad6265SDimitry Andric State.dump(TRI);); 332*81ad6265SDimitry Andric 333*81ad6265SDimitry Andric bool Changed = false; 334*81ad6265SDimitry Andric MachineInstr *LastDelayAlu = nullptr; 335*81ad6265SDimitry Andric 336*81ad6265SDimitry Andric // Iterate over the contents of bundles, but don't emit any instructions 337*81ad6265SDimitry Andric // inside a bundle. 338*81ad6265SDimitry Andric for (auto &MI : MBB.instrs()) { 339*81ad6265SDimitry Andric if (MI.isBundle() || MI.isMetaInstruction()) 340*81ad6265SDimitry Andric continue; 341*81ad6265SDimitry Andric 342*81ad6265SDimitry Andric // Ignore some more instructions that do not generate any code. 343*81ad6265SDimitry Andric switch (MI.getOpcode()) { 344*81ad6265SDimitry Andric case AMDGPU::SI_RETURN_TO_EPILOG: 345*81ad6265SDimitry Andric continue; 346*81ad6265SDimitry Andric } 347*81ad6265SDimitry Andric 348*81ad6265SDimitry Andric DelayType Type = getDelayType(MI.getDesc().TSFlags); 349*81ad6265SDimitry Andric 350*81ad6265SDimitry Andric if (instructionWaitsForVALU(MI)) { 351*81ad6265SDimitry Andric // Forget about all outstanding VALU delays. 352*81ad6265SDimitry Andric State = DelayState(); 353*81ad6265SDimitry Andric } else if (Type != OTHER) { 354*81ad6265SDimitry Andric DelayInfo Delay; 355*81ad6265SDimitry Andric // TODO: Scan implicit uses too? 356*81ad6265SDimitry Andric for (const auto &Op : MI.explicit_uses()) { 357*81ad6265SDimitry Andric if (Op.isReg()) { 358*81ad6265SDimitry Andric // One of the operands of the writelane is also the output operand. 359*81ad6265SDimitry Andric // This creates the insertion of redundant delays. Hence, we have to 360*81ad6265SDimitry Andric // ignore this operand. 361*81ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) 362*81ad6265SDimitry Andric continue; 363*81ad6265SDimitry Andric for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) { 364*81ad6265SDimitry Andric auto It = State.find(*UI); 365*81ad6265SDimitry Andric if (It != State.end()) { 366*81ad6265SDimitry Andric Delay.merge(It->second); 367*81ad6265SDimitry Andric State.erase(*UI); 368*81ad6265SDimitry Andric } 369*81ad6265SDimitry Andric } 370*81ad6265SDimitry Andric } 371*81ad6265SDimitry Andric } 372*81ad6265SDimitry Andric if (Emit && !MI.isBundledWithPred()) { 373*81ad6265SDimitry Andric // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or 374*81ad6265SDimitry Andric // just ignore them? 375*81ad6265SDimitry Andric LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); 376*81ad6265SDimitry Andric } 377*81ad6265SDimitry Andric } 378*81ad6265SDimitry Andric 379*81ad6265SDimitry Andric if (Type != OTHER) { 380*81ad6265SDimitry Andric // TODO: Scan implicit defs too? 381*81ad6265SDimitry Andric for (const auto &Op : MI.defs()) { 382*81ad6265SDimitry Andric unsigned Latency = SchedModel.computeOperandLatency( 383*81ad6265SDimitry Andric &MI, MI.getOperandNo(&Op), nullptr, 0); 384*81ad6265SDimitry Andric for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) 385*81ad6265SDimitry Andric State[*UI] = DelayInfo(Type, Latency); 386*81ad6265SDimitry Andric } 387*81ad6265SDimitry Andric } 388*81ad6265SDimitry Andric 389*81ad6265SDimitry Andric // Advance by the number of cycles it takes to issue this instruction. 390*81ad6265SDimitry Andric // TODO: Use a more advanced model that accounts for instructions that 391*81ad6265SDimitry Andric // take multiple cycles to issue on a particular pipeline. 392*81ad6265SDimitry Andric unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); 393*81ad6265SDimitry Andric // TODO: In wave64 mode, double the number of cycles for VALU and VMEM 394*81ad6265SDimitry Andric // instructions on the assumption that they will usually have to be issued 395*81ad6265SDimitry Andric // twice? 396*81ad6265SDimitry Andric State.advance(Type, Cycles); 397*81ad6265SDimitry Andric 398*81ad6265SDimitry Andric LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); 399*81ad6265SDimitry Andric } 400*81ad6265SDimitry Andric 401*81ad6265SDimitry Andric if (Emit) { 402*81ad6265SDimitry Andric assert(State == BlockState[&MBB] && 403*81ad6265SDimitry Andric "Basic block state should not have changed on final pass!"); 404*81ad6265SDimitry Andric } else if (State != BlockState[&MBB]) { 405*81ad6265SDimitry Andric BlockState[&MBB] = std::move(State); 406*81ad6265SDimitry Andric Changed = true; 407*81ad6265SDimitry Andric } 408*81ad6265SDimitry Andric return Changed; 409*81ad6265SDimitry Andric } 410*81ad6265SDimitry Andric 411*81ad6265SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override { 412*81ad6265SDimitry Andric if (skipFunction(MF.getFunction())) 413*81ad6265SDimitry Andric return false; 414*81ad6265SDimitry Andric 415*81ad6265SDimitry Andric LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() 416*81ad6265SDimitry Andric << "\n"); 417*81ad6265SDimitry Andric 418*81ad6265SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 419*81ad6265SDimitry Andric if (!ST.hasDelayAlu()) 420*81ad6265SDimitry Andric return false; 421*81ad6265SDimitry Andric 422*81ad6265SDimitry Andric SII = ST.getInstrInfo(); 423*81ad6265SDimitry Andric TRI = ST.getRegisterInfo(); 424*81ad6265SDimitry Andric 425*81ad6265SDimitry Andric SchedModel.init(&ST); 426*81ad6265SDimitry Andric 427*81ad6265SDimitry Andric // Calculate the delay state for each basic block, iterating until we reach 428*81ad6265SDimitry Andric // a fixed point. 429*81ad6265SDimitry Andric SetVector<MachineBasicBlock *> WorkList; 430*81ad6265SDimitry Andric for (auto &MBB : reverse(MF)) 431*81ad6265SDimitry Andric WorkList.insert(&MBB); 432*81ad6265SDimitry Andric while (!WorkList.empty()) { 433*81ad6265SDimitry Andric auto &MBB = *WorkList.pop_back_val(); 434*81ad6265SDimitry Andric bool Changed = runOnMachineBasicBlock(MBB, false); 435*81ad6265SDimitry Andric if (Changed) 436*81ad6265SDimitry Andric WorkList.insert(MBB.succ_begin(), MBB.succ_end()); 437*81ad6265SDimitry Andric } 438*81ad6265SDimitry Andric 439*81ad6265SDimitry Andric LLVM_DEBUG(dbgs() << "Final pass over all BBs\n"); 440*81ad6265SDimitry Andric 441*81ad6265SDimitry Andric // Make one last pass over all basic blocks to emit s_delay_alu 442*81ad6265SDimitry Andric // instructions. 443*81ad6265SDimitry Andric bool Changed = false; 444*81ad6265SDimitry Andric for (auto &MBB : MF) 445*81ad6265SDimitry Andric Changed |= runOnMachineBasicBlock(MBB, true); 446*81ad6265SDimitry Andric return Changed; 447*81ad6265SDimitry Andric } 448*81ad6265SDimitry Andric }; 449*81ad6265SDimitry Andric 450*81ad6265SDimitry Andric } // namespace 451*81ad6265SDimitry Andric 452*81ad6265SDimitry Andric char AMDGPUInsertDelayAlu::ID = 0; 453*81ad6265SDimitry Andric 454*81ad6265SDimitry Andric char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; 455*81ad6265SDimitry Andric 456*81ad6265SDimitry Andric INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU", 457*81ad6265SDimitry Andric false, false) 458