181ad6265SDimitry Andric //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// 281ad6265SDimitry Andric // 381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 681ad6265SDimitry Andric // 781ad6265SDimitry Andric //===----------------------------------------------------------------------===// 881ad6265SDimitry Andric // 981ad6265SDimitry Andric /// \file 1081ad6265SDimitry Andric /// Insert s_delay_alu instructions to avoid stalls on GFX11+. 1181ad6265SDimitry Andric // 1281ad6265SDimitry Andric //===----------------------------------------------------------------------===// 1381ad6265SDimitry Andric 1481ad6265SDimitry Andric #include "AMDGPU.h" 1581ad6265SDimitry Andric #include "GCNSubtarget.h" 1681ad6265SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 1781ad6265SDimitry Andric #include "SIInstrInfo.h" 1881ad6265SDimitry Andric #include "llvm/ADT/SetVector.h" 1981ad6265SDimitry Andric 2081ad6265SDimitry Andric using namespace llvm; 2181ad6265SDimitry Andric 2281ad6265SDimitry Andric #define DEBUG_TYPE "amdgpu-insert-delay-alu" 2381ad6265SDimitry Andric 2481ad6265SDimitry Andric namespace { 2581ad6265SDimitry Andric 2681ad6265SDimitry Andric class AMDGPUInsertDelayAlu : public MachineFunctionPass { 2781ad6265SDimitry Andric public: 2881ad6265SDimitry Andric static char ID; 2981ad6265SDimitry Andric 3081ad6265SDimitry Andric const SIInstrInfo *SII; 3181ad6265SDimitry Andric const TargetRegisterInfo *TRI; 3281ad6265SDimitry Andric 3381ad6265SDimitry Andric TargetSchedModel SchedModel; 3481ad6265SDimitry Andric AMDGPUInsertDelayAlu()3581ad6265SDimitry Andric AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} 3681ad6265SDimitry Andric getAnalysisUsage(AnalysisUsage & AU) const3781ad6265SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 3881ad6265SDimitry Andric AU.setPreservesCFG(); 3981ad6265SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 4081ad6265SDimitry Andric } 4181ad6265SDimitry Andric 4281ad6265SDimitry Andric // Return true if MI waits for all outstanding VALU instructions to complete. instructionWaitsForVALU(const MachineInstr & MI)4381ad6265SDimitry Andric static bool instructionWaitsForVALU(const MachineInstr &MI) { 4481ad6265SDimitry Andric // These instruction types wait for VA_VDST==0 before issuing. 4581ad6265SDimitry Andric const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | 4681ad6265SDimitry Andric SIInstrFlags::FLAT | SIInstrFlags::MIMG | 4781ad6265SDimitry Andric SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; 4881ad6265SDimitry Andric if (MI.getDesc().TSFlags & VA_VDST_0) 4981ad6265SDimitry Andric return true; 5081ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || 5181ad6265SDimitry Andric MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) 5281ad6265SDimitry Andric return true; 5381ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 54*06c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0) 5581ad6265SDimitry Andric return true; 5681ad6265SDimitry Andric return false; 5781ad6265SDimitry Andric } 5881ad6265SDimitry Andric 5981ad6265SDimitry Andric // Types of delay that can be encoded in an s_delay_alu instruction. 6081ad6265SDimitry Andric enum DelayType { VALU, TRANS, SALU, OTHER }; 6181ad6265SDimitry Andric 6281ad6265SDimitry Andric // Get the delay type for an instruction with the specified TSFlags. getDelayType(uint64_t TSFlags)6381ad6265SDimitry Andric static DelayType getDelayType(uint64_t TSFlags) { 6481ad6265SDimitry Andric if (TSFlags & SIInstrFlags::TRANS) 6581ad6265SDimitry Andric return TRANS; 6681ad6265SDimitry Andric if (TSFlags & SIInstrFlags::VALU) 6781ad6265SDimitry Andric return VALU; 6881ad6265SDimitry Andric if (TSFlags & SIInstrFlags::SALU) 6981ad6265SDimitry Andric return SALU; 7081ad6265SDimitry Andric return OTHER; 7181ad6265SDimitry Andric } 7281ad6265SDimitry Andric 7381ad6265SDimitry Andric // Information about the last instruction(s) that wrote to a particular 7481ad6265SDimitry Andric // regunit. In straight-line code there will only be one such instruction, but 7581ad6265SDimitry Andric // when control flow converges we merge the delay information from each path 7681ad6265SDimitry Andric // to represent the union of the worst-case delays of each type. 7781ad6265SDimitry Andric struct DelayInfo { 7881ad6265SDimitry Andric // One larger than the maximum number of (non-TRANS) VALU instructions we 7981ad6265SDimitry Andric // can encode in an s_delay_alu instruction. 80*06c3fb27SDimitry Andric static constexpr unsigned VALU_MAX = 5; 8181ad6265SDimitry Andric 8281ad6265SDimitry Andric // One larger than the maximum number of TRANS instructions we can encode in 8381ad6265SDimitry Andric // an s_delay_alu instruction. 84*06c3fb27SDimitry Andric static constexpr unsigned TRANS_MAX = 4; 85*06c3fb27SDimitry Andric 86*06c3fb27SDimitry Andric // One larger than the maximum number of SALU cycles we can encode in an 87*06c3fb27SDimitry Andric // s_delay_alu instruction. 88*06c3fb27SDimitry Andric static constexpr unsigned SALU_CYCLES_MAX = 4; 8981ad6265SDimitry Andric 9081ad6265SDimitry Andric // If it was written by a (non-TRANS) VALU, remember how many clock cycles 9181ad6265SDimitry Andric // are left until it completes, and how many other (non-TRANS) VALU we have 9281ad6265SDimitry Andric // seen since it was issued. 9381ad6265SDimitry Andric uint8_t VALUCycles = 0; 9481ad6265SDimitry Andric uint8_t VALUNum = VALU_MAX; 9581ad6265SDimitry Andric 9681ad6265SDimitry Andric // If it was written by a TRANS, remember how many clock cycles are left 9781ad6265SDimitry Andric // until it completes, and how many other TRANS we have seen since it was 9881ad6265SDimitry Andric // issued. 9981ad6265SDimitry Andric uint8_t TRANSCycles = 0; 10081ad6265SDimitry Andric uint8_t TRANSNum = TRANS_MAX; 10181ad6265SDimitry Andric // Also remember how many other (non-TRANS) VALU we have seen since it was 10281ad6265SDimitry Andric // issued. When an instruction depends on both a prior TRANS and a prior 10381ad6265SDimitry Andric // non-TRANS VALU, this is used to decide whether to encode a wait for just 10481ad6265SDimitry Andric // one or both of them. 10581ad6265SDimitry Andric uint8_t TRANSNumVALU = VALU_MAX; 10681ad6265SDimitry Andric 10781ad6265SDimitry Andric // If it was written by an SALU, remember how many clock cycles are left 10881ad6265SDimitry Andric // until it completes. 10981ad6265SDimitry Andric uint8_t SALUCycles = 0; 11081ad6265SDimitry Andric 11181ad6265SDimitry Andric DelayInfo() = default; 11281ad6265SDimitry Andric DelayInfo__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo11381ad6265SDimitry Andric DelayInfo(DelayType Type, unsigned Cycles) { 11481ad6265SDimitry Andric switch (Type) { 11581ad6265SDimitry Andric default: 11681ad6265SDimitry Andric llvm_unreachable("unexpected type"); 11781ad6265SDimitry Andric case VALU: 11881ad6265SDimitry Andric VALUCycles = Cycles; 11981ad6265SDimitry Andric VALUNum = 0; 12081ad6265SDimitry Andric break; 12181ad6265SDimitry Andric case TRANS: 12281ad6265SDimitry Andric TRANSCycles = Cycles; 12381ad6265SDimitry Andric TRANSNum = 0; 12481ad6265SDimitry Andric TRANSNumVALU = 0; 12581ad6265SDimitry Andric break; 12681ad6265SDimitry Andric case SALU: 127*06c3fb27SDimitry Andric // Guard against pseudo-instructions like SI_CALL which are marked as 128*06c3fb27SDimitry Andric // SALU but with a very high latency. 129*06c3fb27SDimitry Andric SALUCycles = std::min(Cycles, SALU_CYCLES_MAX); 13081ad6265SDimitry Andric break; 13181ad6265SDimitry Andric } 13281ad6265SDimitry Andric } 13381ad6265SDimitry Andric operator ==__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo13481ad6265SDimitry Andric bool operator==(const DelayInfo &RHS) const { 13581ad6265SDimitry Andric return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && 13681ad6265SDimitry Andric TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && 13781ad6265SDimitry Andric TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; 13881ad6265SDimitry Andric } 13981ad6265SDimitry Andric operator !=__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo14081ad6265SDimitry Andric bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } 14181ad6265SDimitry Andric 14281ad6265SDimitry Andric // Merge another DelayInfo into this one, to represent the union of the 14381ad6265SDimitry Andric // worst-case delays of each type. merge__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo14481ad6265SDimitry Andric void merge(const DelayInfo &RHS) { 14581ad6265SDimitry Andric VALUCycles = std::max(VALUCycles, RHS.VALUCycles); 14681ad6265SDimitry Andric VALUNum = std::min(VALUNum, RHS.VALUNum); 14781ad6265SDimitry Andric TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles); 14881ad6265SDimitry Andric TRANSNum = std::min(TRANSNum, RHS.TRANSNum); 14981ad6265SDimitry Andric TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU); 15081ad6265SDimitry Andric SALUCycles = std::max(SALUCycles, RHS.SALUCycles); 15181ad6265SDimitry Andric } 15281ad6265SDimitry Andric 15381ad6265SDimitry Andric // Update this DelayInfo after issuing an instruction. IsVALU should be 1 15481ad6265SDimitry Andric // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing 15581ad6265SDimitry Andric // a TRANS, else 0. Cycles is the number of cycles it takes to issue the 15681ad6265SDimitry Andric // instruction. Return true if there is no longer any useful delay info. advance__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo15781ad6265SDimitry Andric bool advance(DelayType Type, unsigned Cycles) { 15881ad6265SDimitry Andric bool Erase = true; 15981ad6265SDimitry Andric 16081ad6265SDimitry Andric VALUNum += (Type == VALU); 16181ad6265SDimitry Andric if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { 16281ad6265SDimitry Andric // Forget about the VALU instruction. It was too far back or has 16381ad6265SDimitry Andric // definitely completed by now. 16481ad6265SDimitry Andric VALUNum = VALU_MAX; 16581ad6265SDimitry Andric VALUCycles = 0; 16681ad6265SDimitry Andric } else { 16781ad6265SDimitry Andric VALUCycles -= Cycles; 16881ad6265SDimitry Andric Erase = false; 16981ad6265SDimitry Andric } 17081ad6265SDimitry Andric 17181ad6265SDimitry Andric TRANSNum += (Type == TRANS); 17281ad6265SDimitry Andric TRANSNumVALU += (Type == VALU); 17381ad6265SDimitry Andric if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { 17481ad6265SDimitry Andric // Forget about any TRANS instruction. It was too far back or has 17581ad6265SDimitry Andric // definitely completed by now. 17681ad6265SDimitry Andric TRANSNum = TRANS_MAX; 17781ad6265SDimitry Andric TRANSNumVALU = VALU_MAX; 17881ad6265SDimitry Andric TRANSCycles = 0; 17981ad6265SDimitry Andric } else { 18081ad6265SDimitry Andric TRANSCycles -= Cycles; 18181ad6265SDimitry Andric Erase = false; 18281ad6265SDimitry Andric } 18381ad6265SDimitry Andric 18481ad6265SDimitry Andric if (SALUCycles <= Cycles) { 18581ad6265SDimitry Andric // Forget about any SALU instruction. It has definitely completed by 18681ad6265SDimitry Andric // now. 18781ad6265SDimitry Andric SALUCycles = 0; 18881ad6265SDimitry Andric } else { 18981ad6265SDimitry Andric SALUCycles -= Cycles; 19081ad6265SDimitry Andric Erase = false; 19181ad6265SDimitry Andric } 19281ad6265SDimitry Andric 19381ad6265SDimitry Andric return Erase; 19481ad6265SDimitry Andric } 19581ad6265SDimitry Andric 19681ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dump__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayInfo19781ad6265SDimitry Andric void dump() const { 19881ad6265SDimitry Andric if (VALUCycles) 19981ad6265SDimitry Andric dbgs() << " VALUCycles=" << (int)VALUCycles; 20081ad6265SDimitry Andric if (VALUNum < VALU_MAX) 20181ad6265SDimitry Andric dbgs() << " VALUNum=" << (int)VALUNum; 20281ad6265SDimitry Andric if (TRANSCycles) 20381ad6265SDimitry Andric dbgs() << " TRANSCycles=" << (int)TRANSCycles; 20481ad6265SDimitry Andric if (TRANSNum < TRANS_MAX) 20581ad6265SDimitry Andric dbgs() << " TRANSNum=" << (int)TRANSNum; 20681ad6265SDimitry Andric if (TRANSNumVALU < VALU_MAX) 20781ad6265SDimitry Andric dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; 20881ad6265SDimitry Andric if (SALUCycles) 20981ad6265SDimitry Andric dbgs() << " SALUCycles=" << (int)SALUCycles; 21081ad6265SDimitry Andric } 21181ad6265SDimitry Andric #endif 21281ad6265SDimitry Andric }; 21381ad6265SDimitry Andric 21481ad6265SDimitry Andric // A map from regunits to the delay info for that regunit. 21581ad6265SDimitry Andric struct DelayState : DenseMap<unsigned, DelayInfo> { 21681ad6265SDimitry Andric // Merge another DelayState into this one by merging the delay info for each 21781ad6265SDimitry Andric // regunit. merge__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayState21881ad6265SDimitry Andric void merge(const DelayState &RHS) { 21981ad6265SDimitry Andric for (const auto &KV : RHS) { 22081ad6265SDimitry Andric iterator It; 22181ad6265SDimitry Andric bool Inserted; 22281ad6265SDimitry Andric std::tie(It, Inserted) = insert(KV); 22381ad6265SDimitry Andric if (!Inserted) 22481ad6265SDimitry Andric It->second.merge(KV.second); 22581ad6265SDimitry Andric } 22681ad6265SDimitry Andric } 22781ad6265SDimitry Andric 22881ad6265SDimitry Andric // Advance the delay info for each regunit, erasing any that are no longer 22981ad6265SDimitry Andric // useful. advance__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayState23081ad6265SDimitry Andric void advance(DelayType Type, unsigned Cycles) { 23181ad6265SDimitry Andric iterator Next; 23281ad6265SDimitry Andric for (auto I = begin(), E = end(); I != E; I = Next) { 23381ad6265SDimitry Andric Next = std::next(I); 23481ad6265SDimitry Andric if (I->second.advance(Type, Cycles)) 23581ad6265SDimitry Andric erase(I); 23681ad6265SDimitry Andric } 23781ad6265SDimitry Andric } 23881ad6265SDimitry Andric 23981ad6265SDimitry Andric #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dump__anon4f1322a90111::AMDGPUInsertDelayAlu::DelayState24081ad6265SDimitry Andric void dump(const TargetRegisterInfo *TRI) const { 24181ad6265SDimitry Andric if (empty()) { 24281ad6265SDimitry Andric dbgs() << " empty\n"; 24381ad6265SDimitry Andric return; 24481ad6265SDimitry Andric } 24581ad6265SDimitry Andric 24681ad6265SDimitry Andric // Dump DelayInfo for each RegUnit in numerical order. 24781ad6265SDimitry Andric SmallVector<const_iterator, 8> Order; 24881ad6265SDimitry Andric Order.reserve(size()); 24981ad6265SDimitry Andric for (const_iterator I = begin(), E = end(); I != E; ++I) 25081ad6265SDimitry Andric Order.push_back(I); 25181ad6265SDimitry Andric llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { 25281ad6265SDimitry Andric return A->first < B->first; 25381ad6265SDimitry Andric }); 25481ad6265SDimitry Andric for (const_iterator I : Order) { 25581ad6265SDimitry Andric dbgs() << " " << printRegUnit(I->first, TRI); 25681ad6265SDimitry Andric I->second.dump(); 25781ad6265SDimitry Andric dbgs() << "\n"; 25881ad6265SDimitry Andric } 25981ad6265SDimitry Andric } 26081ad6265SDimitry Andric #endif 26181ad6265SDimitry Andric }; 26281ad6265SDimitry Andric 26381ad6265SDimitry Andric // The saved delay state at the end of each basic block. 26481ad6265SDimitry Andric DenseMap<MachineBasicBlock *, DelayState> BlockState; 26581ad6265SDimitry Andric 26681ad6265SDimitry Andric // Emit an s_delay_alu instruction if necessary before MI. emitDelayAlu(MachineInstr & MI,DelayInfo Delay,MachineInstr * LastDelayAlu)26781ad6265SDimitry Andric MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, 26881ad6265SDimitry Andric MachineInstr *LastDelayAlu) { 26981ad6265SDimitry Andric unsigned Imm = 0; 27081ad6265SDimitry Andric 27181ad6265SDimitry Andric // Wait for a TRANS instruction. 27281ad6265SDimitry Andric if (Delay.TRANSNum < DelayInfo::TRANS_MAX) 27381ad6265SDimitry Andric Imm |= 4 + Delay.TRANSNum; 27481ad6265SDimitry Andric 27581ad6265SDimitry Andric // Wait for a VALU instruction (if it's more recent than any TRANS 27681ad6265SDimitry Andric // instruction that we're also waiting for). 27781ad6265SDimitry Andric if (Delay.VALUNum < DelayInfo::VALU_MAX && 27881ad6265SDimitry Andric Delay.VALUNum <= Delay.TRANSNumVALU) { 27981ad6265SDimitry Andric if (Imm & 0xf) 28081ad6265SDimitry Andric Imm |= Delay.VALUNum << 7; 28181ad6265SDimitry Andric else 28281ad6265SDimitry Andric Imm |= Delay.VALUNum; 28381ad6265SDimitry Andric } 28481ad6265SDimitry Andric 28581ad6265SDimitry Andric // Wait for an SALU instruction. 28681ad6265SDimitry Andric if (Delay.SALUCycles) { 287*06c3fb27SDimitry Andric assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX); 28881ad6265SDimitry Andric if (Imm & 0x780) { 28981ad6265SDimitry Andric // We have already encoded a VALU and a TRANS delay. There's no room in 29081ad6265SDimitry Andric // the encoding for an SALU delay as well, so just drop it. 29181ad6265SDimitry Andric } else if (Imm & 0xf) { 29281ad6265SDimitry Andric Imm |= (Delay.SALUCycles + 8) << 7; 29381ad6265SDimitry Andric } else { 29481ad6265SDimitry Andric Imm |= Delay.SALUCycles + 8; 29581ad6265SDimitry Andric } 29681ad6265SDimitry Andric } 29781ad6265SDimitry Andric 29881ad6265SDimitry Andric // Don't emit the s_delay_alu instruction if there's nothing to wait for. 29981ad6265SDimitry Andric if (!Imm) 30081ad6265SDimitry Andric return LastDelayAlu; 30181ad6265SDimitry Andric 30281ad6265SDimitry Andric // If we only need to wait for one instruction, try encoding it in the last 30381ad6265SDimitry Andric // s_delay_alu that we emitted. 30481ad6265SDimitry Andric if (!(Imm & 0x780) && LastDelayAlu) { 30581ad6265SDimitry Andric unsigned Skip = 0; 30681ad6265SDimitry Andric for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), 30781ad6265SDimitry Andric E = MachineBasicBlock::instr_iterator(MI); 30881ad6265SDimitry Andric ++I != E;) { 30981ad6265SDimitry Andric if (!I->isBundle() && !I->isMetaInstruction()) 31081ad6265SDimitry Andric ++Skip; 31181ad6265SDimitry Andric } 31281ad6265SDimitry Andric if (Skip < 6) { 31381ad6265SDimitry Andric MachineOperand &Op = LastDelayAlu->getOperand(0); 31481ad6265SDimitry Andric unsigned LastImm = Op.getImm(); 31581ad6265SDimitry Andric assert((LastImm & ~0xf) == 0 && 31681ad6265SDimitry Andric "Remembered an s_delay_alu with no room for another delay!"); 31781ad6265SDimitry Andric LastImm |= Imm << 7 | Skip << 4; 31881ad6265SDimitry Andric Op.setImm(LastImm); 31981ad6265SDimitry Andric return nullptr; 32081ad6265SDimitry Andric } 32181ad6265SDimitry Andric } 32281ad6265SDimitry Andric 32381ad6265SDimitry Andric auto &MBB = *MI.getParent(); 32481ad6265SDimitry Andric MachineInstr *DelayAlu = 32581ad6265SDimitry Andric BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); 32681ad6265SDimitry Andric // Remember the s_delay_alu for next time if there is still room in it to 32781ad6265SDimitry Andric // encode another delay. 32881ad6265SDimitry Andric return (Imm & 0x780) ? nullptr : DelayAlu; 32981ad6265SDimitry Andric } 33081ad6265SDimitry Andric runOnMachineBasicBlock(MachineBasicBlock & MBB,bool Emit)33181ad6265SDimitry Andric bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { 33281ad6265SDimitry Andric DelayState State; 33381ad6265SDimitry Andric for (auto *Pred : MBB.predecessors()) 33481ad6265SDimitry Andric State.merge(BlockState[Pred]); 33581ad6265SDimitry Andric 33681ad6265SDimitry Andric LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) 33781ad6265SDimitry Andric << "\n"; 33881ad6265SDimitry Andric State.dump(TRI);); 33981ad6265SDimitry Andric 34081ad6265SDimitry Andric bool Changed = false; 34181ad6265SDimitry Andric MachineInstr *LastDelayAlu = nullptr; 34281ad6265SDimitry Andric 34381ad6265SDimitry Andric // Iterate over the contents of bundles, but don't emit any instructions 34481ad6265SDimitry Andric // inside a bundle. 34581ad6265SDimitry Andric for (auto &MI : MBB.instrs()) { 34681ad6265SDimitry Andric if (MI.isBundle() || MI.isMetaInstruction()) 34781ad6265SDimitry Andric continue; 34881ad6265SDimitry Andric 34981ad6265SDimitry Andric // Ignore some more instructions that do not generate any code. 35081ad6265SDimitry Andric switch (MI.getOpcode()) { 35181ad6265SDimitry Andric case AMDGPU::SI_RETURN_TO_EPILOG: 35281ad6265SDimitry Andric continue; 35381ad6265SDimitry Andric } 35481ad6265SDimitry Andric 35581ad6265SDimitry Andric DelayType Type = getDelayType(MI.getDesc().TSFlags); 35681ad6265SDimitry Andric 35781ad6265SDimitry Andric if (instructionWaitsForVALU(MI)) { 35881ad6265SDimitry Andric // Forget about all outstanding VALU delays. 359*06c3fb27SDimitry Andric // TODO: This is overkill since it also forgets about SALU delays. 36081ad6265SDimitry Andric State = DelayState(); 36181ad6265SDimitry Andric } else if (Type != OTHER) { 36281ad6265SDimitry Andric DelayInfo Delay; 36381ad6265SDimitry Andric // TODO: Scan implicit uses too? 36481ad6265SDimitry Andric for (const auto &Op : MI.explicit_uses()) { 36581ad6265SDimitry Andric if (Op.isReg()) { 36681ad6265SDimitry Andric // One of the operands of the writelane is also the output operand. 36781ad6265SDimitry Andric // This creates the insertion of redundant delays. Hence, we have to 36881ad6265SDimitry Andric // ignore this operand. 36981ad6265SDimitry Andric if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) 37081ad6265SDimitry Andric continue; 371*06c3fb27SDimitry Andric for (MCRegUnit Unit : TRI->regunits(Op.getReg())) { 372*06c3fb27SDimitry Andric auto It = State.find(Unit); 37381ad6265SDimitry Andric if (It != State.end()) { 37481ad6265SDimitry Andric Delay.merge(It->second); 375*06c3fb27SDimitry Andric State.erase(Unit); 37681ad6265SDimitry Andric } 37781ad6265SDimitry Andric } 37881ad6265SDimitry Andric } 37981ad6265SDimitry Andric } 38081ad6265SDimitry Andric if (Emit && !MI.isBundledWithPred()) { 38181ad6265SDimitry Andric // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or 38281ad6265SDimitry Andric // just ignore them? 38381ad6265SDimitry Andric LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); 38481ad6265SDimitry Andric } 38581ad6265SDimitry Andric } 38681ad6265SDimitry Andric 38781ad6265SDimitry Andric if (Type != OTHER) { 38881ad6265SDimitry Andric // TODO: Scan implicit defs too? 38981ad6265SDimitry Andric for (const auto &Op : MI.defs()) { 39081ad6265SDimitry Andric unsigned Latency = SchedModel.computeOperandLatency( 391*06c3fb27SDimitry Andric &MI, Op.getOperandNo(), nullptr, 0); 392*06c3fb27SDimitry Andric for (MCRegUnit Unit : TRI->regunits(Op.getReg())) 393*06c3fb27SDimitry Andric State[Unit] = DelayInfo(Type, Latency); 39481ad6265SDimitry Andric } 39581ad6265SDimitry Andric } 39681ad6265SDimitry Andric 39781ad6265SDimitry Andric // Advance by the number of cycles it takes to issue this instruction. 39881ad6265SDimitry Andric // TODO: Use a more advanced model that accounts for instructions that 39981ad6265SDimitry Andric // take multiple cycles to issue on a particular pipeline. 40081ad6265SDimitry Andric unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); 40181ad6265SDimitry Andric // TODO: In wave64 mode, double the number of cycles for VALU and VMEM 40281ad6265SDimitry Andric // instructions on the assumption that they will usually have to be issued 40381ad6265SDimitry Andric // twice? 40481ad6265SDimitry Andric State.advance(Type, Cycles); 40581ad6265SDimitry Andric 40681ad6265SDimitry Andric LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); 40781ad6265SDimitry Andric } 40881ad6265SDimitry Andric 40981ad6265SDimitry Andric if (Emit) { 41081ad6265SDimitry Andric assert(State == BlockState[&MBB] && 41181ad6265SDimitry Andric "Basic block state should not have changed on final pass!"); 41281ad6265SDimitry Andric } else if (State != BlockState[&MBB]) { 41381ad6265SDimitry Andric BlockState[&MBB] = std::move(State); 41481ad6265SDimitry Andric Changed = true; 41581ad6265SDimitry Andric } 41681ad6265SDimitry Andric return Changed; 41781ad6265SDimitry Andric } 41881ad6265SDimitry Andric runOnMachineFunction(MachineFunction & MF)41981ad6265SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override { 42081ad6265SDimitry Andric if (skipFunction(MF.getFunction())) 42181ad6265SDimitry Andric return false; 42281ad6265SDimitry Andric 42381ad6265SDimitry Andric LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() 42481ad6265SDimitry Andric << "\n"); 42581ad6265SDimitry Andric 42681ad6265SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 42781ad6265SDimitry Andric if (!ST.hasDelayAlu()) 42881ad6265SDimitry Andric return false; 42981ad6265SDimitry Andric 43081ad6265SDimitry Andric SII = ST.getInstrInfo(); 43181ad6265SDimitry Andric TRI = ST.getRegisterInfo(); 43281ad6265SDimitry Andric 43381ad6265SDimitry Andric SchedModel.init(&ST); 43481ad6265SDimitry Andric 43581ad6265SDimitry Andric // Calculate the delay state for each basic block, iterating until we reach 43681ad6265SDimitry Andric // a fixed point. 43781ad6265SDimitry Andric SetVector<MachineBasicBlock *> WorkList; 43881ad6265SDimitry Andric for (auto &MBB : reverse(MF)) 43981ad6265SDimitry Andric WorkList.insert(&MBB); 44081ad6265SDimitry Andric while (!WorkList.empty()) { 44181ad6265SDimitry Andric auto &MBB = *WorkList.pop_back_val(); 44281ad6265SDimitry Andric bool Changed = runOnMachineBasicBlock(MBB, false); 44381ad6265SDimitry Andric if (Changed) 44481ad6265SDimitry Andric WorkList.insert(MBB.succ_begin(), MBB.succ_end()); 44581ad6265SDimitry Andric } 44681ad6265SDimitry Andric 44781ad6265SDimitry Andric LLVM_DEBUG(dbgs() << "Final pass over all BBs\n"); 44881ad6265SDimitry Andric 44981ad6265SDimitry Andric // Make one last pass over all basic blocks to emit s_delay_alu 45081ad6265SDimitry Andric // instructions. 45181ad6265SDimitry Andric bool Changed = false; 45281ad6265SDimitry Andric for (auto &MBB : MF) 45381ad6265SDimitry Andric Changed |= runOnMachineBasicBlock(MBB, true); 45481ad6265SDimitry Andric return Changed; 45581ad6265SDimitry Andric } 45681ad6265SDimitry Andric }; 45781ad6265SDimitry Andric 45881ad6265SDimitry Andric } // namespace 45981ad6265SDimitry Andric 46081ad6265SDimitry Andric char AMDGPUInsertDelayAlu::ID = 0; 46181ad6265SDimitry Andric 46281ad6265SDimitry Andric char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; 46381ad6265SDimitry Andric 46481ad6265SDimitry Andric INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU", 46581ad6265SDimitry Andric false, false) 466