//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///    ...
///    BRANCH %cond BB1, BB2
///   BB1:
///     %2 <vgpr> = VECTOR_INST
///     %3 <vsrc> = COPY %2 <vgpr>
///   BB2:
///     %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///     %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is now forced to constrain the register class of %3 to
/// <sgpr> so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///    ...
///    BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains its definition class to
/// <vgpr> if the user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example which
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));

namespace {

// Bookkeeping for one VGPR-to-SGPR copy while deciding whether the SALU
// computation reachable from it should be kept scalar (inserting
// v_readfirstlane_b32 for the copied value) or converted to VALU.
class V2SCopyInfo {
public:
  // VGPR to SGPR copy being processed
  MachineInstr *Copy;
  // All SALU instructions reachable from this copy in SSA graph
  SetVector<MachineInstr *> SChain;
  // Number of SGPR to VGPR copies that are used to put the SALU computation
  // results back to VALU.
  unsigned NumSVCopies;

  unsigned Score;
  // Actual count of v_readfirstlane_b32
  // which need to be inserted to keep SChain SALU
  unsigned NumReadfirstlanes;
  // Current score state. To speed up selection of V2SCopyInfos for processing
  bool NeedToBeConvertedToVALU = false;
  // Unique ID. Used as a key for mapping to keep permanent order.
  unsigned ID;

  // Count of another VGPR to SGPR copies that contribute to the
  // current copy SChain
  unsigned SiblingPenalty = 0;
  SetVector<unsigned> Siblings;
  V2SCopyInfo() : Copy(nullptr), ID(0){};
  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() {
    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
           << "\nScore: " << Score << "\n";
  }
#endif
};

// Machine pass that eliminates illegal VGPR-to-SGPR copies; see the file-level
// comment for the motivating example.
class SIFixSGPRCopies : public MachineFunctionPass {
  MachineDominatorTree *MDT;
  // Work lists populated during the initial scan of the function and
  // processed afterwards (see runOnMachineFunction).
  SmallVector<MachineInstr*, 4> SCCCopies;
  SmallVector<MachineInstr*, 4> RegSequences;
  SmallVector<MachineInstr*, 4> PHINodes;
  SmallVector<MachineInstr*, 4> S2VCopies;
  unsigned NextVGPRToSGPRCopyID = 0;
  // Keyed by the unique ID assigned in analyzeVGPRToSGPRCopy; MapVector keeps
  // iteration order deterministic.
  MapVector<unsigned, V2SCopyInfo> V2SCopies;
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

public:
  static char ID;

  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;
  void fixSCCCopies(MachineFunction &MF);
  void prepareRegSequenceAndPHIs(MachineFunction &MF);
  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
  bool needToBeConvertedToVALU(V2SCopyInfo *I);
  void analyzeVGPRToSGPRCopy(MachineInstr *MI);
  void lowerVGPR2SGPRCopies(MachineFunction &MF);
  // Handles copies whose source register is:
  // 1. Physical register
  // 2. AGPR
  // 3. Defined by the instruction that merely moves the immediate
  bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I);

  void processPHINode(MachineInstr &MI);

  // Check if MO is an immediate materialized into a VGPR, and if so replace it
  // with an SGPR immediate. The VGPR immediate is also deleted if it does not
  // have any other uses.
  bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst,
                              MachineBasicBlock *BlockToInsertTo,
                              MachineBasicBlock::iterator PointToInsertTo);

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                     "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                    "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

// Return the (source, destination) register classes of \p Copy, looking
// through MRI for virtual registers and the TRI base class for physregs.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegBaseClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = DstReg.isVirtual()
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegBaseClass(DstReg);

  return std::pair(SrcRC, DstRC);
}

// True when the copy moves a vector-register value into an SGPR class.
// VReg_1 sources are excluded.
static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

// True when the copy moves an SGPR value into a vector register class.
// VReg_1 destinations are excluded.
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

// If every use of MI's VGPR destination (all in the same block, all
// non-generic target instructions) can legally take an SGPR operand instead,
// retype the destination to the equivalent SGPR class and report success.
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!SrcReg.isVirtual() || !DstReg.isVirtual())
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
      return false;

    unsigned OpIdx = MO.getOperandNo();
    if (OpIdx >= UseMI->getDesc().getNumOperands() ||
        !TII->isOperandLegal(*UseMI, OpIdx, &Src))
      return false;
  }
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (CopyUse.getOperand(0).getReg().isPhysical())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  // Prefer keeping everything scalar if the copy's uses allow it.
  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->isAGPRClass(DstRC);

  // REG_SEQUENCE operands come in (register, subreg-index) pairs, hence I += 2.
  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(MRI, MI.getOperand(I));
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      // For an AGPR destination, go SGPR -> VGPR -> AGPR; 32-bit pieces need
      // the explicit v_accvgpr_write pseudo rather than a plain COPY.
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
        .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

// Check whether \p Copy of the value defined by \p MoveImm can instead be
// materialized directly into the SGPR with a scalar move; on success report
// the scalar opcode in \p SMovOp and the immediate in \p Imm.
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(1).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

// Walk the predecessor graph of \p MBB (excluding \p CutOff) and return true
// if \p Predicate holds for any visited block.
template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();

  // Do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
                           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  // Classify every def of Reg: a def whose only operands are the Reg def
  // itself plus exactly one immediate is an "init"; anything else clobbers.
  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      }
      if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, this is not an interference only if both are
            // dominated by Clobber and belong to the same block or if Clobber
            // properly dominates To, given that To >> From, so it dominates
            // both and located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          // Inits with a different immediate interfere the same way real
          // clobbers do: they redefine Reg with another value.
          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          // Neither dominates the other: try to merge both at the nearest
          // common dominator, moving MI2 there and dropping MI1.
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto *MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  // Only need to run this in SelectionDAG path.
  // This pass only runs on the SelectionDAG path; GlobalISel-selected
  // functions are skipped.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();

  // Phase 1: scan every instruction, classifying copies and collecting
  // PHI/REG_SEQUENCE/INSERT_SUBREG nodes for later processing.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::STRICT_WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM: {
        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          // Since VGPR to SGPR copies affect VGPR to SGPR copy
          // score and, hence the lowering decision, let's try to get rid of
          // them as early as possible
          if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
            continue;

          // Collect those not changed to try them after VGPR to SGPR copies
          // lowering as there will be more opportunities.
          S2VCopies.push_back(&MI);
        }
        if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
          continue;
        // Physical-dest, physical/AGPR-source, and foldable-immediate copies
        // are handled immediately; everything else is analyzed for scoring.
        if (lowerSpecialCase(MI, I))
          continue;

        analyzeVGPRToSGPRCopy(&MI);

        break;
      }
      case AMDGPU::INSERT_SUBREG:
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE: {
        // An SGPR-defining node must not read vector registers: insert a
        // COPY to an equivalent SGPR class for each vector operand (or
        // rematerialize a VGPR constant directly as an SGPR move).
        if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
          for (MachineOperand &MO : MI.operands()) {
            if (!MO.isReg() || !MO.getReg().isVirtual())
              continue;
            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
            if (TRI->hasVectorRegisters(SrcRC)) {
              const TargetRegisterClass *DestRC =
                  TRI->getEquivalentSGPRClass(SrcRC);
              Register NewDst = MRI->createVirtualRegister(DestRC);
              // For a PHI the copy must be placed in the predecessor block
              // that feeds this operand (the MBB operand follows the reg
              // operand), before that block's terminators.
              MachineBasicBlock *BlockToInsertCopy =
                  MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB()
                             : &MBB;
              MachineBasicBlock::iterator PointToInsertCopy =
                  MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;

              if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy,
                                          PointToInsertCopy)) {
                MachineInstr *NewCopy =
                    BuildMI(*BlockToInsertCopy, PointToInsertCopy,
                            PointToInsertCopy->getDebugLoc(),
                            TII->get(AMDGPU::COPY), NewDst)
                        .addReg(MO.getReg());
                MO.setReg(NewDst);
                // The freshly created V2S copy takes part in scoring too.
                analyzeVGPRToSGPRCopy(NewCopy);
              }
            }
          }
        }

        if (MI.isPHI())
          PHINodes.push_back(&MI);
        else if (MI.isRegSequence())
          RegSequences.push_back(&MI);

        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for trivially easy constant prop into one of the operands
          // If this is the case then perform the operation now to resolve SGPR
          // issue. If we don't do that here we will always insert a mov to m0
          // that can't be resolved in later operand folding pass
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an immediate
            // Move src1 to be in M0
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  // Phase 2: lower the analyzed V2S copies, then re-try / finish the
  // collected S2V copies, REG_SEQUENCEs and PHIs.
  lowerVGPR2SGPRCopies(MF);
  // Postprocessing
  fixSCCCopies(MF);
  for (auto MI : S2VCopies) {
    // Check if it is still valid
    if (MI->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
        tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
    }
  }
  for (auto MI : RegSequences) {
    // Check if it is still valid
    if (MI->isRegSequence())
      foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
  }
  for (auto MI : PHINodes) {
    processPHINode(*MI);
  }
  if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  // Reset all per-function state so the pass object can be reused.
  SiblingPenalty.clear();
  V2SCopies.clear();
  SCCCopies.clear();
  RegSequences.clear();
  PHINodes.clear();
  S2VCopies.clear();

  return true;
}

/// Fix up a PHI node: if every transitive use (looking through copies and
/// REG_SEQUENCEs) is an AGPR, move the PHI result to the equivalent AGPR
/// class; if the result is a vector register (or VReg_1), legalize the PHI's
/// operands. Recurses into PHI operands that are themselves PHIs so the new
/// register class propagates backwards.
void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  worklist.insert(&MI);
  Visited.insert(&MI);
  // HACK to make MIR tests with no uses happy
  bool HasUses = false;
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    Register Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      HasUses = true;
      const MachineInstr *UseMI = Use.getParent();
      // A use counts as "AGPR" if it is a copy into an AGPR or the used
      // register itself is an AGPR.
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      // Look through copies / REG_SEQUENCEs to their own users.
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }
    }
  }

  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    // PHI operands come in (reg, MBB) pairs starting at index 1; queue any
    // incoming value that is itself a PHI for the same treatment.
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  if (TRI->isVectorRegister(*MRI, PHIRes) ||
      RC0 == &AMDGPU::VReg_1RegClass) {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate register class back to PHI operands which are PHI themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
}

/// If \p MaybeVGPRConstMO is defined by a VGPR move-immediate, rematerialize
/// the constant directly into \p DstReg with an S_MOV_B32/B64 at the given
/// insertion point instead of copying the VGPR. Erases the original
/// move-immediate when this was its only use. Returns true on success.
bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
    MachineOperand &MaybeVGPRConstMO, Register DstReg,
    MachineBasicBlock *BlockToInsertTo,
    MachineBasicBlock::iterator PointToInsertTo) {

  MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg());
  if (!DefMI || !DefMI->isMoveImmediate())
    return false;

  MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0);
  if (SrcConst->isReg())
    return false;

  const TargetRegisterClass *SrcRC =
      MRI->getRegClass(MaybeVGPRConstMO.getReg());
  unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
  unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
  BuildMI(*BlockToInsertTo, PointToInsertTo, PointToInsertTo->getDebugLoc(),
          TII->get(MoveOp), DstReg)
      .add(*SrcConst);
  // Only delete the defining V_MOV if nobody else reads it.
  if (MRI->hasOneUse(MaybeVGPRConstMO.getReg()))
    DefMI->eraseFromParent();
  MaybeVGPRConstMO.setReg(DstReg);
  return true;
}

/// Handle VGPR-to-SGPR copies that need immediate, special lowering rather
/// than the score-based analysis: physical destinations (notably M0),
/// physical/AGPR sources (moved to VALU right away), and copies of foldable
/// immediates (rewritten to s_mov). \p I is advanced past \p MI when MI is
/// erased. Returns true if the copy was fully handled here.
bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
                                       MachineBasicBlock::iterator &I) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  if (!DstReg.isVirtual()) {
    // If the destination register is a physical register there isn't
    // really much we can do to fix this.
    // Some special instructions use M0 as an input. Some even only use
    // the first lane. Insert a readfirstlane and hope for the best.
    if (DstReg == AMDGPU::M0 &&
        TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
      Register TmpReg =
          MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
          .add(MI.getOperand(1));
      MI.getOperand(1).setReg(TmpReg);
    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
                                      MI)) {
      // The constant was rematerialized as an SGPR move; the copy is dead.
      // Advance the caller's iterator before erasing MI from under it.
      I = std::next(I);
      MI.eraseFromParent();
    }
    return true;
  }
  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    // Physical or AGPR sources can't be handled by the scoring scheme;
    // push the whole copy to the VALU immediately.
    SIInstrWorklist worklist;
    worklist.insert(&MI);
    TII->moveToVALU(worklist, MDT);
    return true;
  }

  unsigned SMovOp;
  int64_t Imm;
  // If we are just copying an immediate, we can replace the copy with
  // s_mov_b32.
  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
    MI.getOperand(1).ChangeToImmediate(Imm);
    MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
    MI.setDesc(TII->get(SMovOp));
    return true;
  }
  return false;
}

/// Walk the def-use graph rooted at the V2S copy \p MI, collecting the SALU
/// instructions it feeds (Info.SChain), counting SGPR-to-VGPR copies on the
/// way (Info.NumSVCopies), and recording this copy's ID in SiblingPenalty for
/// every chain instruction. The result is stored in V2SCopies for scoring by
/// needToBeConvertedToVALU().
void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
  Register DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);

  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                   TRI->getRegSizeInBits(*DstRC));
  SmallVector<MachineInstr *, 8> AnalysisWorklist;
  // Needed because the SSA is not a tree but a graph and may have
  // forks and joins. We should not then go same way twice.
  DenseSet<MachineInstr *> Visited;
  AnalysisWorklist.push_back(Info.Copy);
  while (!AnalysisWorklist.empty()) {

    MachineInstr *Inst = AnalysisWorklist.pop_back_val();

    if (!Visited.insert(Inst).second)
      continue;

    // Copies and REG_SEQUENCE do not contribute to the final assembly
    // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
    if (Inst->isCopy() || Inst->isRegSequence()) {
      if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
        if (!Inst->isCopy() ||
            !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
          Info.NumSVCopies++;
          continue;
        }
      }
    }

    SiblingPenalty[Inst].insert(Info.ID);

    SmallVector<MachineInstr *, 4> Users;
    // SCC has no virtual-register use list: for SCC-defining instructions,
    // scan forward in the block until SCC is redefined, collecting readers.
    if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
        (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
      auto I = Inst->getIterator();
      auto E = Inst->getParent()->end();
      while (++I != E &&
             !I->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) {
        if (I->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr))
          Users.push_back(&*I);
      }
    } else if (Inst->getNumExplicitDefs() != 0) {
      Register Reg = Inst->getOperand(0).getReg();
      if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst))
        for (auto &U : MRI->use_instructions(Reg))
          Users.push_back(&U);
    }
    for (auto U : Users) {
      if (TII->isSALU(*U))
        Info.SChain.insert(U);
      AnalysisWorklist.push_back(U);
    }
  }
  V2SCopies[Info.ID] = Info;
}

// The main function that computes the VGPR to SGPR copy score
// and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
  // A copy feeding no SALU chain has nothing to gain from staying scalar.
  if (Info->SChain.empty()) {
    Info->Score = 0;
    return true;
  }
  // Siblings = the other V2S copies sharing the chain instruction with the
  // largest SiblingPenalty set (excluding ourselves).
  Info->Siblings = SiblingPenalty[*llvm::max_element(
      Info->SChain, [&](MachineInstr *A, MachineInstr *B) -> bool {
        return SiblingPenalty[A].size() < SiblingPenalty[B].size();
      })];
  Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });
  // The loop below computes the number of another VGPR to SGPR V2SCopies
  // which contribute to the current copy SALU chain. We assume that all the
  // V2SCopies with the same source virtual register will be squashed to one
  // by regalloc. Also we take care of the V2SCopies of the different subregs
  // of the same register.
  SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
  for (auto J : Info->Siblings) {
    auto InfoIt = V2SCopies.find(J);
    if (InfoIt != V2SCopies.end()) {
      MachineInstr *SiblingCopy = InfoIt->second.Copy;
      if (SiblingCopy->isImplicitDef())
        // the COPY has already been MoveToVALUed
        continue;

      SrcRegs.insert(std::pair(SiblingCopy->getOperand(1).getReg(),
                               SiblingCopy->getOperand(1).getSubReg()));
    }
  }
  Info->SiblingPenalty = SrcRegs.size();

  // Score = (# SALU instructions kept scalar) minus the costs of staying
  // scalar; below 3 the copy is converted to VALU instead.
  unsigned Penalty =
      Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
  unsigned Profit = Info->SChain.size();
  Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
  Info->NeedToBeConvertedToVALU = Info->Score < 3;
  return Info->NeedToBeConvertedToVALU;
}

/// Lower all analyzed V2S copies. Copies scored for conversion are moved to
/// the VALU (which may cascade: converting one copy can lower a sibling's
/// score, so siblings are re-scored and re-queued). The survivors are turned
/// into v_readfirstlane_b32 — directly for <=32-bit sources, or per 32-bit
/// channel assembled with a REG_SEQUENCE for wider ones.
void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : V2SCopies) {
    if (needToBeConvertedToVALU(&C.second))
      LoweringWorklist.push_back(C.second.ID);
  }

  // Store all the V2S copy instructions that need to be moved to VALU
  // in the Copies worklist.
  SIInstrWorklist Copies;

  while (!LoweringWorklist.empty()) {
    unsigned CurID = LoweringWorklist.pop_back_val();
    auto CurInfoIt = V2SCopies.find(CurID);
    if (CurInfoIt != V2SCopies.end()) {
      // Copy the info: the map entry is erased below while C is still used.
      V2SCopyInfo C = CurInfoIt->second;
      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
      for (auto S : C.Siblings) {
        auto SibInfoIt = V2SCopies.find(S);
        if (SibInfoIt != V2SCopies.end()) {
          V2SCopyInfo &SI = SibInfoIt->second;
          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
          if (!SI.NeedToBeConvertedToVALU) {
            // This copy's chain no longer counts toward the sibling's profit.
            SI.SChain.set_subtract(C.SChain);
            if (needToBeConvertedToVALU(&SI))
              LoweringWorklist.push_back(SI.ID);
          }
          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
        }
      }
      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                        << " is being turned to VALU\n");
      // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if
      // instead.
      V2SCopies.erase(C.ID);
      Copies.insert(C.Copy);
    }
  }

  TII->moveToVALU(Copies, MDT);
  Copies.clear();

  // Now do actual lowering
  for (auto C : V2SCopies) {
    MachineInstr *MI = C.second.Copy;
    MachineBasicBlock *MBB = MI->getParent();
    // We decide to turn V2S copy to v_readfirstlane_b32
    // remove it from the V2SCopies and remove it from all its siblings
    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
                      << " is being turned to v_readfirstlane_b32"
                      << " Score: " << C.second.Score << "\n");
    Register DstReg = MI->getOperand(0).getReg();
    Register SrcReg = MI->getOperand(1).getReg();
    unsigned SubReg = MI->getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
    if (SrcSize == 16) {
      // HACK to handle possible 16bit VGPR source
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
    } else if (SrcSize == 32) {
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, SubReg);
    } else {
      // Wide source: readfirstlane each 32-bit channel into its own SGPR
      // and reassemble the result with a REG_SEQUENCE.
      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
      for (int i = 0; i < N; i++) {
        Register PartialSrc = TII->buildExtractSubReg(
            Result, *MRI, MI->getOperand(1), SrcRC,
            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
        Register PartialDst =
            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*MBB, *Result, Result->getDebugLoc(),
                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
            .addReg(PartialSrc);
        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
      }
    }
    MI->eraseFromParent();
  }
}

/// Replace COPYs that read or write the SCC flag register with explicit
/// scalar sequences: a copy FROM SCC becomes S_CSELECT -1/0 into a boolean
/// register, a copy TO SCC becomes an S_AND of the source with EXEC (whose
/// implicit SCC def materializes the flag). The iterator is repositioned
/// after the inserted instructions before erasing the original copy.
void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
  bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;
      // May already have been lowered.
      if (!MI.isCopy())
        continue;
      Register SrcReg = MI.getOperand(1).getReg();
      Register DstReg = MI.getOperand(0).getReg();
      if (SrcReg == AMDGPU::SCC) {
        Register SCCCopy = MRI->createVirtualRegister(
            TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
        I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(),
                    TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
                                      : AMDGPU::S_CSELECT_B64),
                    SCCCopy)
                .addImm(-1)
                .addImm(0);
        I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
                    TII->get(AMDGPU::COPY), DstReg)
                .addReg(SCCCopy);
        MI.eraseFromParent();
        continue;
      }
      if (DstReg == AMDGPU::SCC) {
        unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
        Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
        Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
        I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(), TII->get(Opcode))
                .addReg(Tmp, getDefRegState(true))
                .addReg(SrcReg)
                .addReg(Exec);
        MI.eraseFromParent();
      }
    }
  }
}