//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;
    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LiveRegUnits doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for at most MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance matching a specific target instruction. Breaks prematurely by
// returning nullptr if one of the registers given in NonModifiableRegs is
// modified by the current instruction.
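// Note that this is a purely local scan within a single basic block. In this
// pass it is used, for example, by tryRecordVCmpxAndSaveexecSequence to find
// the V_CMP that defines the input of an S_AND_SAVEEXEC while making sure
// that neither EXEC nor that input register is clobbered in between.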
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction and that
      // would not be detected by clearKillFlags, since they will cause the
      // register to be dead at a later place, causing the verifier to fail.
      // We use the candidates to clear the kill flags later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
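// With UseLiveOuts set, the block's live-outs seed the liveness computation,
// which is how isRegisterInUseAfter below extends the query to the range
// (Stop..BB.end]. Reserved registers are conservatively reported as in use.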
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
// x = copy exec
// z = s_<op>_b64 x, y
// exec = copy z
// =>
// x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy before it
        // is rewritten by the saveexec, i.e. it effectively has one use. There
        // may have been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();
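    // Fuse the logical op and the exec copies into a single s_<op>_saveexec
    // that writes the old exec value into the register the original
    // copy-from-exec defined (see the pattern in the comment above
    // optimizeExecSequence).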
    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
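  // Operands are appended in the order the original V_CMP carries them:
  // src0_modifiers, src0, src1_modifiers, src1 and clamp; modifier operands
  // the V_CMP does not have are simply skipped.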
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find an opportunity to optimize a v_cmp ..., s_and_saveexec
  // sequence by looking at an instance of an s_and_saveexec instruction.
  // The v_cmp is only recorded if it is safe to replace the sequence (see
  // the conditions in the function body).
  // This is after register allocation, so some checks on operand dependencies
  // need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand
  // in between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after
  // the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
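  // Walk backwards from the saveexec to the v_cmp, requiring that neither of
  // the v_cmp source registers is redefined on the way; kill flags found on
  // them between the v_cmp and the saveexec are recorded so they can be
  // cleared later if the sequence is actually rewritten.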
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;
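  // The rewrite is sound because s_or_saveexec leaves s_o = old exec and
  // exec = old exec | s_i, so the following xor with s_o yields
  // exec = s_i & ~(old exec), exactly what s_andn2_saveexec s_o, s_i
  // computes while still saving the old exec value into s_o.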
  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}