//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

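// Post-RA pass that rewrites the exec-mask manipulation sequences emitted by
// control-flow lowering into more compact forms: folding
// copy-from-exec / s_<op> / copy-to-exec chains into s_<op>_saveexec,
// turning v_cmp + s_and_saveexec into s_mov + v_cmpx (GFX10.3+), and turning
// s_or_saveexec + s_xor into s_andn2_saveexec.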
class SIOptimizeExecMasking : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

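  // Candidates recorded while scanning each block and consumed at the end of
  // runOnMachineFunction:
  //  - SaveExecVCmpMapping: s_and_saveexec instructions mapped to the v_cmp
  //    that can be folded into a v_cmpx.
  //  - OrXors: s_or_saveexec / s_xor pairs that can become s_andn2_saveexec.
  //  - KillFlagCandidates: kill flags that must be cleared when a fold is
  //    performed.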
  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

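// Map a plain s_<op>_b32/b64 opcode to its s_<op>_saveexec form, or
// INSTRUCTION_LIST_END if no saveexec form exists.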
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

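// Search backwards from I (within InstLimit instructions) for a copy from
// exec; return its iterator, or MBB.rend() if none is found.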
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LiveRegUnits doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for at most MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance matching a particular target instruction. Returns nullptr
// prematurely if one of the registers given in NonModifiableRegs is modified
// by the current instruction. If Terminator and KillFlagCandidates are
// provided, kill flags of registers overlapping NonModifiableRegs in
// instructions other than Terminator are recorded so they can be cleared
// later.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction, that
      // would not be detected by clearKillFlags, since they will cause the
      // register to be dead at a later place, causing the verifier to fail.
      // We use the candidates to clear the kill flags later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as the separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;
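    // OtherUseInsts collects readers of the saveexec result other than the
    // final copy back to exec; they are rewritten to read exec directly once
    // the sequence is folded.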

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
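  // If the saveexec result (the old exec mask) is still used, materialize it
  // with an s_mov of exec before the v_cmpx below overwrites exec.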
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find a v_cmp ..., s_and_saveexec sequence that can be optimized by
  // looking at this instance of an s_and_saveexec instruction. VCmp is set to
  // the v_cmp instruction if it is safe to replace the sequence (see the
  // conditions in the function body). This is after register allocation, so
  // some checks on operand dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand in
  // between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
  // s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}


bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
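  // Scan a small window at the bottom of each block (stopping early at the
  // first write to exec) and record candidate s_or_saveexec/s_xor and
  // v_cmp/s_and_saveexec sequences for the rewrites below.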
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}