//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIOptimizeExecMasking.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking {
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  MCRegister Exec;

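  // Per-function caches populated while scanning each block: candidate
  // v_cmp / s_and_saveexec pairs, candidate s_or_saveexec / s_xor pairs, and
  // kill-flag operands that must be cleared when a recorded sequence is
  // rewritten.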
  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
  SmallVector<MachineOperand *, 1> KillFlagCandidates;

  Register isCopyFromExec(const MachineInstr &MI) const;
  Register isCopyToExec(const MachineInstr &MI) const;
  bool removeTerminatorBit(MachineInstr &MI) const;
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB,
               MachineBasicBlock::reverse_iterator I) const;
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;

  void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
  bool optimizeOrSaveexecXorSequences();

public:
  bool run(MachineFunction &MF);
};

class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

PreservedAnalyses
SIOptimizeExecMaskingPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &) {
  SIOptimizeExecMasking Impl;

  if (!Impl.run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingLegacy, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIOptimizeExecMaskingLegacy, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMaskingLegacy::ID = 0;

char &llvm::SIOptimizeExecMaskingLegacyID = SIOptimizeExecMaskingLegacy::ID;

/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

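// Map a bitwise exec-mask opcode to its *_SAVEEXEC_* form, or
// INSTRUCTION_LIST_END if the opcode has no saveexec equivalent.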
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

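// Search upwards from I, at most InstLimit instructions, for a copy from exec.
// Returns an iterator to that copy, or MBB.rend() if none is found.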
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - LiveRegUnits does not seem to work correctly here, since it will
// incorrectly report the register as unavailable because a super-register
// with a lane mask is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for at most MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Bails out early by returning nullptr if one
// of the registers given in NonModifiableRegs is modified by the current
// instruction.
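// If Terminator and KillFlagCandidates are provided, kill flags of the watched
// registers seen between Origin and Terminator are collected so the caller can
// clear them after rewriting the sequence.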
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Check for kills that appear after the terminator instruction and
      // would not be caught by clearKillFlags: they would make the register
      // dead at a later point and cause the verifier to fail. Record these
      // candidates so their kill flags can be cleared later.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by calculating liveness backwards from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  if (UseLiveOuts)
    LR.addLiveOuts(*Stop.getParent());

  MachineBasicBlock::reverse_iterator A(Start);

  if (IgnoreStart)
    ++A;

  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(Reg) || MRI->isReserved(Reg);
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}

// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }
        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }
      if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy before it
        // is rewritten by the saveexec, i.e. it effectively has one use. There
        // may have been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX implicitly writes to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();

  return true;
}

// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
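// For illustration, a hypothetical wave32 instance of this rewrite:
//   v_cmp_lt_f32_e32 vcc_lo, 0, v0
//   s_and_saveexec_b32 s0, vcc_lo
// becomes
//   s_mov_b32 s0, exec_lo
//   v_cmpx_lt_f32_e32 0, v0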
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find a v_cmp ..., s_and_saveexec sequence that can be optimized,
  // starting from this s_and_saveexec instruction. VCmp is set to the v_cmp
  // instruction if it is safe to replace the sequence (see the conditions
  // below). This runs after register allocation, so some checks on operand
  // dependencies need to be considered.
  MachineInstr *VCmp = nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand
  // in between.
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      MI.modifiesRegister(Src0->getReg(), TRI))
    return;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      MI.modifiesRegister(Src1->getReg(), TRI))
    return;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, which
  // would lead to incorrect code if the v_cmp, and therefore the def of
  // the dest operand, were removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return;

  // If the v_cmp target is in use between the v_cmp and the s_and_saveexec,
  // or after the s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If so, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  if (VCmp)
    SaveExecVCmpMapping[&MI] = VCmp;
}

// Record occurrences of
// s_or_saveexec s_o, s_i
// s_xor exec, exec, s_o
// to be replaced with
// s_andn2_saveexec s_o, s_i.
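// The rewrite is sound because s_or_saveexec leaves s_o = exec_old and sets
// exec = exec_old | s_i, so the following xor produces
// (exec_old | s_i) ^ exec_old == s_i & ~exec_old, which is exactly the result
// of s_andn2_saveexec s_o, s_i.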
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg()) {
        if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
            (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
          OrXors.emplace_back(&PossibleOrSaveexec, &MI);
        }
      }
    }
  }
}

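// Rewrite each recorded (s_or_saveexec, s_xor) pair as a single
// s_andn2_saveexec that yields the same saved mask and exec value.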
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}

bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIOptimizeExecMasking().run(MF);
}

bool SIOptimizeExecMasking::run(MachineFunction &MF) {
  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
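  // Scan the bottom of every block, at most SearchWindow instructions deep,
  // to record candidate sequences; the scan stops after the first instruction
  // that writes exec.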
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow) {
        break;
      }

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI)) {
        break;
      }

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}