//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
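//
/// \file
/// This pass runs after register allocation and cleans up exec mask
/// manipulation sequences: pseudo terminators such as S_MOV_B64_term are
/// turned back into their ordinary forms, copy-from-exec / s_<op> /
/// copy-to-exec triples are folded into a single s_<op>_saveexec, and on
/// GFX10.3+ targets a v_cmp feeding an s_and_saveexec is rewritten into
/// s_mov followed by v_cmpx.
//
//===----------------------------------------------------------------------===//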

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
public:
  static char ID;

public:
  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() &&
        Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() &&
        Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
        MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

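// Map a scalar logical opcode to its exec-saving form (e.g. S_AND_B64 ->
// S_AND_SAVEEXEC_B64), or INSTRUCTION_LIST_END if no such form exists.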
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}

// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
static MachineBasicBlock::reverse_iterator fixTerminators(
  const SIInstrInfo &TII,
  MachineBasicBlock &MBB) {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(TII, *I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}

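// Scan backwards from \p I for a copy from exec, giving up after InstLimit
// instructions. Returns an iterator to the copy, or MBB.rend() if none is
// found within the limit.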
static MachineBasicBlock::reverse_iterator findExecCopy(
  const SIInstrInfo &TII,
  const GCNSubtarget &ST,
  MachineBasicBlock &MBB,
  MachineBasicBlock::reverse_iterator I,
  unsigned CopyToExec) {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I, ST);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

// Backwards-iterate from Origin (for at most MaxInstructions iterations) until
// either the beginning of the BB is reached or Pred evaluates to true - which
// can be an arbitrary condition based on the current MachineInstr, for
// instance a target instruction. Breaks prematurely by returning nullptr if
// one of the registers given in NonModifiableRegs is modified by the current
// instruction.
static MachineInstr *
findInstrBackwards(MachineInstr &Origin,
                   std::function<bool(MachineInstr *)> Pred,
                   ArrayRef<MCRegister> NonModifiableRegs,
                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;
    }

    ++CurrentIteration;
  }

  return nullptr;
}

// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and
// not defined in between the instructions.
static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
                                 const SIRegisterInfo *TRI,
                                 MachineRegisterInfo &MRI) {
  LivePhysRegs LR(*TRI);
  LR.addLiveOuts(*Stop.getParent());

  for (auto A = Stop.getParent()->rbegin();
       A != Stop.getParent()->rend() && A != Stop; ++A) {
    LR.stepBackward(*A);
  }

  return !LR.available(MRI, Reg);
}

// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
// sequence by looking at an instance of an s_and_saveexec instruction. Returns
// a pointer to the v_cmp instruction if it is safe to replace the sequence
// (see the conditions in the function body). This is after register
// allocation, so some checks on operand dependencies need to be considered.
static MachineInstr *findPossibleVCMPVCMPXOptimization(
    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {

  MachineInstr *VCmp = nullptr;

  Register SaveExecDest = SaveExec.getOperand(0).getReg();
  if (!TRI->isSGPRReg(MRI, SaveExecDest))
    return nullptr;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec in between.
  VCmp = findInstrBackwards(
      SaveExec,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()}, TRI);

  if (!VCmp)
    return nullptr;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
      SaveExec.modifiesRegister(Src0->getReg(), TRI))
    return nullptr;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
      SaveExec.modifiesRegister(Src1->getReg(), TRI))
    return nullptr;

  // Don't do the transformation if the destination operand is included in
  // its MBB live-outs, meaning it's used in any of its successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return nullptr;

  // If the v_cmp target is in use after the s_and_saveexec, skip the
  // optimization.
  if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
    return nullptr;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  if (!findInstrBackwards(
          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
          NonDefRegs, TRI))
    return nullptr;

  return VCmp;
}

// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                         MachineInstr &VCmp, MCRegister Exec,
                                         const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         MachineRegisterInfo &MRI) {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX implicitly writes to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
      -1)
    Builder.addImm(0);

  Builder.add(*Src0);

  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
      -1)
    Builder.addImm(0);

  Builder.add(*Src1);

  if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
    Builder.addImm(0);

  return true;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo *MRI = &MF.getRegInfo();
  MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as separate operations because spill code may need to be
  // inserted for the saved copy of exec.
  //
  //     x = copy exec
  //     z = s_<op>_b64 x, y
  //     exec = copy z
  // =>
  //     x = s_<op>_saveexec_b64 y
  //

  bool Changed = false;
  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I, ST);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // is rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
      .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec,
                                    AMDGPU::NoSubRegister, *TRI);
    }

    Changed = true;
  }

  // After all s_op_saveexec instructions are inserted,
  // replace (on GFX10.3 and later)
  // v_cmp_* SGPR, IMM, VGPR
  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
  // with
  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
  // v_cmpx_* IMM, VGPR
  // to reduce pipeline stalls.
  if (ST.hasGFX10_3Insts()) {
    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
    const unsigned AndSaveExecOpcode =
        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        // Record relevant v_cmp / s_and_saveexec instruction pairs for
        // replacement.
        if (MI.getOpcode() != AndSaveExecOpcode)
          continue;

        if (MachineInstr *VCmp =
                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
          SaveExecVCmpMapping[&MI] = VCmp;
      }
    }

    for (const auto &Entry : SaveExecVCmpMapping) {
      MachineInstr *SaveExecInstr = Entry.getFirst();
      MachineInstr *VCmpInstr = Entry.getSecond();

      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
                                       TRI, *MRI)) {
        SaveExecInstr->eraseFromParent();
        VCmpInstr->eraseFromParent();

        Changed = true;
      }
    }
  }

  return Changed;
}