//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Prepare the exec mask for the ELSE block
/// %exec = S_XOR_B64 %sgpr0, %exec   // Update the exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0    // Re-enable saved exec mask bits
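///
/// On wave32 subtargets the same lowering is emitted with the corresponding
/// 32-bit opcodes (S_AND_B32, S_OR_SAVEEXEC_B32, ...) operating on EXEC_LO;
/// see runOnMachineFunction below.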
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

static cl::opt<bool>
RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
    cl::init(true), cl::ReallyHidden);

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;

  const TargetRegisterClass *BoolRC = nullptr;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);

  MachineBasicBlock *emitEndCf(MachineInstr &MI);

  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

  bool removeMBBifRedundant(MachineBasicBlock &MBB);

  MachineBasicBlock *process(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

  /// Find the insertion point for a new conditional branch.
  MachineBasicBlock::iterator
  skipToUncondBrOrEnd(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I) const {
    assert(I->isTerminator());

    // FIXME: What if we had multiple pre-existing conditional branches?
    MachineBasicBlock::iterator End = MBB.end();
    while (I != End && !I->isUnconditionalBranch())
      ++I;
    return I;
  }

  // Remove redundant SI_END_CF instructions.
  void optimizeEndCf();

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
               "SI lower control flow", false, false)

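// Set or clear the "dead" flag on the implicit SCC def (operand 3) of a mask
// operation.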
static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

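// Return true if any block reachable from Begin's successors, without passing
// through End, contains a SI_KILL_*_TERMINATOR.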
static bool hasKill(const MachineBasicBlock *Begin,
                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == End || !Visited.insert(MBB).second)
      continue;
    for (auto &Term : MBB->terminators())
      if (TII->isKillTerminator(Term.getOpcode()))
        return true;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return false;
}

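// An SI_IF is "simple" if the only non-debug use of the saved exec mask it
// defines is the matching SI_END_CF.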
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  return true;
}

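// Lower SI_IF: copy EXEC, AND it with the branch condition, install the result
// as the new EXEC mask, and insert an S_CBRANCH_EXECZ over the "then" region
// for the case where no lanes remain active.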
void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = MI.getOperand(0).getReg();
  MachineOperand& Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of the save exec register and that use is
  // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
  // instead of just the cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI);

  if (SimpleIf) {
    // Check for SI_KILL_*_TERMINATOR on the path from the if to the endif.
    // If there is any such terminator, the simplification is not safe.
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
  }

  // Add an implicit def of exec to discourage scheduling VALU after this, which
  // would interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ? SaveExecReg
                       : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
    .addReg(Exec)
    .addReg(Exec, RegState::ImplicitDefine);
  LoweredIf.insert(CopyReg);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
    BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
    .addReg(CopyReg)
    .add(Cond);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
      BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
      .addReg(Tmp)
      .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement
  // with the fast register allocator.
  MachineInstr *SetExec =
    BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
    .addReg(Tmp, RegState::Kill);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  I = skipToUncondBrOrEnd(MBB, I);

  // Insert the S_CBRANCH_EXECZ instruction, which will be optimized later
  // by SIRemoveShortExecBranches.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace the SI_IF with the AND so we don't need to fix the live interval
  // for the condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}

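// Lower SI_ELSE: re-derive the exec mask for the else region from the mask
// saved by SI_IF (S_OR_SAVEEXEC + S_AND + S_XOR_term on EXEC) and insert an
// S_CBRANCH_EXECZ over the region for the case where no lanes remain active.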
void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();

  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = MRI->createVirtualRegister(BoolRC);
  MachineInstr *OrSaveExec =
    BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
    .add(MI.getOperand(1)); // Saved EXEC

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  // This accounts for any modification of the EXEC mask within the block and
  // can be optimized out pre-RA when not required.
  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
                          .addReg(Exec)
                          .addReg(SaveReg);

  if (LIS)
    LIS->InsertMachineInstrInMaps(*And);

  MachineInstr *Xor =
    BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
    .addReg(Exec)
    .addReg(DstReg);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}

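// Lower SI_IF_BREAK: OR the break condition (masked by EXEC unless it is
// already a VALU result in the same block) into the loop-exit mask.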
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    Register AndReg = MRI->createVirtualRegister(BoolRC);
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
             .addReg(Exec)
             .add(MI.getOperand(1));
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .addReg(AndReg)
             .add(MI.getOperand(2));
    if (LIS)
      LIS->createAndComputeVirtRegInterval(AndReg);
  } else
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}

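// Lower SI_LOOP: clear the lanes that have finished the loop from EXEC with
// S_ANDN2_term and branch back to the target block with S_CBRANCH_EXECNZ
// while any lanes remain active.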
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));

  auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
  MachineInstr *Branch =
      BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

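// Advance past instructions that do not read EXEC, following a chain of
// single-successor blocks; returns MBB.end() if no EXEC-reading instruction
// is found (see the declaration above).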
MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

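// Lower SI_END_CF: re-enable the lanes saved at the matching SI_IF/SI_ELSE by
// ORing the saved mask back into EXEC. The block is split first if the saved
// mask register is modified earlier in the block, so the OR can be emitted as
// a terminator and spills are placed correctly.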
MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();

  // If we have instructions that aren't prolog instructions, split the block
  // and emit a terminator instruction. This ensures correct spill placement.
  // FIXME: We should unconditionally split the block here.
  bool NeedBlockSplit = false;
  Register DataReg = MI.getOperand(0).getReg();
  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(DataReg, TRI)) {
      NeedBlockSplit = true;
      break;
    }
  }

  unsigned Opcode = OrOpc;
  MachineBasicBlock *SplitBB = &MBB;
  if (NeedBlockSplit) {
    SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
    Opcode = OrTermrOpc;
    InsPt = MI;
  }

  MachineInstr *NewMI =
    BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
    .addReg(Exec)
    .add(MI.getOperand(0));

  LoweredEndCf.insert(NewMI);

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
  return SplitBB;
}

// Returns replacement operands for a logical operation: either the single
// result for exec, or two operands if the source was another equivalent
// operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Op.getReg().isVirtual()) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with an implicitly defined exec inserted earlier is an exception;
  // it does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}

// Search for and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}

void SILowerControlFlow::optimizeEndCf() {
  // If the only instruction immediately following this END_CF is another
  // END_CF in the only successor, we can avoid emitting the exec mask restore
  // here.
  if (!RemoveRedundantEndcf)
    return;

  for (MachineInstr *MI : LoweredEndCf) {
    MachineBasicBlock &MBB = *MI->getParent();
    auto Next =
      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
      continue;
    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
    // If it belongs to an SI_ELSE, the saved mask has an inverted value.
    Register SavedExec
      = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");

    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
    if (Def && LoweredIf.count(SavedExec)) {
      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
      if (LIS)
        LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      removeMBBifRedundant(MBB);
    }
  }
}

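// Lower a single control-flow pseudo, then run combineMasks over the
// surrounding exec-mask bit operations. Returns the block that now holds the
// instructions following the pseudo (a new block if emitEndCf split it).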
MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::iterator I(MI);
  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;

  MachineBasicBlock *SplitBB = &MBB;

  switch (MI.getOpcode()) {
  case AMDGPU::SI_IF:
    emitIf(MI);
    break;

  case AMDGPU::SI_ELSE:
    emitElse(MI);
    break;

  case AMDGPU::SI_IF_BREAK:
    emitIfBreak(MI);
    break;

  case AMDGPU::SI_LOOP:
    emitLoop(MI);
    break;

  case AMDGPU::SI_END_CF:
    SplitBB = emitEndCf(MI);
    break;

  default:
    assert(false && "Attempt to process unsupported instruction");
    break;
  }

  MachineBasicBlock::iterator Next;
  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
    Next = std::next(I);
    MachineInstr &MaskMI = *I;
    switch (MaskMI.getOpcode()) {
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32:
      // Clean up bit manipulations on the exec mask
      combineMasks(MaskMI);
      break;
    default:
      I = MBB.end();
      break;
    }
  }

  return SplitBB;
}

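// Lower SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT: initialize EXEC at the start
// of the block, either from an immediate or from a thread count held in an
// SGPR input.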
void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
                                       MachineInstr &MI) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool IsWave32 = ST.isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
        .addImm(MI.getOperand(0).getImm());
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(MI);
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, move the insertion
        // point past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST.getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

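// Erase MBB if it contains nothing but debug instructions and, at most, an
// unconditional branch. Predecessors are rerouted to the single successor, and
// a fallthrough is preserved by moving the successor or inserting an S_BRANCH.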
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
    auto *S = B->getNextNode();
    if (!S)
      return nullptr;
    if (B->isSuccessor(S)) {
      // The only fallthrough candidate
      MachineBasicBlock::iterator I(B->getFirstInstrTerminator());
      MachineBasicBlock::iterator E = B->end();
      for (; I != E; I++) {
        if (I->isBranch() && TII->getBranchDestBlock(*I) == S)
          // There is an unoptimized branch to the layout successor.
          return nullptr;
      }
    }
    return S;
  };

  for (auto &I : MBB.instrs()) {
    if (!I.isDebugInstr() && !I.isUnconditionalBranch())
      return false;
  }

  assert(MBB.succ_size() == 1 && "MBB has more than one successor");

  MachineBasicBlock *Succ = *MBB.succ_begin();
  MachineBasicBlock *FallThrough = nullptr;

  while (!MBB.predecessors().empty()) {
    MachineBasicBlock *P = *MBB.pred_begin();
    if (GetFallThroughSucc(P) == &MBB)
      FallThrough = P;
    P->ReplaceUsesOfBlockWith(&MBB, Succ);
  }
  MBB.removeSuccessor(Succ);
  if (LIS) {
    for (auto &I : MBB.instrs())
      LIS->RemoveMachineInstrFromMaps(I);
  }
  MBB.clear();
  MBB.eraseFromParent();
  if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
    if (!GetFallThroughSucc(Succ)) {
      MachineFunction *MF = FallThrough->getParent();
      MachineFunction::iterator FallThroughPos(FallThrough);
      MF->splice(std::next(FallThroughPos), Succ);
    } else
      BuildMI(*FallThrough, FallThrough->end(),
              FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(Succ);
  }

  return true;
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrTermrOpc = AMDGPU::S_OR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrTermrOpc = AMDGPU::S_OR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin();
       BI != MF.end(); BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock *MBB = &*BI;

    MachineBasicBlock::iterator I, E, Next;
    E = MBB->end();
    for (I = MBB->begin(); I != E; I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;
      MachineBasicBlock *SplitMBB = MBB;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
        SplitMBB = process(MI);
        break;

      case AMDGPU::SI_ELSE:
      case AMDGPU::SI_IF_BREAK:
      case AMDGPU::SI_LOOP:
      case AMDGPU::SI_END_CF:
        // Only build worklist if SI_IF instructions must be processed first.
        SplitMBB = process(MI);
        break;

      // FIXME: find a better place for this
      case AMDGPU::SI_INIT_EXEC:
      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
        lowerInitExec(MBB, MI);
        if (LIS)
          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
        break;

      default:
        break;
      }

      if (SplitMBB != MBB) {
        MBB = Next->getParent();
        E = MBB->end();
      }
    }
  }

  optimizeEndCf();

  LoweredEndCf.clear();
  LoweredIf.clear();

  return true;
}