//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void splitBlockLiveIns(const MachineBasicBlock &MBB,
                         const MachineInstr &MI,
                         MachineBasicBlock &LoopBB,
                         MachineBasicBlock &RemainderBB,
                         unsigned SaveReg,
                         unsigned IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel, unsigned IdxReg, int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;


FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

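// Conservatively estimate whether it is worth skipping the blocks from From
// up to (but not including) To when EXEC is zero: returns true if the region
// contains at least SkipThreshold real instructions, or an S_CBRANCH_VCC*
// that could spin forever with all lanes disabled.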
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (++NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

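// Insert an S_CBRANCH_EXECZ that jumps to To when no lanes are active, but
// only if shouldSkip decides the region being skipped is long enough for the
// branch to pay off.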
void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

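// For pixel shaders with a long remainder: if every lane has been killed,
// export to the NULL target and terminate the wavefront early instead of
// executing the rest of the shader with an all-zero exec mask.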
void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

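// Lower SI_IF: restrict EXEC to the lanes where the condition holds and keep
// the lanes destined for the ELSE side in Reg:
//   Reg = S_AND_SAVEEXEC_B64 Vcc   (EXEC &= Vcc, Reg = old EXEC)
//   Reg = S_XOR_B64 EXEC, Reg      (Reg = lanes that still need the ELSE path)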
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Reg)
    .addOperand(MI.getOperand(2));

  MI.eraseFromParent();
}

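// Lower SI_ELSE: Dst receives the exec mask that was live at the end of the
// THEN region, and EXEC is flipped so that only the lanes still waiting to
// run the ELSE region remain active.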
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Dst)
    .addOperand(MI.getOperand(2));

  MI.eraseFromParent();
}

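// Lower SI_BREAK: accumulate the currently active lanes into the break mask
// (Dst = EXEC | Src) so the enclosing SI_LOOP can remove them from EXEC.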
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

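// Lower SI_LOOP: clear the lanes that have already broken out of the loop
// (Src) from EXEC, and branch back to the loop header while any lane is
// still active.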
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

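// Lower SI_END_CF: OR the saved mask back into EXEC at the top of the join
// block so the lanes that were disabled for this control-flow region resume
// execution.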
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

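// Lower SI_KILL: a compile-time negative immediate zeroes EXEC outright;
// otherwise V_CMPX_LE_F32 with 0 as the first source clears the EXEC bit of
// every lane whose operand compares as less than zero.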
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
                                           const MachineInstr &MI,
                                           MachineBasicBlock &LoopBB,
                                           MachineBasicBlock &RemainderBB,
                                           unsigned SaveReg,
                                           unsigned IdxReg) {
  LivePhysRegs RemainderLiveRegs(TRI);

  RemainderLiveRegs.addLiveOuts(MBB);
  for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI);
       I != E; ++I) {
    RemainderLiveRegs.stepBackward(*I);
  }

  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    RemainderLiveRegs.addReg(Val->getReg());
    LoopBB.addLiveIn(Val->getReg());
  }

  for (unsigned Reg : RemainderLiveRegs)
    RemainderBB.addLiveIn(Reg);

  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  LoopBB.addLiveIn(SrcReg);
  LoopBB.addLiveIn(IdxReg);
  LoopBB.sortUniqueLiveIns();
}

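// Emit the body of the waterfall loop: read the index value of the first
// active lane with V_READFIRSTLANE, move it into M0, run MovRel for every
// lane that holds the same index, then clear those lanes from EXEC and
// repeat until all lanes have been serviced.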
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                unsigned IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg);

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg);

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx)->getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    if (Offset) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx)
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx);
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineFunction &MF = *MBB.getParent();
  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  if (TRI->trackLivenessAfterRegAlloc(MF))
    splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, Idx);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}

/// \param VecReg The register which holds element zero of the vector
///               being addressed into.
/// \param[out] Reg The base register to use in the indirect addressing
///                 instruction.
/// \param[in,out] Offset As an input, this is the constant offset part of the
///                       indirect index, e.g. v0 = v[VecReg + Offset].
///                       As an output, this is a constant value that needs
///                       to be added to the value stored in M0.
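///
/// For example (illustrative): addressing into a vector whose element zero
/// lives in v4 with Offset = 2 yields Reg = v6 and Offset = 0; only a
/// negative register index is passed back out through Offset so it can be
/// added to M0 at runtime.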
void SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
                                                     unsigned &Reg,
                                                     int &Offset) {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;

  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  Reg = RC->getRegister(RegIdx);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  computeIndirectRegAndOffset(Vec, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    .addReg(Reg)
    .addReg(Vec, RegState::Implicit);

  return loadM0(MI, MovRel, Off);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)->getReg();
  unsigned Reg;

  computeIndirectRegAndOffset(Dst, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
    .addReg(Reg, RegState::Define)
    .addReg(Val)
    .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Off);
}

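// Walk every block and instruction, tracking the control-flow nesting depth
// and whether EXEC has been written in the current block, and lower each
// control-flow pseudo instruction in place.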
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      for (const auto &Def : I->defs()) {
        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
          ExecModified = true;
          break;
        }
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          if (indirectSrc(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          if (indirectDst(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::S_ENDPGM: {
          if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid())
            break;

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // S_ENDPGM is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
          }

          I->eraseFromParent();
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}