//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
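  // Regions shorter than this many instructions are not worth branching over
  // with an S_CBRANCH_EXECZ skip.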
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  std::pair<MachineBasicBlock *, MachineBasicBlock *>
  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                               const MachineRegisterInfo &MRI,
                               const MachineInstr &MI,
                               MachineBasicBlock &LoopBB,
                               MachineBasicBlock &RemainderBB,
                               unsigned SaveReg,
                               const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;


FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

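// Decide whether a skip branch from \p From to \p To is needed: returns true
// if the region contains at least SkipThreshold instructions, or a VCC branch
// that could loop forever when EXEC is zero.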
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is a number of bytes, assuming the
        // longest instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

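// Insert an S_CBRANCH_EXECZ to \p To if the skipped region is long enough to
// make the branch worthwhile.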
void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

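// In a pixel shader, if all lanes have been killed (EXEC is zero), export to
// the null target and terminate the wavefront instead of executing the rest
// of the function. Returns true if the block was split to do so.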
bool SILowerControlFlow::skipIfDead(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();

  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  LivePhysRegs RemainderLiveRegs(TRI);
  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *SkipBB;
  MachineBasicBlock *RemainderBB;
  std::tie(SkipBB, RemainderBB) = splitBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(RemainderBB);

  MBB.addSuccessor(RemainderBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB->addLiveIn(Reg);
  }

  return true;
}

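// Lower SI_IF: AND the condition into EXEC (saving the old mask), and store
// the mask of lanes for which the condition is false in the destination
// register for the matching SI_ELSE / SI_END_CF.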
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

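// Lower SI_ELSE: re-enable the lanes saved by SI_IF and turn off the lanes
// that already executed the Then block.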
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

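// Lower SI_BREAK: add the currently active lanes (EXEC) to the accumulated
// break mask.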
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

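// Lower SI_IF_BREAK: add the lanes for which the break condition (VCC) holds
// to the accumulated break mask.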
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

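// Lower SI_ELSE_BREAK: merge the break mask produced in the Else block with
// the saved break mask.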
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

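// Lower SI_LOOP: disable all lanes that have taken a break, then branch back
// to the loop header while any lanes remain active.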
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

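// Lower SI_END_CF: re-enable the lanes that were masked off for this control
// flow region.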
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

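// Remove an S_BRANCH to the immediately following block, since it is just a
// fall-through.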
void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

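// Lower SI_KILL: turn off the EXEC bit of every lane whose source operand is
// negative.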
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                                                 const MachineRegisterInfo &MRI,
                                                 const MachineInstr &MI,
                                                 MachineBasicBlock &LoopBB,
                                                 MachineBasicBlock &RemainderBB,
                                                 unsigned SaveReg,
                                                 const MachineOperand &IdxReg) {
  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());
  LoopBB.sortUniqueLiveIns();
}

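// Emit the body of the waterfall loop: read the index of the first active lane
// into M0, enable exactly the lanes that share that index, perform the movrel,
// then clear those lanes from EXEC and repeat until no lanes remain.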
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset != 0) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}

std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) {
  MachineFunction *MF = MBB.getParent();

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}

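// Load the index operand into M0. A uniform (SGPR) index is a simple move or
// add; a divergent (VGPR) index requires a waterfall loop over the active
// lanes.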
// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    if (Offset != 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  LivePhysRegs RemainderLiveRegs(TRI);

  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;

  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  LoopBB->addSuccessor(RemainderBB);
  LoopBB->addSuccessor(LoopBB);

  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
                          *RemainderBB, Save, *Idx);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}

/// Compute the register to access directly and the remaining offset for an
/// indirect access into a vector register.
///
/// \param VecReg The register which holds element zero of the vector being
///               addressed into.
/// \param Offset The constant offset part of the indirect index,
///               e.g. v0 = v[VecReg + Offset].
///
/// \returns The register to use as the base of the access and the constant
/// value that still needs to be added to the index stored in M0.
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
    .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);

  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
      .addOperand(*Val);
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
    .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            HaveKill = false;

            if (skipIfDead(MI)) {
              NextBB = std::next(BI);
              BE = MF.end();
              Next = MBB.end();
            }
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0) {
            if (skipIfDead(MI)) {
              NextBB = std::next(BI);
              BE = MF.end();
              Next = MBB.end();
            }
          } else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          if (indirectSrc(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          if (indirectDst(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}
838