//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one bit per
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void splitBlockLiveIns(const MachineBasicBlock &MBB,
                         const MachineInstr &MI,
                         MachineBasicBlock &LoopBB,
                         MachineBasicBlock &RemainderBB,
                         unsigned SaveReg,
                         const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

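// Heuristic for deciding whether a skip branch is worth emitting: count the
// instructions that would actually execute between From and To. If we reach
// SkipThreshold, or find a VCC branch that would never be taken with
// EXEC == 0, the skip is considered worthwhile; for shorter regions simply
// falling through with all lanes disabled is assumed to be cheaper than a
// taken branch.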
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (++NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3);
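  // The immediate of 3 is the branch offset in dwords: it presumably covers
  // the EXP below (64-bit encoding, two dwords) plus S_ENDPGM (one dword).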

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

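  // S_AND_SAVEEXEC_B64 copies the old EXEC into Reg and sets
  // EXEC = old EXEC & Vcc, i.e. the lanes that enter the THEN block. The
  // S_XOR_B64 below then rewrites Reg to old EXEC ^ new EXEC, the lanes that
  // were live but failed the condition, which is the mask a matching SI_ELSE
  // consumes.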
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

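  // Rough picture of the sequence below (assuming the usual SI_IF/SI_ELSE
  // pairing): S_OR_SAVEEXEC_B64 copies the current EXEC (the lanes that ran
  // the THEN block) into Dst and ORs the saved mask into EXEC; the S_XOR_B64
  // further down then leaves only the ELSE lanes enabled, and Dst is what
  // SI_END_CF later ORs back into EXEC.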
  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

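  // Src holds the accumulated break mask for this loop; clearing those lanes
  // from EXEC and branching back while EXEC is still non-zero keeps only the
  // lanes that want another iteration running.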
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
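    // For a non-constant operand, V_CMPX_LE_F32 writes its per-lane result
    // (0 <= Op) straight into EXEC, so lanes holding a negative value are
    // switched off.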
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
                                           const MachineInstr &MI,
                                           MachineBasicBlock &LoopBB,
                                           MachineBasicBlock &RemainderBB,
                                           unsigned SaveReg,
                                           const MachineOperand &IdxReg) {
  LivePhysRegs RemainderLiveRegs(TRI);

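  // Compute what is live just before MI by starting from MBB's live-outs and
  // stepping backwards over every instruction that follows MI.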
  RemainderLiveRegs.addLiveOuts(MBB);
  for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI);
       I != E; ++I) {
    RemainderLiveRegs.stepBackward(*I);
  }

  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());
  LoopBB.sortUniqueLiveIns();
}

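// Body of the waterfall loop used when the index lives in a VGPR: pick the
// index of the first still-active lane with V_READFIRSTLANE_B32, move it into
// M0, run MovRel for every lane that happens to share that index, mask those
// lanes out of EXEC, and repeat until EXEC is empty.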
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
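    // A uniform (SGPR) index can be copied straight into M0; no per-lane
    // waterfall loop is required.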
    if (Offset) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineFunction &MF = *MBB.getParent();
  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  MBB.addSuccessor(LoopBB);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}

/// \param @VecReg The register which holds element zero of the vector
///                 being addressed into.
/// \param[out] @Reg The base register to use in the indirect addressing instruction.
/// \param[in,out] @Offset As an input, this is the constant offset part of the
///                        indirect Index. e.g. v0 = v[VecReg + Offset]
///                        As an output, this is a constant value that needs
///                        to be added to the value stored in M0.
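/// Illustrative example (hypothetical register numbers): for a source vector
/// in v[8:11] and a constant Offset of 2, the offset is folded into the base
/// register and the pair (VGPR10, 0) is returned; if the folded index were
/// negative it would be clamped to element zero and the remainder left in
/// Offset so that it gets added to M0 instead.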
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
                                                int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);

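  // V_MOVRELS_B32 reads from the VGPR addressed by Reg's hardware index plus
  // M0; the implicit use of SrcVec models the dependence on the whole vector.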
  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
    .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Off);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  unsigned Reg;

  std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
    .addReg(Reg, RegState::Define)
    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
    .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Off);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

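  // Depth tracks how deeply nested we are in SI_IF/SI_LOOP regions; HaveKill
  // defers the "skip if all lanes are dead" check for a SI_KILL seen inside
  // such a region until the matching SI_END_CF brings Depth back to zero.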
  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->definesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          if (indirectSrc(MI)) {
            // The block was split at this point. We can safely skip past the
            // newly inserted middle block to the following block, which
            // contains the rest of this block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          if (indirectDst(MI)) {
            // The block was split at this point. We can safely skip past the
            // newly inserted middle block to the following block, which
            // contains the rest of this block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}
763