//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the THEN block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void splitBlockLiveIns(const MachineBasicBlock &MBB,
                         const MachineInstr &MI,
                         MachineBasicBlock &LoopBB,
                         MachineBasicBlock &RemainderBB,
                         unsigned SaveReg,
                         const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

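// Decide whether a skip branch over the blocks between From and To is
// worthwhile: returns true if the range contains a scalar VCC branch (which
// must not execute with EXEC = 0) or at least SkipThreshold instructions that
// emit code.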
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (++NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

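// If skipping is worthwhile, insert an S_CBRANCH_EXECZ before From that
// branches to To when no lanes are active.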
void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

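// In pixel shaders, if the exec mask is zero after a kill and the rest of the
// shader is long enough to be worth skipping, export to the null target and
// terminate the wavefront instead of executing it.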
void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

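// Lower SI_IF: limit EXEC to the lanes where the condition holds, record in
// Reg the saved lanes that will instead run the ELSE path, and optionally
// skip over the THEN block when no lanes remain active.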
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Reg)
    .addOperand(MI.getOperand(2));

  MI.eraseFromParent();
}

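// Lower SI_ELSE: recompute the exec mask so the ELSE block runs on the lanes
// that did not take the THEN path, and save in Dst the mask needed by the
// matching SI_END_CF.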
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Dst)
    .addOperand(MI.getOperand(2));

  MI.eraseFromParent();
}

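// Lower SI_BREAK: OR the currently active lanes into the loop's accumulated
// break mask.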
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

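// Lower SI_IF_BREAK: OR the lanes for which the break condition holds (Vcc)
// into the accumulated break mask.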
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

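// Lower SI_ELSE_BREAK: merge the break mask produced inside the ELSE region
// with the break mask accumulated so far.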
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

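// Lower SI_LOOP: clear the lanes that have broken out of the loop from EXEC
// and branch back to the loop header while any lanes remain active.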
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

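// Lower SI_END_CF: re-enable the lanes that were disabled for this control
// flow region by ORing the saved mask back into EXEC.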
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

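// An S_BRANCH to the immediately following block is a fall-through and can be
// removed.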
void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

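// Lower SI_KILL: deactivate lanes whose operand is negative. A negative
// immediate clears the whole exec mask; a register operand is compared
// against zero with V_CMPX_LE_F32, clearing the lanes where the comparison
// fails.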
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
                                           const MachineInstr &MI,
                                           MachineBasicBlock &LoopBB,
                                           MachineBasicBlock &RemainderBB,
                                           unsigned SaveReg,
                                           const MachineOperand &IdxReg) {
  LivePhysRegs RemainderLiveRegs(TRI);

  RemainderLiveRegs.addLiveOuts(MBB);
  for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI);
       I != E; ++I) {
    RemainderLiveRegs.stepBackward(*I);
  }

  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());
  LoopBB.sortUniqueLiveIns();
}

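// Emit the body of the waterfall loop: read one lane's index with
// V_READFIRSTLANE into M0, restrict EXEC to the lanes holding that same
// index, perform the indirect move, then clear those lanes from EXEC and
// branch back while any lanes remain.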
void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    if (Offset) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineFunction &MF = *MBB.getParent();
  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  MBB.addSuccessor(LoopBB);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}

/// \param VecReg The register which holds element zero of the vector being
///               addressed into.
/// \param Offset The constant offset part of the indirect index, e.g.
///               v0 = v[VecReg + Offset].
/// \returns A pair of the base register to use in the indirect addressing
///          instruction and the constant offset that still needs to be added
///          to the value stored in M0.
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
                                                int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
    .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Off);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  unsigned Reg;

  std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
    .addReg(Reg, RegState::Define)
    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
    .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Off);
}

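// Walk every block of the function and replace the SI_* control flow and
// indirect addressing pseudo instructions with real machine instructions,
// tracking control flow depth so kills can skip to the end of the shader when
// all lanes are dead.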
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->definesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          if (indirectSrc(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block and go on to the following one, which contains
            // the rest of this block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          if (indirectDst(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block and go on to the following one, which contains
            // the rest of this block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }

          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}