//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one bit for
/// each Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Update the exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
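///
/// Loops built from SI_BREAK / SI_IF_BREAK / SI_LOOP / SI_END_CF are lowered
/// analogously (see Break(), IfBreak(), Loop() and EndCf() below): the break
/// pseudos accumulate the lanes that have left the loop with S_OR_B64, SI_LOOP
/// clears those lanes from EXEC with S_ANDN2_B64 and branches back with
/// S_CBRANCH_EXECNZ while any lane is still active, and SI_END_CF re-enables
/// the saved lanes with S_OR_B64.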
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;


FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

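// Returns true if a skip branch over the region between \p From and \p To is
// worthwhile: the region is at least SkipThreshold instructions long, or it
// contains an S_CBRANCH_VCCNZ loop branch that must not execute with EXEC = 0.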
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),
                                 ToI = MachineFunction::iterator(To); MBBI != ToI; ++MBBI) {

    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled()) {
        // When a uniform loop is inside non-uniform control flow, the branch
        // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
        // when EXEC = 0. We should skip the loop lest it becomes infinite.
        if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ)
          return true;

        if (++NumInstr >= SkipThreshold)
          return true;
      }
    }
  }

  return false;
}

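// Insert an S_CBRANCH_EXECZ before \p From that branches to \p To, but only
// when shouldSkip() decides the region in between is worth skipping.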
void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

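// In pixel shaders, if every lane has been killed, export to the NULL target
// and end the program instead of executing the remaining dead code.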
void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

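// Lower SI_IF: restrict EXEC to the lanes for which the condition holds and
// save the remaining live lanes in the destination SGPR pair; optionally skip
// the conditional block entirely when no lane is active.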
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

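// Lower SI_ELSE: flip EXEC from the lanes that executed the THEN block to the
// lanes saved by SI_IF, keeping a mask that the matching SI_END_CF will OR
// back into EXEC.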
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

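// Lower SI_BREAK: add the currently active lanes (EXEC) to the accumulated
// mask of lanes that have left the loop.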
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

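// Lower SI_IF_BREAK: add the lanes for which the condition holds to the
// accumulated break mask.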
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

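// Lower SI_ELSE_BREAK: merge the break mask produced inside the ELSE region
// into the accumulated break mask.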
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

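// Lower SI_LOOP: clear the lanes that have broken out of the loop from EXEC
// and branch back to the loop header while any lane is still active.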
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

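// Lower SI_END_CF: re-enable the lanes that were disabled on entry to the
// control flow region.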
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

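// An S_BRANCH to the immediately following block is a fall-through and can
// simply be removed.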
void SILowerControlFlow::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

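// Lower SI_KILL: a negative immediate operand clears EXEC entirely; a register
// operand is compared against zero with V_CMPX_LE_F32, which disables every
// lane whose value is negative.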
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

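// Load the index needed by \p MovRel into M0 and insert \p MovRel. A uniform
// (SGPR) index is moved into M0 directly; a divergent (VGPR) index is handled
// by a waterfall loop that uses V_READFIRSTLANE_B32 to process one index value
// per iteration while manipulating EXEC.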
void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
              .addReg(Idx)
              .addImm(Offset);
    } else {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
              .addReg(Idx);
    }
    MBB.insert(I, MovRel);
  } else {

    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VGPR_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
            .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
            .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
            .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
      .addReg(AMDGPU::M0)
      .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
            .addReg(AMDGPU::VCC);

    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
              .addReg(AMDGPU::M0)
              .addImm(Offset);
    }
    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addImm(-7);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
            .addReg(Save);

  }
  MI.eraseFromParent();
}

/// \param VecReg         The register which holds element zero of the vector
///                       being addressed into.
/// \param[out] Reg       The base register to use in the indirect addressing
///                       instruction.
/// \param[in,out] Offset As an input, this is the constant offset part of the
///                       indirect Index. e.g. v0 = v[VecReg + Offset]
///                       As an output, this is a constant value that needs
///                       to be added to the value stored in M0.
void SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
                                                     unsigned &Reg,
                                                     int &Offset) {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;

  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  Reg = RC->getRegister(RegIdx);
}

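// Lower SI_INDIRECT_SRC_*: read one element of a vector register with
// V_MOVRELS_B32, which indexes relative to M0.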
void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Reg;

  computeIndirectRegAndOffset(Vec, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
            .addReg(Reg)
            .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}

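// Lower SI_INDIRECT_DST_*: write one element of a vector register with
// V_MOVRELD_B32, which indexes relative to M0.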
void SILowerControlFlow::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned Reg;

  computeIndirectRegAndOffset(Dst, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
            .addReg(Reg, RegState::Define)
            .addReg(Val)
            .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}

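// Walk every instruction in the function, lower the SI_* control flow pseudos
// in place, track control flow nesting depth for SI_KILL handling, and record
// whether FLAT instructions are present so the flat scratch setup can be
// emitted later.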
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      for (const auto &Def : I->defs()) {
        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
          ExecModified = true;
          break;
        }
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::S_ENDPGM: {
          if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid())
            break;

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // S_ENDPGM is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
          }

          I->eraseFromParent();
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}