//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one bit per
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
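
// The pseudo instructions handled by this pass (see the switch in
// runOnMachineFunction below) are SI_IF, SI_ELSE, SI_BREAK, SI_IF_BREAK,
// SI_ELSE_BREAK, SI_LOOP, SI_END_CF, SI_KILL_TERMINATOR, SI_RETURN and
// S_BRANCH.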

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  std::pair<MachineBasicBlock *, MachineBasicBlock *>
  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;


FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

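// Heuristic shared by Skip() and skipIfDead(): a skip branch is only emitted
// when the region between 'From' and 'To' contains at least SkipThreshold
// instructions that emit real code. Inline asm counts as its estimated size
// divided by the longest instruction, and any S_CBRANCH_VCC* forces a skip
// so a wave with EXEC == 0 cannot spin forever in a uniform loop.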
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {
  if (From->succ_empty())
    return false;

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is conservative: it is the size in
        // bytes assuming every instruction is the longest possible one.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

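// For AMDGPU_PS entry points, a kill at control flow depth 0 can leave the
// whole wave with EXEC == 0. Roughly, the code built below looks like this
// (VGPR0 is just an arbitrary undef source for the null export):
//
//   S_CBRANCH_EXECNZ %next_bb            // some lanes still live, keep going
// %skip_bb:
//   EXP null off, off, off, off done vm  // export to the NULL target
//   S_ENDPGM                             // terminate the wavefront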
bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
  MBB.addSuccessor(SkipBB);

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

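// Lower SI_IF. Illustrative expansion (register names are placeholders; the
// S_CBRANCH_EXECZ is only emitted when shouldSkip() considers the region
// long enough to be worth skipping):
//
//   %sgpr = S_AND_SAVEEXEC_B64 %vcc   // %sgpr = EXEC, EXEC &= %vcc
//   %sgpr = S_XOR_B64 %EXEC, %sgpr    // %sgpr = lanes that take the ELSE side
//   S_CBRANCH_EXECZ <target>          // optional skip when no lanes remain
//   SI_MASK_BRANCH <target>, %sgpr    // pseudo terminator for the verifier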
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

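// Lower SI_ELSE. Illustrative expansion (placeholder register names; the
// S_AND_B64 is only emitted when EXEC was modified earlier in the block,
// e.g. by a WQM switch):
//
//   %dst = S_OR_SAVEEXEC_B64 %src     // %dst = EXEC, EXEC |= saved else mask
//   %dst = S_AND_B64 %EXEC, %dst      // optional: drop lanes disabled above
//   %EXEC = S_XOR_B64 %EXEC, %dst     // flip to the lanes still needing ELSE
//   S_CBRANCH_EXECZ <target>          // optional skip
//   SI_MASK_BRANCH <target>, %dst     // pseudo terminator for the verifier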
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

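// SI_BREAK, SI_IF_BREAK and SI_ELSE_BREAK all lower to a single S_OR_B64 that
// accumulates the lanes which have left the loop, e.g. for SI_BREAK:
//
//   %dst = S_OR_B64 %EXEC, %src       // add every currently active lane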
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

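// Lower SI_LOOP. Illustrative expansion (placeholder names):
//
//   %EXEC = S_ANDN2_B64 %EXEC, %src   // disable lanes that have broken out
//   S_CBRANCH_EXECNZ <loop header>    // iterate again while any lane is live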
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

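// Lower SI_END_CF: OR the saved mask back into EXEC at the top of the join
// block:
//
//   %EXEC = S_OR_B64 %EXEC, %saved    // re-enable the lanes saved at SI_IF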
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

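// Lower SI_KILL_TERMINATOR. Illustrative forms of the two cases handled
// below (placeholder names):
//
//   %EXEC = S_MOV_B64 0               // immediate operand with sign bit set
//   V_CMPX_LE_F32_e32 vcc, 0, %op     // per lane: keep lanes where 0 <= %op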
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);

  return SkipBB;
}

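// splitBlock() splits MBB at I into three blocks laid out in order:
//
//   MBB (everything before I) -> LoopBB (new, empty) -> RemainderBB (I..end,
//   inherits MBB's successors)
//
// Only the MBB -> LoopBB edge is added here; callers are expected to populate
// LoopBB and wire up its successors.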
std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) {
  MachineFunction *MF = MBB.getParent();

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            HaveKill = false;
            // TODO: Insert skip if exec is 0?
          }

          EndCf(MI);
          break;

        case AMDGPU::SI_KILL_TERMINATOR:
          if (Depth == 0) {
            if (skipIfDead(MI, *NextBB)) {
              NextBB = std::next(BI);
              BE = MF.end();
            }
          } else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}