//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64-bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;
  unsigned SkipThreshold;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;
public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;


FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

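// Opcodes that expand to no real machine instructions, and so do not count
// toward the skip threshold below.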
static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

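// Decide whether a skip branch over the region [From, To) is worthwhile.
// Returns true if the region contains at least SkipThreshold real
// instructions, or if it contains a uniform branch (S_CBRANCH_VCCNZ/VCCZ),
// which is never taken when EXEC is zero and could make a loop infinite.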
bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {
  if (From->succ_empty())
    return false;

  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // inlineasm length estimate is number of bytes assuming the longest
        // instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

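// Insert an S_CBRANCH_EXECZ before \p From that jumps to \p To, but only when
// shouldSkip decides the region in between is worth jumping over.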
void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

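// After a top-level kill in a pixel shader, insert a block that exports a null
// target and ends the program once every lane is dead, plus an
// S_CBRANCH_EXECNZ that jumps over it while any lane is still live. Only done
// when the rest of the function is long enough to make the early exit
// worthwhile; returns true if the block was inserted.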
bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
  MBB.addSuccessor(SkipBB);

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

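// Lower SI_IF: save the incoming exec mask, AND the condition into EXEC, and
// keep the lanes that failed the condition in the saved register so SI_ELSE
// can activate them later. A sketch with illustrative register names:
//
//   %sgpr0 = SI_IF %vcc, <target>
// becomes:
//   %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // %sgpr0 = old EXEC, EXEC &= %vcc
//   %sgpr0 = S_XOR_B64 %exec, %sgpr0  // %sgpr0 = lanes disabled by the IF
//   S_CBRANCH_EXECZ <target>          // optional, only if worth skipping
//   SI_MASK_BRANCH <target>, %sgpr0   // pseudo terminator for the verifier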
void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

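// Lower SI_ELSE: at the top of the block, S_OR_SAVEEXEC saves the lanes that
// ran the THEN side and re-enables the lanes recorded by SI_IF; the XOR then
// removes the THEN lanes from EXEC so only the ELSE lanes run. If EXEC was
// modified inside the THEN region (e.g. when WQM is switched off), the saved
// mask is additionally ANDed with EXEC before the XOR.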
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
            .addReg(AMDGPU::EXEC)
            .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

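// Lower SI_BREAK: OR the currently active lanes into the accumulated break
// mask so the enclosing SI_LOOP treats them as having left the loop.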
void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

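// Lower SI_IF_BREAK: add the lanes for which the condition holds (VCC) to the
// existing break mask.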
void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

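// Lower SI_ELSE_BREAK: merge the break mask saved before the ELSE region with
// the one produced inside it.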
void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

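// Lower SI_LOOP: clear the lanes that have broken out of the loop from EXEC
// and branch back to the loop header while any lanes remain active. A sketch
// with illustrative register names:
//
//   SI_LOOP %sgpr0_sgpr1, <header>
// becomes:
//   %exec = S_ANDN2_B64 %exec, %sgpr0_sgpr1
//   S_CBRANCH_EXECNZ <header>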
void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

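// Lower SI_END_CF: OR the saved mask back into EXEC at the top of the block,
// re-enabling the lanes that were disabled for the control flow region ending
// here.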
void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

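// An S_BRANCH to the immediate fall-through successor is redundant; remove it.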
void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

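// Lower SI_KILL_TERMINATOR: disable lanes whose operand is negative, either by
// clearing EXEC outright for a negative immediate or with a V_CMPX compare
// against zero for a register operand.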
void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

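// Create a new, initially empty basic block and insert it into the function
// directly after \p MBB. The caller is responsible for filling it in and for
// updating CFG successors.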
MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);

  return SkipBB;
}

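// Walk every instruction in the function, replacing the SI_* control flow
// pseudos with real exec mask manipulation. Control flow depth is tracked so
// that the early-exit block for kills is only inserted at the top level, and
// the use of FLAT instructions is recorded on the machine function info.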
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            HaveKill = false;
            // TODO: Insert skip if exec is 0?
          }

          EndCf(MI);
          break;

        case AMDGPU::SI_KILL_TERMINATOR:
          if (Depth == 0) {
            if (skipIfDead(MI, *NextBB)) {
              NextBB = std::next(BI);
              BE = MF.end();
            }
          } else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->isKernel()) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}