1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief This pass lowers the pseudo control flow instructions to real
12 /// machine instructions.
13 ///
14 /// All control flow is handled using predicated instructions and
15 /// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
16 /// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
17 /// by writing to the 64-bit EXEC register (each bit corresponds to a
18 /// single vector ALU).  Typically, for predicates, a vector ALU will write
19 /// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit
20 /// per vector ALU) and then the Scalar ALU will AND the VCC register with
21 /// EXEC to update the predicates.
22 ///
23 /// For example:
24 /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
25 /// %SGPR0 = SI_IF %VCC
26 ///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
27 /// %SGPR0 = SI_ELSE %SGPR0
28 ///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
29 /// SI_END_CF %SGPR0
30 ///
31 /// becomes:
32 ///
33 /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
34 /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
35 /// S_CBRANCH_EXECZ label0            // This instruction is an optional
36 ///                                   // optimization which allows us to
37 ///                                   // branch if all the bits of
38 ///                                   // EXEC are zero.
39 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
40 ///
41 /// label0:
42 /// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the Then block
43 /// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
44 /// S_CBRANCH_EXECZ label1             // Use our branch optimization
45 ///                                    // instruction again.
46 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
47 /// label1:
48 /// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
49 //===----------------------------------------------------------------------===//
50 
51 #include "AMDGPU.h"
52 #include "AMDGPUSubtarget.h"
53 #include "SIInstrInfo.h"
54 #include "SIMachineFunctionInfo.h"
55 #include "llvm/CodeGen/LivePhysRegs.h"
56 #include "llvm/CodeGen/MachineFrameInfo.h"
57 #include "llvm/CodeGen/MachineFunction.h"
58 #include "llvm/CodeGen/MachineFunctionPass.h"
59 #include "llvm/CodeGen/MachineInstrBuilder.h"
60 #include "llvm/CodeGen/MachineRegisterInfo.h"
61 #include "llvm/IR/Constants.h"
62 
63 using namespace llvm;
64 
65 #define DEBUG_TYPE "si-lower-control-flow"
66 
67 namespace {
68 
69 class SILowerControlFlow : public MachineFunctionPass {
70 private:
71   static const unsigned SkipThreshold = 12;
72 
73   const SIRegisterInfo *TRI;
74   const SIInstrInfo *TII;
75 
76   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
77 
78   void Skip(MachineInstr &From, MachineOperand &To);
79   bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
80 
81   void If(MachineInstr &MI);
82   void Else(MachineInstr &MI, bool ExecModified);
83   void Break(MachineInstr &MI);
84   void IfBreak(MachineInstr &MI);
85   void ElseBreak(MachineInstr &MI);
86   void Loop(MachineInstr &MI);
87   void EndCf(MachineInstr &MI);
88 
89   void Kill(MachineInstr &MI);
90   void Branch(MachineInstr &MI);
91 
92   MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
93                                      MachineBasicBlock::iterator I) const;
94 public:
95   static char ID;
96 
97   SILowerControlFlow() :
98     MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
99 
100   bool runOnMachineFunction(MachineFunction &MF) override;
101 
102   const char *getPassName() const override {
103     return "SI Lower control flow pseudo instructions";
104   }
105 };
106 
107 } // End anonymous namespace
108 
109 char SILowerControlFlow::ID = 0;
110 
111 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
112                 "SI lower control flow", false, false)
113 
114 char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
115 
116 
117 FunctionPass *llvm::createSILowerControlFlowPass() {
118   return new SILowerControlFlow();
119 }
120 
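// Returns true for pseudo opcodes that expand to no machine code, so they are
// ignored when counting instructions against SkipThreshold.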
121 static bool opcodeEmitsNoInsts(unsigned Opc) {
122   switch (Opc) {
123   case TargetOpcode::IMPLICIT_DEF:
124   case TargetOpcode::KILL:
125   case TargetOpcode::BUNDLE:
126   case TargetOpcode::CFI_INSTRUCTION:
127   case TargetOpcode::EH_LABEL:
128   case TargetOpcode::GC_LABEL:
129   case TargetOpcode::DBG_VALUE:
130     return true;
131   default:
132     return false;
133   }
134 }
135 
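// Estimate whether the code between From and To should be skipped when EXEC is
// zero: returns true if the region contains a VCC branch (which must not run
// with EXEC = 0) or at least SkipThreshold real instructions.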
136 bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
137                                     MachineBasicBlock *To) {
138   if (From->succ_empty())
139     return false;
140 
141   unsigned NumInstr = 0;
142   MachineFunction *MF = From->getParent();
143 
144   for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
145        MBBI != End && MBBI != ToI; ++MBBI) {
146     MachineBasicBlock &MBB = *MBBI;
147 
148     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
149          NumInstr < SkipThreshold && I != E; ++I) {
150       if (opcodeEmitsNoInsts(I->getOpcode()))
151         continue;
152 
153       // When a uniform loop is inside non-uniform control flow, the branch
154       // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
155       // when EXEC = 0. We should skip the loop lest it become infinite.
156       if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
157           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
158         return true;
159 
160       if (I->isInlineAsm()) {
161         const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
162         const char *AsmStr = I->getOperand(0).getSymbolName();
163 
164         // The inline asm length estimate is in bytes, assuming each
165         // instruction uses the maximum possible encoding length.
166         uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
167         NumInstr += MaxAsmSize / MAI->getMaxInstLength();
168       } else {
169         ++NumInstr;
170       }
171 
172       if (NumInstr >= SkipThreshold)
173         return true;
174     }
175   }
176 
177   return false;
178 }
179 
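// If shouldSkip() judges the region from this block's first successor up to To
// worth skipping, insert an S_CBRANCH_EXECZ to To in front of From.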
180 void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
181 
182   if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
183     return;
184 
185   DebugLoc DL = From.getDebugLoc();
186   BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
187     .addOperand(To);
188 }
189 
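// In pixel shaders, if the remainder of the program is worth skipping, insert
// a block after MBB that performs a null export and ends the program once all
// lanes are dead; wavefronts with live lanes (EXEC != 0) branch over it to
// NextBB. Returns true if the skip block was inserted.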
190 bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
191   MachineBasicBlock &MBB = *MI.getParent();
192   MachineFunction *MF = MBB.getParent();
193 
194   if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
195       !shouldSkip(&MBB, &MBB.getParent()->back()))
196     return false;
197 
198   MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
199   MBB.addSuccessor(SkipBB);
200 
201   const DebugLoc &DL = MI.getDebugLoc();
202 
203   // If the exec mask is non-zero, skip the next two instructions
204   BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
205     .addMBB(&NextBB);
206 
207   MachineBasicBlock::iterator Insert = SkipBB->begin();
208 
209   // Exec mask is zero: Export to NULL target...
210   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
211     .addImm(0)
212     .addImm(0x09) // V_008DFC_SQ_EXP_NULL
213     .addImm(0)
214     .addImm(1)
215     .addImm(1)
216     .addReg(AMDGPU::VGPR0, RegState::Undef)
217     .addReg(AMDGPU::VGPR0, RegState::Undef)
218     .addReg(AMDGPU::VGPR0, RegState::Undef)
219     .addReg(AMDGPU::VGPR0, RegState::Undef);
220 
221   // ... and terminate wavefront.
222   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
223 
224   return true;
225 }
226 
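// Lower SI_IF: save EXEC into Reg while ANDing the condition into EXEC, XOR
// Reg with the new EXEC so it holds the lanes destined for the ELSE side, and
// optionally branch over the THEN block when no lanes remain active.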
227 void SILowerControlFlow::If(MachineInstr &MI) {
228   MachineBasicBlock &MBB = *MI.getParent();
229   DebugLoc DL = MI.getDebugLoc();
230   unsigned Reg = MI.getOperand(0).getReg();
231   unsigned Vcc = MI.getOperand(1).getReg();
232 
233   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
234           .addReg(Vcc);
235 
236   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
237           .addReg(AMDGPU::EXEC)
238           .addReg(Reg);
239 
240   Skip(MI, MI.getOperand(2));
241 
242   // Insert a pseudo terminator to help keep the verifier happy.
243   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
244     .addOperand(MI.getOperand(2))
245     .addReg(Reg);
246 
247   MI.eraseFromParent();
248 }
249 
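// Lower SI_ELSE: switch EXEC over to the lanes recorded by SI_IF for the ELSE
// region (adjusting the saved mask if EXEC was modified, e.g. by leaving WQM),
// and optionally branch over the ELSE block when no lanes remain active.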
250 void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
251   MachineBasicBlock &MBB = *MI.getParent();
252   DebugLoc DL = MI.getDebugLoc();
253   unsigned Dst = MI.getOperand(0).getReg();
254   unsigned Src = MI.getOperand(1).getReg();
255 
256   BuildMI(MBB, MBB.getFirstNonPHI(), DL,
257           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
258           .addReg(Src); // Saved EXEC
259 
260   if (ExecModified) {
261     // Adjust the saved exec to account for the modifications during the flow
262     // block that contains the ELSE. This can happen when WQM mode is switched
263     // off.
264     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
265             .addReg(AMDGPU::EXEC)
266             .addReg(Dst);
267   }
268 
269   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
270           .addReg(AMDGPU::EXEC)
271           .addReg(Dst);
272 
273   Skip(MI, MI.getOperand(2));
274 
275   // Insert a pseudo terminator to help keep the verifier happy.
276   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
277     .addOperand(MI.getOperand(2))
278     .addReg(Dst);
279 
280   MI.eraseFromParent();
281 }
282 
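// Lower SI_BREAK: OR the currently active lanes (EXEC) into the accumulated
// break mask.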
283 void SILowerControlFlow::Break(MachineInstr &MI) {
284   MachineBasicBlock &MBB = *MI.getParent();
285   DebugLoc DL = MI.getDebugLoc();
286 
287   unsigned Dst = MI.getOperand(0).getReg();
288   unsigned Src = MI.getOperand(1).getReg();
289 
290   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
291           .addReg(AMDGPU::EXEC)
292           .addReg(Src);
293 
294   MI.eraseFromParent();
295 }
296 
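// Lower SI_IF_BREAK: OR the lanes for which the break condition holds (Vcc)
// into the accumulated break mask.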
297 void SILowerControlFlow::IfBreak(MachineInstr &MI) {
298   MachineBasicBlock &MBB = *MI.getParent();
299   DebugLoc DL = MI.getDebugLoc();
300 
301   unsigned Dst = MI.getOperand(0).getReg();
302   unsigned Vcc = MI.getOperand(1).getReg();
303   unsigned Src = MI.getOperand(2).getReg();
304 
305   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
306           .addReg(Vcc)
307           .addReg(Src);
308 
309   MI.eraseFromParent();
310 }
311 
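// Lower SI_ELSE_BREAK: OR the break mask saved before the ELSE region into the
// current one.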
312 void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
313   MachineBasicBlock &MBB = *MI.getParent();
314   DebugLoc DL = MI.getDebugLoc();
315 
316   unsigned Dst = MI.getOperand(0).getReg();
317   unsigned Saved = MI.getOperand(1).getReg();
318   unsigned Src = MI.getOperand(2).getReg();
319 
320   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
321           .addReg(Saved)
322           .addReg(Src);
323 
324   MI.eraseFromParent();
325 }
326 
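// Lower SI_LOOP: clear the lanes recorded in the break mask from EXEC, then
// branch back to the loop header as long as any lanes are still active.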
327 void SILowerControlFlow::Loop(MachineInstr &MI) {
328   MachineBasicBlock &MBB = *MI.getParent();
329   DebugLoc DL = MI.getDebugLoc();
330   unsigned Src = MI.getOperand(0).getReg();
331 
332   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
333           .addReg(AMDGPU::EXEC)
334           .addReg(Src);
335 
336   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
337     .addOperand(MI.getOperand(1));
338 
339   MI.eraseFromParent();
340 }
341 
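// Lower SI_END_CF: OR the saved mask back into EXEC at the top of the block,
// re-enabling the lanes that were disabled for the closed control-flow region.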
342 void SILowerControlFlow::EndCf(MachineInstr &MI) {
343   MachineBasicBlock &MBB = *MI.getParent();
344   DebugLoc DL = MI.getDebugLoc();
345   unsigned Reg = MI.getOperand(0).getReg();
346 
347   BuildMI(MBB, MBB.getFirstNonPHI(), DL,
348           TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
349           .addReg(AMDGPU::EXEC)
350           .addReg(Reg);
351 
352   MI.eraseFromParent();
353 }
354 
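// Erase an S_BRANCH to the immediately following block; the fall-through makes
// it redundant.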
355 void SILowerControlFlow::Branch(MachineInstr &MI) {
356   MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
357   if (MBB == MI.getParent()->getNextNode())
358     MI.eraseFromParent();
359 
360   // If these aren't equal, this is probably an infinite loop.
361 }
362 
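// Lower SI_KILL_TERMINATOR: a negative immediate operand clears EXEC entirely;
// a register operand uses V_CMPX_LE_F32 to disable the lanes holding a
// negative value.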
363 void SILowerControlFlow::Kill(MachineInstr &MI) {
364   MachineBasicBlock &MBB = *MI.getParent();
365   DebugLoc DL = MI.getDebugLoc();
366   const MachineOperand &Op = MI.getOperand(0);
367 
368 #ifndef NDEBUG
369   CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
370   // Kill is only allowed in pixel / geometry shaders.
371   assert(CallConv == CallingConv::AMDGPU_PS ||
372          CallConv == CallingConv::AMDGPU_GS);
373 #endif
374 
375   // Clear this thread from the exec mask if the operand is negative
376   if (Op.isImm()) {
377     // Constant operand: Set exec mask to 0 or do nothing
378     if (Op.getImm() & 0x80000000) {
379       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
380               .addImm(0);
381     }
382   } else {
383     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
384            .addImm(0)
385            .addOperand(Op);
386   }
387 
388   MI.eraseFromParent();
389 }
390 
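// Create a new, empty basic block and insert it into the function immediately
// after MBB.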
391 MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
392   MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
393   MachineFunction *MF = MBB.getParent();
394 
395   MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
396   MachineFunction::iterator MBBI(MBB);
397   ++MBBI;
398 
399   MF->insert(MBBI, SkipBB);
400 
401   return SkipBB;
402 }
403 
404 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
405   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
406   TII = ST.getInstrInfo();
407   TRI = &TII->getRegisterInfo();
408 
409   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
410 
411   bool HaveKill = false;
412   bool NeedFlat = false;
413   unsigned Depth = 0;
414 
415   MachineFunction::iterator NextBB;
416 
417   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
418        BI != BE; BI = NextBB) {
419     NextBB = std::next(BI);
420     MachineBasicBlock &MBB = *BI;
421 
422     MachineBasicBlock *EmptyMBBAtEnd = nullptr;
423     MachineBasicBlock::iterator I, Next;
424     bool ExecModified = false;
425 
426     for (I = MBB.begin(); I != MBB.end(); I = Next) {
427       Next = std::next(I);
428 
429       MachineInstr &MI = *I;
430 
431       // Flat uses m0 in case it needs to access LDS.
432       if (TII->isFLAT(MI))
433         NeedFlat = true;
434 
435       if (I->modifiesRegister(AMDGPU::EXEC, TRI))
436         ExecModified = true;
437 
438       switch (MI.getOpcode()) {
439         default: break;
440         case AMDGPU::SI_IF:
441           ++Depth;
442           If(MI);
443           break;
444 
445         case AMDGPU::SI_ELSE:
446           Else(MI, ExecModified);
447           break;
448 
449         case AMDGPU::SI_BREAK:
450           Break(MI);
451           break;
452 
453         case AMDGPU::SI_IF_BREAK:
454           IfBreak(MI);
455           break;
456 
457         case AMDGPU::SI_ELSE_BREAK:
458           ElseBreak(MI);
459           break;
460 
461         case AMDGPU::SI_LOOP:
462           ++Depth;
463           Loop(MI);
464           break;
465 
466         case AMDGPU::SI_END_CF:
467           if (--Depth == 0 && HaveKill) {
468             HaveKill = false;
469             // TODO: Insert skip if exec is 0?
470           }
471 
472           EndCf(MI);
473           break;
474 
475         case AMDGPU::SI_KILL_TERMINATOR:
476           if (Depth == 0) {
477             if (skipIfDead(MI, *NextBB)) {
478               NextBB = std::next(BI);
479               BE = MF.end();
480             }
481           } else
482             HaveKill = true;
483           Kill(MI);
484           break;
485 
486         case AMDGPU::S_BRANCH:
487           Branch(MI);
488           break;
489 
490         case AMDGPU::SI_RETURN: {
491           assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
492 
493           // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
494           // because external bytecode will be appended at the end.
495           if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
496             // SI_RETURN is not the last instruction. Add an empty block at
497             // the end and jump there.
498             if (!EmptyMBBAtEnd) {
499               EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
500               MF.insert(MF.end(), EmptyMBBAtEnd);
501             }
502 
503             MBB.addSuccessor(EmptyMBBAtEnd);
504             BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
505                     .addMBB(EmptyMBBAtEnd);
506             I->eraseFromParent();
507           }
508           break;
509         }
510       }
511     }
512   }
513 
514   if (NeedFlat && MFI->IsKernel) {
515     // TODO: What to use with function calls?
516     // We will need to initialize the flat scratch register pair.
517     MFI->setHasFlatInstructions(true);
518   }
520 
521   return true;
522 }
523