//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e., which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

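// Per-instruction dataflow state: the states an instruction requires before
// executing (Needs), the states it must not run in (Disabled), and the states
// required immediately after it (OutNeeds).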
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

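// Per-block dataflow state: the states required somewhere inside the block
// (Needs), at its entry (InNeeds), and at its exit (OutNeeds).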
struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

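/// Mark \p MI as requiring the state(s) in \p Flag and, if that changes its
/// recorded needs, queue it on \p Worklist for (re)propagation.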
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
        Flags = StateWQM;
      } else if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and moreover it
        // needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToCopyInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            unsigned Reg = Inactive.getReg();
            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

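/// Process a worklist entry for \p MI: lift WQM onto terminators and VMEM
/// stores that are followed by WQM computations, propagate the instruction's
/// needs to its block and to the preceding instruction, and mark the defining
/// instructions of its uses.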
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);
}

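/// Process a worklist entry for \p MBB: push the block's OutNeeds into its
/// last instruction and reconcile In/OutNeeds with predecessor and successor
/// blocks.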
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

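/// Seed the analysis with scanInstructions, then run the backwards dataflow
/// to a fixed point by alternating between instruction- and block-level
/// propagation.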
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

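// Save SCC to a fresh SGPR and restore it immediately before \p Before.
// Returns the restore instruction, so code inserted before the returned
// iterator sits between the save and the restore and may clobber SCC freely.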
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

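// Switch to Exact mode by ANDing EXEC with the live mask. If \p SaveWQM is
// nonzero, the previous (WQM) EXEC is saved there via S_AND_SAVEEXEC_B64 so
// that toWQM can restore it later.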
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

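// Switch to WQM, either by restoring a previously saved EXEC (\p SavedWQM) or
// by recomputing it with S_WQM_B64, which is only correct when the current
// EXEC is the live mask (as in the entry block).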
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

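// Enter WWM by setting every bit of EXEC; S_OR_SAVEEXEC_B64 with an all-ones
// operand saves the original mask into \p SaveOrig.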
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
               SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

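// Leave WWM by restoring the original EXEC from \p SavedOrig, using the
// EXIT_WWM pseudo rather than a plain COPY to mark where the WWM region ends.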
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

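// Walk the instructions of \p MBB and insert the EXEC manipulation needed to
// satisfy each instruction's required state, switching between Exact, WQM and
// WWM at points chosen to minimize transitions and avoid clobbering a live
// SCC.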
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state
          // that already matches our needs; in that case there is nothing to
          // do.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
}

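// Replace each SI_PS_LIVE pseudo collected during scanning with a copy of the
// live mask register.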
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

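// Turn the WQM/WWM/V_SET_INACTIVE pseudos recorded earlier into plain COPYs
// of their first source operand, dropping any remaining operands.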
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);
    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction()->getCallingConv();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    if (!(GlobalFlags & StateWWM))
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}