xref: /llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision e58e0e3fe30a961501d63f2bf1c5ee83a493a417)
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief This pass adds instructions to enable whole quad mode for pixel
12 /// shaders.
13 ///
14 /// Whole quad mode is required for derivative computations, but it interferes
15 /// with shader side effects (stores and atomics). This pass is run on the
16 /// scheduled machine IR but before register coalescing, so that machine SSA is
17 /// available for analysis. It ensures that WQM is enabled when necessary, but
18 /// disabled around stores and atomics.
19 ///
20 /// When necessary, this pass creates a function prolog
21 ///
22 ///   S_MOV_B64 LiveMask, EXEC
23 ///   S_WQM_B64 EXEC, EXEC
24 ///
25 /// to enter WQM at the top of the function and surrounds blocks of Exact
26 /// instructions by
27 ///
28 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
29 ///   ...
30 ///   S_MOV_B64 EXEC, Tmp
31 ///
32 /// In order to avoid excessive switching during sequences of Exact
33 /// instructions, the pass first analyzes which instructions must be run in WQM
34 /// (aka which instructions produce values that lead to derivative
35 /// computations).
36 ///
37 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
38 ///
39 /// There is room for improvement given better control flow analysis:
40 ///
41 ///  (1) at the top level (outside of control flow statements, and as long as
42 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
43 ///      the LiveMask (this is implemented for the entry block).
44 ///
45 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
46 ///      consist of exact and don't-care instructions, the switch only has to
47 ///      be done at the entry and exit points rather than potentially in each
48 ///      block of the region.
49 ///
50 //===----------------------------------------------------------------------===//
51 
52 #include "AMDGPU.h"
53 #include "AMDGPUSubtarget.h"
54 #include "SIInstrInfo.h"
55 #include "SIMachineFunctionInfo.h"
56 #include "llvm/CodeGen/MachineFunction.h"
57 #include "llvm/CodeGen/MachineFunctionPass.h"
58 #include "llvm/CodeGen/MachineInstrBuilder.h"
59 #include "llvm/CodeGen/MachineRegisterInfo.h"
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "si-wqm"
64 
65 namespace {
66 
/// Bit flags describing the exec-mask mode an instruction or block needs.
/// The two flags may be OR'ed together when both modes are required somewhere
/// downstream.
enum {
  StateWQM = 1 << 0,   ///< Whole quad mode.
  StateExact = 1 << 1, ///< Exact mode.
};
71 
/// Tag type wrapping a state bitmask so that it can be streamed in symbolic
/// form ("WQM", "Exact" or "WQM|Exact") rather than as a raw integer.
struct PrintState {
  int State;

  explicit PrintState(int State) : State(State) {}
};
78 
79 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
80   if (PS.State & StateWQM)
81     OS << "WQM";
82   if (PS.State & StateExact) {
83     if (PS.State & StateWQM)
84       OS << '|';
85     OS << "Exact";
86   }
87 
88   return OS;
89 }
90 
/// Analysis record kept for a single machine instruction.
struct InstrInfo {
  /// Mode the instruction itself must execute in (0 if it does not care).
  char Needs{0};
  /// Union of the modes needed by the instructions that follow it.
  char OutNeeds{0};
};
95 
/// Analysis record kept for a single basic block.
struct BlockInfo {
  /// Union of the Needs of the instructions inside the block.
  char Needs{0};
  /// Modes required at block entry (own needs plus what flows in).
  char InNeeds{0};
  /// Modes required by the block's successors.
  char OutNeeds{0};
};
101 
102 struct WorkItem {
103   MachineBasicBlock *MBB = nullptr;
104   MachineInstr *MI = nullptr;
105 
106   WorkItem() {}
107   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
108   WorkItem(MachineInstr *MI) : MI(MI) {}
109 };
110 
111 class SIWholeQuadMode : public MachineFunctionPass {
112 private:
113   const SIInstrInfo *TII;
114   const SIRegisterInfo *TRI;
115   MachineRegisterInfo *MRI;
116   LiveIntervals *LIS;
117 
118   DenseMap<const MachineInstr *, InstrInfo> Instructions;
119   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
120   SmallVector<MachineInstr *, 1> LiveMaskQueries;
121 
122   void printInfo();
123 
124   void markInstruction(MachineInstr &MI, char Flag,
125                        std::vector<WorkItem> &Worklist);
126   void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
127   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
128   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
129   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
130   char analyzeFunction(MachineFunction &MF);
131 
132   bool requiresCorrectState(const MachineInstr &MI) const;
133 
134   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
135                                       MachineBasicBlock::iterator Before);
136   MachineBasicBlock::iterator
137   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
138                    MachineBasicBlock::iterator Last, bool PreferLast,
139                    bool SaveSCC);
140   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
141                unsigned SaveWQM, unsigned LiveMaskReg);
142   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
143              unsigned SavedWQM);
144   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
145 
146   void lowerLiveMaskQueries(unsigned LiveMaskReg);
147 
148 public:
149   static char ID;
150 
151   SIWholeQuadMode() :
152     MachineFunctionPass(ID) { }
153 
154   bool runOnMachineFunction(MachineFunction &MF) override;
155 
156   const char *getPassName() const override {
157     return "SI Whole Quad Mode";
158   }
159 
160   void getAnalysisUsage(AnalysisUsage &AU) const override {
161     AU.addRequired<LiveIntervals>();
162     AU.setPreservesCFG();
163     MachineFunctionPass::getAnalysisUsage(AU);
164   }
165 };
166 
167 } // End anonymous namespace
168 
169 char SIWholeQuadMode::ID = 0;
170 
171 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
172                       false)
173 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
174 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
175                     false)
176 
177 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
178 
179 FunctionPass *llvm::createSIWholeQuadModePass() {
180   return new SIWholeQuadMode;
181 }
182 
183 void SIWholeQuadMode::printInfo() {
184   for (const auto &BII : Blocks) {
185     dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
186            << "  InNeeds = " << PrintState(BII.second.InNeeds)
187            << ", Needs = " << PrintState(BII.second.Needs)
188            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
189 
190     for (const MachineInstr &MI : *BII.first) {
191       auto III = Instructions.find(&MI);
192       if (III == Instructions.end())
193         continue;
194 
195       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
196              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
197     }
198   }
199 }
200 
201 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
202                                       std::vector<WorkItem> &Worklist) {
203   InstrInfo &II = Instructions[&MI];
204 
205   assert(Flag == StateWQM || Flag == StateExact);
206 
207   // Ignore if the instruction is already marked. The typical case is that we
208   // mark an instruction WQM multiple times, but for atomics it can happen that
209   // Flag is StateWQM, but Needs is already set to StateExact. In this case,
210   // letting the atomic run in StateExact is correct as per the relevant specs.
211   if (II.Needs)
212     return;
213 
214   II.Needs = Flag;
215   Worklist.push_back(&MI);
216 }
217 
218 /// Mark all instructions defining the uses in \p MI as WQM.
219 void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
220                                   std::vector<WorkItem> &Worklist) {
221   for (const MachineOperand &Use : MI.uses()) {
222     if (!Use.isReg() || !Use.isUse())
223       continue;
224 
225     unsigned Reg = Use.getReg();
226 
227     // Handle physical registers that we need to track; this is mostly relevant
228     // for VCC, which can appear as the (implicit) input of a uniform branch,
229     // e.g. when a loop counter is stored in a VGPR.
230     if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
231       if (Reg == AMDGPU::EXEC)
232         continue;
233 
234       for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
235         LiveRange &LR = LIS->getRegUnit(*RegUnit);
236         const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
237         if (!Value)
238           continue;
239 
240         // Since we're in machine SSA, we do not need to track physical
241         // registers across basic blocks.
242         if (Value->isPHIDef())
243           continue;
244 
245         markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
246                         Worklist);
247       }
248 
249       continue;
250     }
251 
252     for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
253       markInstruction(DefMI, StateWQM, Worklist);
254   }
255 }
256 
257 // Scan instructions to determine which ones require an Exact execmask and
258 // which ones seed WQM requirements.
259 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
260                                        std::vector<WorkItem> &Worklist) {
261   char GlobalFlags = 0;
262   bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
263 
264   for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
265     MachineBasicBlock &MBB = *BI;
266 
267     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
268       MachineInstr &MI = *II;
269       unsigned Opcode = MI.getOpcode();
270       char Flags = 0;
271 
272       if (TII->isDS(Opcode)) {
273         Flags = StateWQM;
274       } else if (TII->isWQM(Opcode)) {
275         // Sampling instructions don't need to produce results for all pixels
276         // in a quad, they just require all inputs of a quad to have been
277         // computed for derivatives.
278         markUsesWQM(MI, Worklist);
279         GlobalFlags |= StateWQM;
280         continue;
281       } else if (TII->isDisableWQM(MI)) {
282         Flags = StateExact;
283       } else {
284         if (Opcode == AMDGPU::SI_PS_LIVE) {
285           LiveMaskQueries.push_back(&MI);
286         } else if (WQMOutputs) {
287           // The function is in machine SSA form, which means that physical
288           // VGPRs correspond to shader inputs and outputs. Inputs are
289           // only used, outputs are only defined.
290           for (const MachineOperand &MO : MI.defs()) {
291             if (!MO.isReg())
292               continue;
293 
294             unsigned Reg = MO.getReg();
295 
296             if (!TRI->isVirtualRegister(Reg) &&
297                 TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
298               Flags = StateWQM;
299               break;
300             }
301           }
302         }
303 
304         if (!Flags)
305           continue;
306       }
307 
308       markInstruction(MI, Flags, Worklist);
309       GlobalFlags |= Flags;
310     }
311   }
312 
313   return GlobalFlags;
314 }
315 
316 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
317                                            std::vector<WorkItem>& Worklist) {
318   MachineBasicBlock *MBB = MI.getParent();
319   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
320   BlockInfo &BI = Blocks[MBB];
321 
322   // Control flow-type instructions and stores to temporary memory that are
323   // followed by WQM computations must themselves be in WQM.
324   if ((II.OutNeeds & StateWQM) && !II.Needs &&
325       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
326     Instructions[&MI].Needs = StateWQM;
327     II.Needs = StateWQM;
328   }
329 
330   // Propagate to block level
331   BI.Needs |= II.Needs;
332   if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
333     BI.InNeeds |= II.Needs;
334     Worklist.push_back(MBB);
335   }
336 
337   // Propagate backwards within block
338   if (MachineInstr *PrevMI = MI.getPrevNode()) {
339     char InNeeds = II.Needs | II.OutNeeds;
340     if (!PrevMI->isPHI()) {
341       InstrInfo &PrevII = Instructions[PrevMI];
342       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
343         PrevII.OutNeeds |= InNeeds;
344         Worklist.push_back(PrevMI);
345       }
346     }
347   }
348 
349   // Propagate WQM flag to instruction inputs
350   assert(II.Needs != (StateWQM | StateExact));
351 
352   if (II.Needs == StateWQM)
353     markUsesWQM(MI, Worklist);
354 }
355 
356 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
357                                      std::vector<WorkItem>& Worklist) {
358   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
359 
360   // Propagate through instructions
361   if (!MBB.empty()) {
362     MachineInstr *LastMI = &*MBB.rbegin();
363     InstrInfo &LastII = Instructions[LastMI];
364     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
365       LastII.OutNeeds |= BI.OutNeeds;
366       Worklist.push_back(LastMI);
367     }
368   }
369 
370   // Predecessor blocks must provide for our WQM/Exact needs.
371   for (MachineBasicBlock *Pred : MBB.predecessors()) {
372     BlockInfo &PredBI = Blocks[Pred];
373     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
374       continue;
375 
376     PredBI.OutNeeds |= BI.InNeeds;
377     PredBI.InNeeds |= BI.InNeeds;
378     Worklist.push_back(Pred);
379   }
380 
381   // All successors must be prepared to accept the same set of WQM/Exact data.
382   for (MachineBasicBlock *Succ : MBB.successors()) {
383     BlockInfo &SuccBI = Blocks[Succ];
384     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
385       continue;
386 
387     SuccBI.InNeeds |= BI.OutNeeds;
388     Worklist.push_back(Succ);
389   }
390 }
391 
392 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
393   std::vector<WorkItem> Worklist;
394   char GlobalFlags = scanInstructions(MF, Worklist);
395 
396   while (!Worklist.empty()) {
397     WorkItem WI = Worklist.back();
398     Worklist.pop_back();
399 
400     if (WI.MI)
401       propagateInstruction(*WI.MI, Worklist);
402     else
403       propagateBlock(*WI.MBB, Worklist);
404   }
405 
406   return GlobalFlags;
407 }
408 
409 /// Whether \p MI really requires the exec state computed during analysis.
410 ///
411 /// Scalar instructions must occasionally be marked WQM for correct propagation
412 /// (e.g. thread masks leading up to branches), but when it comes to actual
413 /// execution, they don't care about EXEC.
414 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
415   if (MI.isTerminator())
416     return true;
417 
418   // Skip instructions that are not affected by EXEC
419   if (TII->isScalarUnit(MI))
420     return false;
421 
422   // Generic instructions such as COPY will either disappear by register
423   // coalescing or be lowered to SALU or VALU instructions.
424   if (MI.isTransient()) {
425     if (MI.getNumExplicitOperands() >= 1) {
426       const MachineOperand &Op = MI.getOperand(0);
427       if (Op.isReg()) {
428         if (TRI->isSGPRReg(*MRI, Op.getReg())) {
429           // SGPR instructions are not affected by EXEC
430           return false;
431         }
432       }
433     }
434   }
435 
436   return true;
437 }
438 
439 MachineBasicBlock::iterator
440 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
441                          MachineBasicBlock::iterator Before) {
442   unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
443 
444   MachineInstr *Save =
445       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
446           .addReg(AMDGPU::SCC);
447   MachineInstr *Restore =
448       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
449           .addReg(SaveReg);
450 
451   LIS->InsertMachineInstrInMaps(*Save);
452   LIS->InsertMachineInstrInMaps(*Restore);
453   LIS->createAndComputeVirtRegInterval(SaveReg);
454 
455   return Restore;
456 }
457 
458 // Return an iterator in the (inclusive) range [First, Last] at which
459 // instructions can be safely inserted, keeping in mind that some of the
460 // instructions we want to add necessarily clobber SCC.
461 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
462     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
463     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
464   if (!SaveSCC)
465     return PreferLast ? Last : First;
466 
467   LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
468   auto MBBE = MBB.end();
469   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
470                                      : LIS->getMBBEndIdx(&MBB);
471   SlotIndex LastIdx =
472       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
473   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
474   const LiveRange::Segment *S;
475 
476   for (;;) {
477     S = LR.getSegmentContaining(Idx);
478     if (!S)
479       break;
480 
481     if (PreferLast) {
482       SlotIndex Next = S->start.getBaseIndex();
483       if (Next < FirstIdx)
484         break;
485       Idx = Next;
486     } else {
487       SlotIndex Next = S->end.getNextIndex().getBaseIndex();
488       if (Next > LastIdx)
489         break;
490       Idx = Next;
491     }
492   }
493 
494   MachineBasicBlock::iterator MBBI;
495 
496   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
497     MBBI = MI;
498   else {
499     assert(Idx == LIS->getMBBEndIdx(&MBB));
500     MBBI = MBB.end();
501   }
502 
503   if (S)
504     MBBI = saveSCC(MBB, MBBI);
505 
506   return MBBI;
507 }
508 
509 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
510                               MachineBasicBlock::iterator Before,
511                               unsigned SaveWQM, unsigned LiveMaskReg) {
512   MachineInstr *MI;
513 
514   if (SaveWQM) {
515     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
516                  SaveWQM)
517              .addReg(LiveMaskReg);
518   } else {
519     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
520                  AMDGPU::EXEC)
521              .addReg(AMDGPU::EXEC)
522              .addReg(LiveMaskReg);
523   }
524 
525   LIS->InsertMachineInstrInMaps(*MI);
526 }
527 
528 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
529                             MachineBasicBlock::iterator Before,
530                             unsigned SavedWQM) {
531   MachineInstr *MI;
532 
533   if (SavedWQM) {
534     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
535              .addReg(SavedWQM);
536   } else {
537     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
538                  AMDGPU::EXEC)
539              .addReg(AMDGPU::EXEC);
540   }
541 
542   LIS->InsertMachineInstrInMaps(*MI);
543 }
544 
545 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
546                                    bool isEntry) {
547   auto BII = Blocks.find(&MBB);
548   if (BII == Blocks.end())
549     return;
550 
551   const BlockInfo &BI = BII->second;
552 
553   if (!(BI.InNeeds & StateWQM))
554     return;
555 
556   // This is a non-entry block that is WQM throughout, so no need to do
557   // anything.
558   if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
559     return;
560 
561   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
562 
563   unsigned SavedWQMReg = 0;
564   bool WQMFromExec = isEntry;
565   char State = isEntry ? StateExact : StateWQM;
566 
567   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
568   if (isEntry)
569     ++II; // Skip the instruction that saves LiveMask
570 
571   MachineBasicBlock::iterator First = IE;
572   for (;;) {
573     MachineBasicBlock::iterator Next = II;
574     char Needs = 0;
575     char OutNeeds = 0;
576 
577     if (First == IE)
578       First = II;
579 
580     if (II != IE) {
581       MachineInstr &MI = *II;
582 
583       if (requiresCorrectState(MI)) {
584         auto III = Instructions.find(&MI);
585         if (III != Instructions.end()) {
586           Needs = III->second.Needs;
587           OutNeeds = III->second.OutNeeds;
588         }
589       }
590 
591       if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
592         Needs = StateExact;
593 
594       if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
595         MI.getOperand(3).setImm(1);
596 
597       ++Next;
598     } else {
599       // End of basic block
600       if (BI.OutNeeds & StateWQM)
601         Needs = StateWQM;
602       else if (BI.OutNeeds == StateExact)
603         Needs = StateExact;
604     }
605 
606     if (Needs) {
607       if (Needs != State) {
608         MachineBasicBlock::iterator Before =
609             prepareInsertion(MBB, First, II, Needs == StateWQM,
610                              Needs == StateExact || WQMFromExec);
611 
612         if (Needs == StateExact) {
613           if (!WQMFromExec && (OutNeeds & StateWQM))
614             SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
615 
616           toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
617         } else {
618           assert(WQMFromExec == (SavedWQMReg == 0));
619 
620           toWQM(MBB, Before, SavedWQMReg);
621 
622           if (SavedWQMReg) {
623             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
624             SavedWQMReg = 0;
625           }
626         }
627 
628         State = Needs;
629       }
630 
631       First = IE;
632     }
633 
634     if (II == IE)
635       break;
636     II = Next;
637   }
638 }
639 
640 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
641   for (MachineInstr *MI : LiveMaskQueries) {
642     const DebugLoc &DL = MI->getDebugLoc();
643     unsigned Dest = MI->getOperand(0).getReg();
644     MachineInstr *Copy =
645         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
646             .addReg(LiveMaskReg);
647 
648     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
649     MI->eraseFromParent();
650   }
651 }
652 
653 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
654   if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
655     return false;
656 
657   Instructions.clear();
658   Blocks.clear();
659   LiveMaskQueries.clear();
660 
661   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
662 
663   TII = ST.getInstrInfo();
664   TRI = &TII->getRegisterInfo();
665   MRI = &MF.getRegInfo();
666   LIS = &getAnalysis<LiveIntervals>();
667 
668   char GlobalFlags = analyzeFunction(MF);
669   if (!(GlobalFlags & StateWQM)) {
670     lowerLiveMaskQueries(AMDGPU::EXEC);
671     return !LiveMaskQueries.empty();
672   }
673 
674   // Store a copy of the original live mask when required
675   unsigned LiveMaskReg = 0;
676   {
677     MachineBasicBlock &Entry = MF.front();
678     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
679 
680     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
681       LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
682       MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
683                                  TII->get(AMDGPU::COPY), LiveMaskReg)
684                              .addReg(AMDGPU::EXEC);
685       LIS->InsertMachineInstrInMaps(*MI);
686     }
687 
688     if (GlobalFlags == StateWQM) {
689       // For a shader that needs only WQM, we can just set it once.
690       BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
691               AMDGPU::EXEC)
692           .addReg(AMDGPU::EXEC);
693 
694       lowerLiveMaskQueries(LiveMaskReg);
695       // EntryMI may become invalid here
696       return true;
697     }
698   }
699 
700   DEBUG(printInfo());
701 
702   lowerLiveMaskQueries(LiveMaskReg);
703 
704   // Handle the general case
705   for (auto BII : Blocks)
706     processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
707 
708   // Physical registers like SCC aren't tracked by default anyway, so just
709   // removing the ranges we computed is the simplest option for maintaining
710   // the analysis results.
711   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
712 
713   return true;
714 }
715