//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

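// Execution states tracked by the analysis, used as a bitmask: an instruction
// or block may require WQM, Exact, either one (both bits set), or neither.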
enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateExact) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "Exact";
  }

  return OS;
}

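// Per-instruction analysis results: Needs is the execution state the
// instruction requires, Disabled the states that must never be requested for
// it, and OutNeeds the states required downstream of it.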
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  bool requiresCorrectState(const MachineInstr &MI) const;

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(Flag == StateWQM);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM,
  // where ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  assert(Flag == StateWQM);
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    unsigned Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

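      // LDS is commonly used to exchange data between the lanes of a quad
      // (e.g. via ds_swizzle), so conservatively run DS instructions in WQM.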
      if (TII->isDS(Opcode)) {
        Flags = StateWQM;
      } else if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  // Take a copy: markInstruction below may grow the Instructions map, which
  // would invalidate a reference into it.
  InstrInfo II = Instructions[&MI];
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
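    // Update both the map entry (seen by later stages) and the local copy
    // (which drives the propagation below).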
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = II.Needs | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

/// Whether \p MI really requires the exec state computed during analysis.
///
/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual
/// execution, they don't care about EXEC.
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;

  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;

  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }

  return true;
}

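// Insert a pair of copies that save SCC to a scalar register before \p Before
// and restore it again, returning an iterator at the restore copy so that
// SCC-clobbering code can be placed between the two.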
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

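  // Step away from the preferred position past any segments where SCC is
  // live, without leaving [FirstIdx, LastIdx]. If SCC is live at every
  // candidate point, fall back to saving and restoring it around the
  // insertion point (see below).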
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
                 AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  if (!(BI.InNeeds & StateWQM))
    return;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
    return;

  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

  unsigned SavedWQMReg = 0;
  bool WQMFromExec = isEntry;
  char State = isEntry ? StateExact : StateWQM;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  MachineBasicBlock::iterator First = IE;
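  // Walk the instructions, tracking the longest run [First, II] over which an
  // exec-mask switch could be placed, and let prepareInsertion pick a point
  // inside the run once the required state actually changes.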
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM;
    char OutNeeds = 0;

    if (First == IE)
      First = II;

    if (II != IE) {
      MachineInstr &MI = *II;

      if (requiresCorrectState(MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

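      // The block exits in Exact mode, so EXEC is modified on the way to the
      // SI_ELSE; record that in the SI_ELSE's "exec modified" immediate
      // operand so SILowerControlFlow can account for it.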
      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    if (!(Needs & State)) {
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (Needs == StateExact) {
        if (!WQMFromExec && (OutNeeds & StateWQM))
          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

        toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
        State = StateExact;
      } else {
        assert(Needs == StateWQM);
        assert(WQMFromExec == (SavedWQMReg == 0));

        toWQM(MBB, Before, SavedWQMReg);

        if (SavedWQMReg) {
          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
          SavedWQMReg = 0;
        }
        State = StateWQM;
      }

      First = IE;
    }

    if (Needs != (StateExact | StateWQM))
      First = IE;

    if (II == IE)
      break;
    II = Next;
  }
}

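// Replace all live-mask queries (SI_PS_LIVE) with a copy of the saved live
// mask, or of EXEC itself when no exec-mask switching was needed.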
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

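// Demote the WQM pseudo-instructions gathered during scanning to plain
// copies; the analysis has already ensured that they execute in WQM.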
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs)
    MI->setDesc(TII->get(AMDGPU::COPY));
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
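  // Whole quad mode is only relevant for pixel shaders.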
  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    return !LiveMaskQueries.empty();
  }

  // Store a copy of the original live mask when required
  unsigned LiveMaskReg = 0;
  {
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if ((GlobalFlags & StateExact) || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerLiveMaskQueries(LiveMaskReg);
      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  DEBUG(printInfo());

  lowerLiveMaskQueries(LiveMaskReg);
  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}