1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely, in
16 /// non-strict WQM inactive lanes may take part in control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (i.e. which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
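// Note: these are bitmask flags, so requirements can be combined. For example,
// an instruction's OutNeeds may contain both StateWQM and StateExact when
// different later instructions need different modes, and processBlock() uses
// (StateExact | StateWQM) to mean "either mode is acceptable here".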
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
127 struct InstrInfo {
128   char Needs = 0;    // States (bitmask) this instruction must execute in.
129   char Disabled = 0; // States this instruction must never execute in.
130   char OutNeeds = 0; // Union of states needed by following instructions.
131 };
132 
133 struct BlockInfo {
134   char Needs = 0;        // States needed somewhere within the block.
135   char InNeeds = 0;      // States that must be available on block entry.
136   char OutNeeds = 0;     // States needed on block exit (by successors).
137   char InitialState = 0; // State the block starts in after rewriting.
138   bool NeedsLowering = false; // Block contains kills/demotes to lower.
139 };
140 
141 struct WorkItem {
142   MachineBasicBlock *MBB = nullptr;
143   MachineInstr *MI = nullptr;
144 
145   WorkItem() = default;
146   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147   WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149 
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152   const SIInstrInfo *TII;
153   const SIRegisterInfo *TRI;
154   const GCNSubtarget *ST;
155   MachineRegisterInfo *MRI;
156   LiveIntervals *LIS;
157   MachineDominatorTree *MDT;
158   MachinePostDominatorTree *PDT;
159 
160   unsigned AndOpc;
161   unsigned AndTermOpc;
162   unsigned AndN2Opc;
163   unsigned XorOpc;
164   unsigned AndSaveExecOpc;
165   unsigned AndSaveExecTermOpc;
166   unsigned WQMOpc;
167   Register Exec;
168   Register LiveMaskReg;
169 
170   DenseMap<const MachineInstr *, InstrInfo> Instructions;
171   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172 
173   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174   DenseMap<const MachineInstr *, char> StateTransition;
175 
176   SmallVector<MachineInstr *, 2> LiveMaskQueries;
177   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179   SmallVector<MachineInstr *, 4> KillInstrs;
180   SmallVector<MachineInstr *, 4> InitExecInstrs;
181 
182   void printInfo();
183 
184   void markInstruction(MachineInstr &MI, char Flag,
185                        std::vector<WorkItem> &Worklist);
186   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189                    std::vector<WorkItem> &Worklist);
190   void markInstructionUses(const MachineInstr &MI, char Flag,
191                            std::vector<WorkItem> &Worklist);
192   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195   char analyzeFunction(MachineFunction &MF);
196 
197   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198                                       MachineBasicBlock::iterator Before);
199   MachineBasicBlock::iterator
200   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201                    MachineBasicBlock::iterator Last, bool PreferLast,
202                    bool SaveSCC);
203   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204                Register SaveWQM);
205   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206              Register SavedWQM);
207   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208                     Register SaveOrig, char StrictStateNeeded);
209   void fromStrictMode(MachineBasicBlock &MBB,
210                       MachineBasicBlock::iterator Before, Register SavedOrig,
211                       char NonStrictState, char CurrentStrictState);
212 
213   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
214 
215   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216                             bool IsWQM);
217   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218 
219   void lowerBlock(MachineBasicBlock &MBB);
220   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221 
222   void lowerLiveMaskQueries();
223   void lowerCopyInstrs();
224   void lowerKillInstrs(bool IsWQM);
225   void lowerInitExec(MachineInstr &MI);
226   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
227 
228 public:
229   static char ID;
230 
231   SIWholeQuadMode() :
232     MachineFunctionPass(ID) { }
233 
234   bool runOnMachineFunction(MachineFunction &MF) override;
235 
236   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
237 
238   void getAnalysisUsage(AnalysisUsage &AU) const override {
239     AU.addRequired<LiveIntervals>();
240     AU.addPreserved<SlotIndexes>();
241     AU.addPreserved<LiveIntervals>();
242     AU.addPreserved<MachineDominatorTreeWrapperPass>();
243     AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
244     MachineFunctionPass::getAnalysisUsage(AU);
245   }
246 
247   MachineFunctionProperties getClearedProperties() const override {
248     return MachineFunctionProperties().set(
249         MachineFunctionProperties::Property::IsSSA);
250   }
251 };
252 
253 } // end anonymous namespace
254 
255 char SIWholeQuadMode::ID = 0;
256 
257 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258                       false)
259 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
261 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
262 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263                     false)
264 
265 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
266 
267 FunctionPass *llvm::createSIWholeQuadModePass() {
268   return new SIWholeQuadMode;
269 }
270 
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273   for (const auto &BII : Blocks) {
274     dbgs() << "\n"
275            << printMBBReference(*BII.first) << ":\n"
276            << "  InNeeds = " << PrintState(BII.second.InNeeds)
277            << ", Needs = " << PrintState(BII.second.Needs)
278            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279 
280     for (const MachineInstr &MI : *BII.first) {
281       auto III = Instructions.find(&MI);
282       if (III != Instructions.end()) {
283         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
284                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
285       }
286     }
287   }
288 }
289 #endif
290 
291 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
292                                       std::vector<WorkItem> &Worklist) {
293   InstrInfo &II = Instructions[&MI];
294 
295   assert(!(Flag & StateExact) && Flag != 0);
296 
297   // Remove any disabled states from the flag. The user that required it gets
298   // an undefined value in the helper lanes. For example, this can happen if
299   // the result of an atomic is used by an instruction that requires WQM, where
300   // ignoring the request for WQM is correct as per the relevant specs.
301   Flag &= ~II.Disabled;
302 
303   // Ignore if the flag is already encompassed by the existing needs, or we
304   // just disabled everything.
305   if ((II.Needs & Flag) == Flag)
306     return;
307 
308   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
309   II.Needs |= Flag;
310   Worklist.push_back(&MI);
311 }
312 
313 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
314 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
315                                Register Reg, unsigned SubReg, char Flag,
316                                std::vector<WorkItem> &Worklist) {
317   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
318 
319   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
320   const VNInfo *Value = UseLRQ.valueIn();
321   if (!Value)
322     return;
323 
324   // Note: this code assumes that lane masks on AMDGPU completely
325   // cover registers.
326   const LaneBitmask UseLanes =
327       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
328              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
329                                 : LaneBitmask::getNone());
330 
331   // Perform a depth-first iteration of the LiveRange graph marking defs.
332   // Stop processing of a given branch when all use lanes have been defined.
333   // The first definition stops processing for a physical register.
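  // For example, when a 64-bit use is fed by two separate 32-bit sub-register
  // writes, both defining instructions get marked: the walk only stops once
  // the accumulated DefinedLanes cover all UseLanes, or a value is revisited
  // with the same set of defined lanes.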
334   struct PhiEntry {
335     const VNInfo *Phi;
336     unsigned PredIdx;
337     LaneBitmask DefinedLanes;
338 
339     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
340         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
341   };
342   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
343   SmallVector<PhiEntry, 2> PhiStack;
344   SmallSet<VisitKey, 4> Visited;
345   LaneBitmask DefinedLanes;
346   unsigned NextPredIdx = 0; // Only used for processing phi nodes
347   do {
348     const VNInfo *NextValue = nullptr;
349     const VisitKey Key(Value, DefinedLanes);
350 
351     if (Visited.insert(Key).second) {
352       // On the first visit to a phi, start processing its first predecessor
353       NextPredIdx = 0;
354     }
355 
356     if (Value->isPHIDef()) {
357       // Each predecessor node in the phi must be processed as a subgraph
358       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
359       assert(MBB && "Phi-def has no defining MBB");
360 
361       // Find next predecessor to process
362       unsigned Idx = NextPredIdx;
363       auto PI = MBB->pred_begin() + Idx;
364       auto PE = MBB->pred_end();
365       for (; PI != PE && !NextValue; ++PI, ++Idx) {
366         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
367           if (!Visited.count(VisitKey(VN, DefinedLanes)))
368             NextValue = VN;
369         }
370       }
371 
372       // If there are more predecessors to process, add the phi to the stack
373       if (PI != PE)
374         PhiStack.emplace_back(Value, Idx, DefinedLanes);
375     } else {
376       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
377       assert(MI && "Def has no defining instruction");
378 
379       if (Reg.isVirtual()) {
380         // Iterate over all operands to find relevant definitions
381         bool HasDef = false;
382         for (const MachineOperand &Op : MI->all_defs()) {
383           if (Op.getReg() != Reg)
384             continue;
385 
386           // Compute lanes defined and overlap with use
387           LaneBitmask OpLanes =
388               Op.isUndef() ? LaneBitmask::getAll()
389                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
390           LaneBitmask Overlap = (UseLanes & OpLanes);
391 
392           // Record whether this instruction defined any lanes of the use
393           HasDef |= Overlap.any();
394 
395           // Mark any lanes defined
396           DefinedLanes |= OpLanes;
397         }
398 
399         // Check if all lanes of use have been defined
400         if ((DefinedLanes & UseLanes) != UseLanes) {
401           // Definition not complete; need to process input value
402           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
403           if (const VNInfo *VN = LRQ.valueIn()) {
404             if (!Visited.count(VisitKey(VN, DefinedLanes)))
405               NextValue = VN;
406           }
407         }
408 
409         // Only mark the instruction if it defines some part of the use
410         if (HasDef)
411           markInstruction(*MI, Flag, Worklist);
412       } else {
413         // For physical registers simply mark the defining instruction
414         markInstruction(*MI, Flag, Worklist);
415       }
416     }
417 
418     if (!NextValue && !PhiStack.empty()) {
419       // Reached the end of a chain; revert to processing the last phi
420       PhiEntry &Entry = PhiStack.back();
421       NextValue = Entry.Phi;
422       NextPredIdx = Entry.PredIdx;
423       DefinedLanes = Entry.DefinedLanes;
424       PhiStack.pop_back();
425     }
426 
427     Value = NextValue;
428   } while (Value);
429 }
430 
431 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
432                                   const MachineOperand &Op, char Flag,
433                                   std::vector<WorkItem> &Worklist) {
434   assert(Op.isReg());
435   Register Reg = Op.getReg();
436 
437   // Ignore some hardware registers
438   switch (Reg) {
439   case AMDGPU::EXEC:
440   case AMDGPU::EXEC_LO:
441     return;
442   default:
443     break;
444   }
445 
446   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
447                     << " for " << MI);
448   if (Reg.isVirtual()) {
449     LiveRange &LR = LIS->getInterval(Reg);
450     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
451   } else {
452     // Handle physical registers that we need to track; this is mostly relevant
453     // for VCC, which can appear as the (implicit) input of a uniform branch,
454     // e.g. when a loop counter is stored in a VGPR.
455     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
456       LiveRange &LR = LIS->getRegUnit(Unit);
457       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
458       if (Value)
459         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
460     }
461   }
462 }
463 
464 /// Mark all instructions defining the uses in \p MI with \p Flag.
465 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
466                                           std::vector<WorkItem> &Worklist) {
467   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
468                     << MI);
469 
470   for (const MachineOperand &Use : MI.all_uses())
471     markOperand(MI, Use, Flag, Worklist);
472 }
473 
474 // Scan instructions to determine which ones require an Exact execmask and
475 // which ones seed WQM requirements.
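// For example, sampling instructions (TII->isWQM) seed WQM because their
// inputs must also be computed in helper lanes for derivatives, whereas
// instructions marked as disabling WQM (e.g. stores and atomics) must run in
// Exact and seed StateExact for their block.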
476 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
477                                        std::vector<WorkItem> &Worklist) {
478   char GlobalFlags = 0;
479   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
480   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
481   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
482   bool HasImplicitDerivatives =
483       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
484 
485   // We need to visit the basic blocks in reverse post-order so that we visit
486   // defs before uses, in particular so that we don't accidentally mark an
487   // instruction as needing e.g. WQM before visiting it and realizing it needs
488   // WQM disabled.
489   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
490   for (MachineBasicBlock *MBB : RPOT) {
491     BlockInfo &BBI = Blocks[MBB];
492 
493     for (MachineInstr &MI : *MBB) {
494       InstrInfo &III = Instructions[&MI];
495       unsigned Opcode = MI.getOpcode();
496       char Flags = 0;
497 
498       if (TII->isWQM(Opcode)) {
499         // If LOD is not supported, WQM is not needed.
500         // Only generate implicit WQM if implicit derivatives are required.
501         // This avoids inserting unintended WQM if a shader type without
502         // implicit derivatives uses an image sampling instruction.
503         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
504           // Sampling instructions don't need to produce results for all pixels
505           // in a quad, they just require all inputs of a quad to have been
506           // computed for derivatives.
507           markInstructionUses(MI, StateWQM, Worklist);
508           GlobalFlags |= StateWQM;
509         }
510       } else if (Opcode == AMDGPU::WQM) {
511         // The WQM intrinsic requires its output to have all the helper lanes
512         // correct, so we need it to be in WQM.
513         Flags = StateWQM;
514         LowerToCopyInstrs.push_back(&MI);
515       } else if (Opcode == AMDGPU::SOFT_WQM) {
516         LowerToCopyInstrs.push_back(&MI);
517         SoftWQMInstrs.push_back(&MI);
518       } else if (Opcode == AMDGPU::STRICT_WWM) {
519         // The STRICT_WWM intrinsic doesn't make the same guarantee; moreover,
520         // it needs to be executed in WQM or Exact so that its copy doesn't
521         // clobber inactive lanes.
522         markInstructionUses(MI, StateStrictWWM, Worklist);
523         GlobalFlags |= StateStrictWWM;
524         LowerToMovInstrs.push_back(&MI);
525       } else if (Opcode == AMDGPU::STRICT_WQM ||
526                  TII->isDualSourceBlendEXP(MI)) {
527         // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
528         // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
529         // quads that have at least one active thread.
530         markInstructionUses(MI, StateStrictWQM, Worklist);
531         GlobalFlags |= StateStrictWQM;
532 
533         if (Opcode == AMDGPU::STRICT_WQM) {
534           LowerToMovInstrs.push_back(&MI);
535         } else {
536           // Dual source blend export acts as an implicit strict-wqm operation:
537           // its sources need to be shuffled in strict WQM, but the export itself
538           // needs to run in exact mode.
539           BBI.Needs |= StateExact;
540           if (!(BBI.InNeeds & StateExact)) {
541             BBI.InNeeds |= StateExact;
542             Worklist.push_back(MBB);
543           }
544           GlobalFlags |= StateExact;
545           III.Disabled = StateWQM | StateStrict;
546         }
547       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
548                  Opcode == AMDGPU::DS_PARAM_LOAD ||
549                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
550                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
551         // Mark these STRICT_WQM, but only for the instruction, not its operands.
552         // This avoids unnecessarily marking M0 as requiring WQM.
553         InstrInfo &II = Instructions[&MI];
554         II.Needs |= StateStrictWQM;
555         GlobalFlags |= StateStrictWQM;
556       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
557                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
558         III.Disabled = StateStrict;
559         MachineOperand &Inactive = MI.getOperand(2);
560         if (Inactive.isReg()) {
561           if (Inactive.isUndef()) {
562             LowerToCopyInstrs.push_back(&MI);
563           } else {
564             markOperand(MI, Inactive, StateStrictWWM, Worklist);
565           }
566         }
567         SetInactiveInstrs.push_back(&MI);
568       } else if (TII->isDisableWQM(MI)) {
569         BBI.Needs |= StateExact;
570         if (!(BBI.InNeeds & StateExact)) {
571           BBI.InNeeds |= StateExact;
572           Worklist.push_back(MBB);
573         }
574         GlobalFlags |= StateExact;
575         III.Disabled = StateWQM | StateStrict;
576       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
577                  Opcode == AMDGPU::SI_LIVE_MASK) {
578         LiveMaskQueries.push_back(&MI);
579       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
580                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
581                  Opcode == AMDGPU::SI_DEMOTE_I1) {
582         KillInstrs.push_back(&MI);
583         BBI.NeedsLowering = true;
584       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
585                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
586         InitExecInstrs.push_back(&MI);
587       } else if (WQMOutputs) {
588         // The function is in machine SSA form, which means that physical
589         // VGPRs correspond to shader inputs and outputs. Inputs are
590         // only used, outputs are only defined.
591         // FIXME: is this still valid?
592         for (const MachineOperand &MO : MI.defs()) {
593           Register Reg = MO.getReg();
594           if (Reg.isPhysical() &&
595               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
596             Flags = StateWQM;
597             break;
598           }
599         }
600       }
601 
602       if (Flags) {
603         markInstruction(MI, Flags, Worklist);
604         GlobalFlags |= Flags;
605       }
606     }
607   }
608 
609   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
610   // ever used anywhere in the function. This implements the corresponding
611   // semantics of @llvm.amdgcn.set.inactive.
612   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
613   if (GlobalFlags & StateWQM) {
614     for (MachineInstr *MI : SetInactiveInstrs)
615       markInstruction(*MI, StateWQM, Worklist);
616     for (MachineInstr *MI : SoftWQMInstrs)
617       markInstruction(*MI, StateWQM, Worklist);
618   }
619 
620   return GlobalFlags;
621 }
622 
623 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
624                                             std::vector<WorkItem> &Worklist) {
625   MachineBasicBlock *MBB = MI.getParent();
626   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
627   BlockInfo &BI = Blocks[MBB];
628 
629   // Control flow-type instructions and stores to temporary memory that are
630   // followed by WQM computations must themselves be in WQM.
631   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
632       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
633     Instructions[&MI].Needs = StateWQM;
634     II.Needs = StateWQM;
635   }
636 
637   // Propagate to block level
638   if (II.Needs & StateWQM) {
639     BI.Needs |= StateWQM;
640     if (!(BI.InNeeds & StateWQM)) {
641       BI.InNeeds |= StateWQM;
642       Worklist.push_back(MBB);
643     }
644   }
645 
646   // Propagate backwards within block
647   if (MachineInstr *PrevMI = MI.getPrevNode()) {
648     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
649     if (!PrevMI->isPHI()) {
650       InstrInfo &PrevII = Instructions[PrevMI];
651       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
652         PrevII.OutNeeds |= InNeeds;
653         Worklist.push_back(PrevMI);
654       }
655     }
656   }
657 
658   // Propagate WQM flag to instruction inputs
659   assert(!(II.Needs & StateExact));
660 
661   if (II.Needs != 0)
662     markInstructionUses(MI, II.Needs, Worklist);
663 
664   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
665   // not require any WQM transitions.
666   if (II.Needs & StateStrictWWM)
667     BI.Needs |= StateStrictWWM;
668   if (II.Needs & StateStrictWQM)
669     BI.Needs |= StateStrictWQM;
670 }
671 
672 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
673                                       std::vector<WorkItem> &Worklist) {
674   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
675 
676   // Propagate through instructions
677   if (!MBB.empty()) {
678     MachineInstr *LastMI = &*MBB.rbegin();
679     InstrInfo &LastII = Instructions[LastMI];
680     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
681       LastII.OutNeeds |= BI.OutNeeds;
682       Worklist.push_back(LastMI);
683     }
684   }
685 
686   // Predecessor blocks must provide for our WQM/Exact needs.
687   for (MachineBasicBlock *Pred : MBB.predecessors()) {
688     BlockInfo &PredBI = Blocks[Pred];
689     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
690       continue;
691 
692     PredBI.OutNeeds |= BI.InNeeds;
693     PredBI.InNeeds |= BI.InNeeds;
694     Worklist.push_back(Pred);
695   }
696 
697   // All successors must be prepared to accept the same set of WQM/Exact data.
698   for (MachineBasicBlock *Succ : MBB.successors()) {
699     BlockInfo &SuccBI = Blocks[Succ];
700     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
701       continue;
702 
703     SuccBI.InNeeds |= BI.OutNeeds;
704     Worklist.push_back(Succ);
705   }
706 }
707 
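// Run the marking to a fixed point: requirements propagate from uses to the
// instructions that define them, backwards within each block, and between
// blocks through their InNeeds/OutNeeds sets, until nothing changes.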
708 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
709   std::vector<WorkItem> Worklist;
710   char GlobalFlags = scanInstructions(MF, Worklist);
711 
712   while (!Worklist.empty()) {
713     WorkItem WI = Worklist.back();
714     Worklist.pop_back();
715 
716     if (WI.MI)
717       propagateInstruction(*WI.MI, Worklist);
718     else
719       propagateBlock(*WI.MBB, Worklist);
720   }
721 
722   return GlobalFlags;
723 }
724 
725 MachineBasicBlock::iterator
726 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
727                          MachineBasicBlock::iterator Before) {
728   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
729 
730   MachineInstr *Save =
731       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
732           .addReg(AMDGPU::SCC);
733   MachineInstr *Restore =
734       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
735           .addReg(SaveReg);
736 
737   LIS->InsertMachineInstrInMaps(*Save);
738   LIS->InsertMachineInstrInMaps(*Restore);
739   LIS->createAndComputeVirtRegInterval(SaveReg);
740 
741   return Restore;
742 }
743 
744 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
745                                                MachineInstr *TermMI) {
746   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
747                     << *TermMI << "\n");
748 
749   MachineBasicBlock *SplitBB =
750       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
751 
752   // Convert last instruction in block to a terminator.
753   // Note: this only covers the expected patterns
754   unsigned NewOpcode = 0;
755   switch (TermMI->getOpcode()) {
756   case AMDGPU::S_AND_B32:
757     NewOpcode = AMDGPU::S_AND_B32_term;
758     break;
759   case AMDGPU::S_AND_B64:
760     NewOpcode = AMDGPU::S_AND_B64_term;
761     break;
762   case AMDGPU::S_MOV_B32:
763     NewOpcode = AMDGPU::S_MOV_B32_term;
764     break;
765   case AMDGPU::S_MOV_B64:
766     NewOpcode = AMDGPU::S_MOV_B64_term;
767     break;
768   default:
769     break;
770   }
771   if (NewOpcode)
772     TermMI->setDesc(TII->get(NewOpcode));
773 
774   if (SplitBB != BB) {
775     // Update dominator trees
776     using DomTreeT = DomTreeBase<MachineBasicBlock>;
777     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
778     for (MachineBasicBlock *Succ : SplitBB->successors()) {
779       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
780       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
781     }
782     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
783     if (MDT)
784       MDT->getBase().applyUpdates(DTUpdates);
785     if (PDT)
786       PDT->applyUpdates(DTUpdates);
787 
788     // Link blocks
789     MachineInstr *MI =
790         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
791             .addMBB(SplitBB);
792     LIS->InsertMachineInstrInMaps(*MI);
793   }
794 
795   return SplitBB;
796 }
797 
798 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
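// Lower SI_KILL_F32_COND_IMM_TERMINATOR. A rough sketch of the emitted
// sequence (wave64 opcodes shown):
//
//   V_CMP_<inverted>_F32 vcc, op1, op0   ; vcc = lanes being killed
//   S_ANDN2_B64 LiveMask, LiveMask, vcc
//   SI_EARLY_TERMINATE_SCC0              ; end the wave if no lanes remain
//   S_ANDN2_B64 exec, exec, vcc
//   S_BRANCH <succ>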
799                                             MachineInstr &MI) {
800   const DebugLoc &DL = MI.getDebugLoc();
801   unsigned Opcode = 0;
802 
803   assert(MI.getOperand(0).isReg());
804 
805   // The comparison is for live lanes; however, here we compute the inverse
806   // (killed lanes). This is because VCMP will always generate 0 bits
807   // for inactive lanes so a mask of live lanes would not be correct
808   // inside control flow.
809   // Invert the comparison by swapping the operands and adjusting
810   // the comparison codes.
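  // For example, SETOLT (lane live if Op0 < Op1) becomes V_CMP_NGT_F32 with
  // swapped operands, i.e. !(Op1 > Op0), which is set exactly for the lanes
  // to kill (Op0 >= Op1, or an unordered comparison).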
811 
812   switch (MI.getOperand(2).getImm()) {
813   case ISD::SETUEQ:
814     Opcode = AMDGPU::V_CMP_LG_F32_e64;
815     break;
816   case ISD::SETUGT:
817     Opcode = AMDGPU::V_CMP_GE_F32_e64;
818     break;
819   case ISD::SETUGE:
820     Opcode = AMDGPU::V_CMP_GT_F32_e64;
821     break;
822   case ISD::SETULT:
823     Opcode = AMDGPU::V_CMP_LE_F32_e64;
824     break;
825   case ISD::SETULE:
826     Opcode = AMDGPU::V_CMP_LT_F32_e64;
827     break;
828   case ISD::SETUNE:
829     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
830     break;
831   case ISD::SETO:
832     Opcode = AMDGPU::V_CMP_O_F32_e64;
833     break;
834   case ISD::SETUO:
835     Opcode = AMDGPU::V_CMP_U_F32_e64;
836     break;
837   case ISD::SETOEQ:
838   case ISD::SETEQ:
839     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
840     break;
841   case ISD::SETOGT:
842   case ISD::SETGT:
843     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
844     break;
845   case ISD::SETOGE:
846   case ISD::SETGE:
847     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
848     break;
849   case ISD::SETOLT:
850   case ISD::SETLT:
851     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
852     break;
853   case ISD::SETOLE:
854   case ISD::SETLE:
855     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
856     break;
857   case ISD::SETONE:
858   case ISD::SETNE:
859     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
860     break;
861   default:
862     llvm_unreachable("invalid ISD:SET cond code");
863   }
864 
865   // Pick opcode based on comparison type.
866   MachineInstr *VcmpMI;
867   const MachineOperand &Op0 = MI.getOperand(0);
868   const MachineOperand &Op1 = MI.getOperand(1);
869 
870   // VCC represents lanes killed.
871   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
872 
873   if (TRI->isVGPR(*MRI, Op0.getReg())) {
874     Opcode = AMDGPU::getVOPe32(Opcode);
875     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
876   } else {
877     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
878                  .addReg(VCC, RegState::Define)
879                  .addImm(0) // src0 modifiers
880                  .add(Op1)
881                  .addImm(0) // src1 modifiers
882                  .add(Op0)
883                  .addImm(0); // omod
884   }
885 
886   MachineInstr *MaskUpdateMI =
887       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
888           .addReg(LiveMaskReg)
889           .addReg(VCC);
890 
891   // The state of SCC represents whether any lanes are live in the mask;
892   // if SCC is 0 then no lanes will be alive anymore.
893   MachineInstr *EarlyTermMI =
894       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
895 
896   MachineInstr *ExecMaskMI =
897       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
898 
899   assert(MBB.succ_size() == 1);
900   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
901                               .addMBB(*MBB.succ_begin());
902 
903   // Update live intervals
904   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
905   MBB.remove(&MI);
906 
907   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
908   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
909   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
910   LIS->InsertMachineInstrInMaps(*NewTerm);
911 
912   return NewTerm;
913 }
914 
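// Lower SI_KILL_I1_TERMINATOR and SI_DEMOTE_I1. In the general case the live
// mask is narrowed with S_ANDN2, SI_EARLY_TERMINATE_SCC0 ends the wave if no
// lanes remain, and EXEC is then reduced to drop the killed lanes (for a
// demote in WQM, EXEC becomes the whole-quad expansion of the updated live
// mask). Statically known kills are folded away or turned into EXEC = 0.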
915 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
916                                            MachineInstr &MI, bool IsWQM) {
917   const DebugLoc &DL = MI.getDebugLoc();
918   MachineInstr *MaskUpdateMI = nullptr;
919 
920   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
921   const MachineOperand &Op = MI.getOperand(0);
922   int64_t KillVal = MI.getOperand(1).getImm();
923   MachineInstr *ComputeKilledMaskMI = nullptr;
924   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
925   Register TmpReg;
926 
927   // Is this a static or dynamic kill?
928   if (Op.isImm()) {
929     if (Op.getImm() == KillVal) {
930       // Static: all active lanes are killed
931       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
932                          .addReg(LiveMaskReg)
933                          .addReg(Exec);
934     } else {
935       // Static: kill does nothing
936       MachineInstr *NewTerm = nullptr;
937       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
938         LIS->RemoveMachineInstrFromMaps(MI);
939       } else {
940         assert(MBB.succ_size() == 1);
941         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
942                       .addMBB(*MBB.succ_begin());
943         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
944       }
945       MBB.remove(&MI);
946       return NewTerm;
947     }
948   } else {
949     if (!KillVal) {
950       // Op represents live lanes after kill,
951       // so exec mask needs to be factored in.
952       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
953       ComputeKilledMaskMI =
954           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
955       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
956                          .addReg(LiveMaskReg)
957                          .addReg(TmpReg);
958     } else {
959       // Op represents lanes to kill
960       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
961                          .addReg(LiveMaskReg)
962                          .add(Op);
963     }
964   }
965 
966   // The state of SCC represents whether any lanes are live in the mask;
967   // if SCC is 0 then no lanes will be alive anymore.
968   MachineInstr *EarlyTermMI =
969       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
970 
971   // If we got this far, some lanes are still live;
972   // update EXEC to deactivate lanes as appropriate.
973   MachineInstr *NewTerm;
974   MachineInstr *WQMMaskMI = nullptr;
975   Register LiveMaskWQM;
976   if (IsDemote) {
977     // Demote - deactivate quads with only helper lanes
978     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
979     WQMMaskMI =
980         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
981     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
982                   .addReg(Exec)
983                   .addReg(LiveMaskWQM);
984   } else {
985     // Kill - deactivate lanes no longer in live mask
986     if (Op.isImm()) {
987       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
988       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
989     } else if (!IsWQM) {
990       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
991                     .addReg(Exec)
992                     .addReg(LiveMaskReg);
993     } else {
994       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
995       NewTerm =
996           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
997     }
998   }
999 
1000   // Update live intervals
1001   LIS->RemoveMachineInstrFromMaps(MI);
1002   MBB.remove(&MI);
1003   assert(EarlyTermMI);
1004   assert(MaskUpdateMI);
1005   assert(NewTerm);
1006   if (ComputeKilledMaskMI)
1007     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1008   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1009   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1010   if (WQMMaskMI)
1011     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1012   LIS->InsertMachineInstrInMaps(*NewTerm);
1013 
1014   if (CndReg) {
1015     LIS->removeInterval(CndReg);
1016     LIS->createAndComputeVirtRegInterval(CndReg);
1017   }
1018   if (TmpReg)
1019     LIS->createAndComputeVirtRegInterval(TmpReg);
1020   if (LiveMaskWQM)
1021     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1022 
1023   return NewTerm;
1024 }
1025 
1026 // Replace (or supplement) instructions accessing the live mask.
1027 // This can only happen once all the live mask registers have been created
1028 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1029 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1030   auto BII = Blocks.find(&MBB);
1031   if (BII == Blocks.end())
1032     return;
1033 
1034   const BlockInfo &BI = BII->second;
1035   if (!BI.NeedsLowering)
1036     return;
1037 
1038   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1039 
1040   SmallVector<MachineInstr *, 4> SplitPoints;
1041   char State = BI.InitialState;
1042 
1043   for (MachineInstr &MI : llvm::make_early_inc_range(
1044            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1045     if (StateTransition.count(&MI))
1046       State = StateTransition[&MI];
1047 
1048     MachineInstr *SplitPoint = nullptr;
1049     switch (MI.getOpcode()) {
1050     case AMDGPU::SI_DEMOTE_I1:
1051     case AMDGPU::SI_KILL_I1_TERMINATOR:
1052       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1053       break;
1054     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1055       SplitPoint = lowerKillF32(MBB, MI);
1056       break;
1057     default:
1058       break;
1059     }
1060     if (SplitPoint)
1061       SplitPoints.push_back(SplitPoint);
1062   }
1063 
1064   // Perform splitting after instruction scan to simplify iteration.
1065   if (!SplitPoints.empty()) {
1066     MachineBasicBlock *BB = &MBB;
1067     for (MachineInstr *MI : SplitPoints) {
1068       BB = splitBlock(BB, MI);
1069     }
1070   }
1071 }
1072 
1073 // Return an iterator in the (inclusive) range [First, Last] at which
1074 // instructions can be safely inserted, keeping in mind that some of the
1075 // instructions we want to add necessarily clobber SCC.
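// If SCC is live at the preferred point, the insertion point is shifted within
// [First, Last] to a spot where SCC is dead; failing that, SCC is saved and
// restored around the insertion via saveSCC().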
1076 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1077     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1078     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1079   if (!SaveSCC)
1080     return PreferLast ? Last : First;
1081 
1082   LiveRange &LR =
1083       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1084   auto MBBE = MBB.end();
1085   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1086                                      : LIS->getMBBEndIdx(&MBB);
1087   SlotIndex LastIdx =
1088       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1089   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1090   const LiveRange::Segment *S;
1091 
1092   for (;;) {
1093     S = LR.getSegmentContaining(Idx);
1094     if (!S)
1095       break;
1096 
1097     if (PreferLast) {
1098       SlotIndex Next = S->start.getBaseIndex();
1099       if (Next < FirstIdx)
1100         break;
1101       Idx = Next;
1102     } else {
1103       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1104       assert(EndMI && "Segment does not end on valid instruction");
1105       auto NextI = std::next(EndMI->getIterator());
1106       if (NextI == MBB.end())
1107         break;
1108       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1109       if (Next > LastIdx)
1110         break;
1111       Idx = Next;
1112     }
1113   }
1114 
1115   MachineBasicBlock::iterator MBBI;
1116 
1117   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1118     MBBI = MI;
1119   else {
1120     assert(Idx == LIS->getMBBEndIdx(&MBB));
1121     MBBI = MBB.end();
1122   }
1123 
1124   // Move insertion point past any operations modifying EXEC.
1125   // This assumes that the value of SCC defined by any of these operations
1126   // does not need to be preserved.
1127   while (MBBI != Last) {
1128     bool IsExecDef = false;
1129     for (const MachineOperand &MO : MBBI->all_defs()) {
1130       IsExecDef |=
1131           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1132     }
1133     if (!IsExecDef)
1134       break;
1135     MBBI++;
1136     S = nullptr;
1137   }
1138 
1139   if (S)
1140     MBBI = saveSCC(MBB, MBBI);
1141 
1142   return MBBI;
1143 }
1144 
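// Switch to Exact mode by ANDing EXEC with the live mask. If SaveWQM is given,
// the previous (WQM) EXEC is saved into it so that toWQM() can restore it
// later; _term opcodes are used when inserting among block terminators.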
1145 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1146                               MachineBasicBlock::iterator Before,
1147                               Register SaveWQM) {
1148   bool IsTerminator = Before == MBB.end();
1149   if (!IsTerminator) {
1150     auto FirstTerm = MBB.getFirstTerminator();
1151     if (FirstTerm != MBB.end()) {
1152       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1153       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1154       IsTerminator = BeforeIdx > FirstTermIdx;
1155     }
1156   }
1157 
1158   MachineInstr *MI;
1159 
1160   if (SaveWQM) {
1161     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1162     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1163              .addReg(LiveMaskReg);
1164   } else {
1165     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1166     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1167              .addReg(Exec)
1168              .addReg(LiveMaskReg);
1169   }
1170 
1171   LIS->InsertMachineInstrInMaps(*MI);
1172   StateTransition[MI] = StateExact;
1173 }
1174 
1175 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1176                             MachineBasicBlock::iterator Before,
1177                             Register SavedWQM) {
1178   MachineInstr *MI;
1179 
1180   if (SavedWQM) {
1181     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1182              .addReg(SavedWQM);
1183   } else {
1184     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1185   }
1186 
1187   LIS->InsertMachineInstrInMaps(*MI);
1188   StateTransition[MI] = StateWQM;
1189 }
1190 
1191 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1192                                    MachineBasicBlock::iterator Before,
1193                                    Register SaveOrig, char StrictStateNeeded) {
1194   MachineInstr *MI;
1195   assert(SaveOrig);
1196   assert(StrictStateNeeded == StateStrictWWM ||
1197          StrictStateNeeded == StateStrictWQM);
1198 
1199   if (StrictStateNeeded == StateStrictWWM) {
1200     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1201                  SaveOrig)
1202              .addImm(-1);
1203   } else {
1204     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1205                  SaveOrig)
1206              .addImm(-1);
1207   }
1208   LIS->InsertMachineInstrInMaps(*MI);
1209   StateTransition[MI] = StrictStateNeeded;
1210 }
1211 
1212 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1213                                      MachineBasicBlock::iterator Before,
1214                                      Register SavedOrig, char NonStrictState,
1215                                      char CurrentStrictState) {
1216   MachineInstr *MI;
1217 
1218   assert(SavedOrig);
1219   assert(CurrentStrictState == StateStrictWWM ||
1220          CurrentStrictState == StateStrictWQM);
1221 
1222   if (CurrentStrictState == StateStrictWWM) {
1223     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1224                  Exec)
1225              .addReg(SavedOrig);
1226   } else {
1227     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1228                  Exec)
1229              .addReg(SavedOrig);
1230   }
1231   LIS->InsertMachineInstrInMaps(*MI);
1232   StateTransition[MI] = NonStrictState;
1233 }
1234 
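// Walk the block and insert the EXEC manipulation required wherever the state
// computed by the analysis changes. For instance, in a pixel shader entry
// block a sampling instruction followed by a store would roughly yield:
//
//   S_WQM_B64 exec, exec              ; enter WQM for the sample
//   ...
//   S_AND_B64 exec, exec, LiveMask    ; back to Exact for the store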
1235 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1236   auto BII = Blocks.find(&MBB);
1237   if (BII == Blocks.end())
1238     return;
1239 
1240   BlockInfo &BI = BII->second;
1241 
1242   // This is a non-entry block that is WQM throughout, so no need to do
1243   // anything.
1244   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1245     BI.InitialState = StateWQM;
1246     return;
1247   }
1248 
1249   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1250                     << ":\n");
1251 
1252   Register SavedWQMReg;
1253   Register SavedNonStrictReg;
1254   bool WQMFromExec = IsEntry;
1255   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1256   char NonStrictState = 0;
1257   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1258 
1259   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1260   if (IsEntry) {
1261     // Skip the instruction that saves LiveMask
1262     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1263         II->getOperand(1).getReg() == TRI->getExec())
1264       ++II;
1265   }
1266 
1267   // This stores the first instruction where it's safe to switch from WQM to
1268   // Exact or vice versa.
1269   MachineBasicBlock::iterator FirstWQM = IE;
1270 
1271   // This stores the first instruction where it's safe to switch from Strict
1272   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1273   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1274   // be safe to switch to/from WQM as well.
1275   MachineBasicBlock::iterator FirstStrict = IE;
1276 
1277   // Record the initial state in the block information.
1278   BI.InitialState = State;
1279 
1280   for (;;) {
1281     MachineBasicBlock::iterator Next = II;
1282     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1283     char OutNeeds = 0;
1284 
1285     if (FirstWQM == IE)
1286       FirstWQM = II;
1287 
1288     if (FirstStrict == IE)
1289       FirstStrict = II;
1290 
1291     // First, figure out the allowed states (Needs) based on the propagated
1292     // flags.
1293     if (II != IE) {
1294       MachineInstr &MI = *II;
1295 
1296       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1297         auto III = Instructions.find(&MI);
1298         if (III != Instructions.end()) {
1299           if (III->second.Needs & StateStrictWWM)
1300             Needs = StateStrictWWM;
1301           else if (III->second.Needs & StateStrictWQM)
1302             Needs = StateStrictWQM;
1303           else if (III->second.Needs & StateWQM)
1304             Needs = StateWQM;
1305           else
1306             Needs &= ~III->second.Disabled;
1307           OutNeeds = III->second.OutNeeds;
1308         }
1309       } else {
1310         // If the instruction doesn't actually need a correct EXEC, then we can
1311         // safely leave Strict mode enabled.
1312         Needs = StateExact | StateWQM | StateStrict;
1313       }
1314 
1315       // Exact mode exit can occur in terminators, but must be before branches.
1316       if (MI.isBranch() && OutNeeds == StateExact)
1317         Needs = StateExact;
1318 
1319       ++Next;
1320     } else {
1321       // End of basic block
1322       if (BI.OutNeeds & StateWQM)
1323         Needs = StateWQM;
1324       else if (BI.OutNeeds == StateExact)
1325         Needs = StateExact;
1326       else
1327         Needs = StateWQM | StateExact;
1328     }
1329 
1330     // Now, transition if necessary.
1331     if (!(Needs & State)) {
1332       MachineBasicBlock::iterator First;
1333       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1334           State == StateStrictWQM || Needs == StateStrictWQM) {
1335         // We must switch to or from Strict mode.
1336         First = FirstStrict;
1337       } else {
1338         // We only need to switch to/from WQM, so we can use FirstWQM.
1339         First = FirstWQM;
1340       }
1341 
1342       // Whether we need to save SCC depends on start and end states.
1343       bool SaveSCC = false;
1344       switch (State) {
1345       case StateExact:
1346       case StateStrictWWM:
1347       case StateStrictWQM:
1348         // Exact/Strict -> Strict: save SCC
1349         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1350         // Exact/Strict -> Exact: no save
1351         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1352         break;
1353       case StateWQM:
1354         // WQM -> Exact/Strict: save SCC
1355         SaveSCC = !(Needs & StateWQM);
1356         break;
1357       default:
1358         llvm_unreachable("Unknown state");
1359         break;
1360       }
1361       MachineBasicBlock::iterator Before =
1362           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1363 
1364       if (State & StateStrict) {
1365         assert(State == StateStrictWWM || State == StateStrictWQM);
1366         assert(SavedNonStrictReg);
1367         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1368 
1369         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1370         SavedNonStrictReg = 0;
1371         State = NonStrictState;
1372       }
1373 
1374       if (Needs & StateStrict) {
1375         NonStrictState = State;
1376         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1377         assert(!SavedNonStrictReg);
1378         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1379 
1380         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1381         State = Needs;
1382 
1383       } else {
1384         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1385           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1386             assert(!SavedWQMReg);
1387             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1388           }
1389 
1390           toExact(MBB, Before, SavedWQMReg);
1391           State = StateExact;
1392         } else if (State == StateExact && (Needs & StateWQM) &&
1393                    !(Needs & StateExact)) {
1394           assert(WQMFromExec == (SavedWQMReg == 0));
1395 
1396           toWQM(MBB, Before, SavedWQMReg);
1397 
1398           if (SavedWQMReg) {
1399             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1400             SavedWQMReg = 0;
1401           }
1402           State = StateWQM;
1403         } else {
1404           // We can get here if we transitioned from StrictWWM to a
1405           // non-StrictWWM state that already matches our needs, in which
1406           // case no further transition is required.
1407           assert(Needs & State);
1408         }
1409       }
1410     }
1411 
1412     if (Needs != (StateExact | StateWQM | StateStrict)) {
1413       if (Needs != (StateExact | StateWQM))
1414         FirstWQM = IE;
1415       FirstStrict = IE;
1416     }
1417 
1418     if (II == IE)
1419       break;
1420 
1421     II = Next;
1422   }
1423   assert(!SavedWQMReg);
1424   assert(!SavedNonStrictReg);
1425 }
1426 
1427 void SIWholeQuadMode::lowerLiveMaskQueries() {
1428   for (MachineInstr *MI : LiveMaskQueries) {
1429     const DebugLoc &DL = MI->getDebugLoc();
1430     Register Dest = MI->getOperand(0).getReg();
1431 
1432     MachineInstr *Copy =
1433         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1434             .addReg(LiveMaskReg);
1435 
1436     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1437     MI->eraseFromParent();
1438   }
1439 }
1440 
1441 void SIWholeQuadMode::lowerCopyInstrs() {
1442   for (MachineInstr *MI : LowerToMovInstrs) {
1443     assert(MI->getNumExplicitOperands() == 2);
1444 
1445     const Register Reg = MI->getOperand(0).getReg();
1446 
1447     const TargetRegisterClass *regClass =
1448         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1449     if (TRI->isVGPRClass(regClass)) {
1450       const unsigned MovOp = TII->getMovOpcode(regClass);
1451       MI->setDesc(TII->get(MovOp));
1452 
1453       // Check that it already implicitly depends on exec (like all VALU movs
1454       // should do).
1455       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1456         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1457       }));
1458     } else {
1459       // Remove early-clobber and exec dependency from simple SGPR copies.
1460       // This allows some to be eliminated during/post RA.
1461       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1462       if (MI->getOperand(0).isEarlyClobber()) {
1463         LIS->removeInterval(Reg);
1464         MI->getOperand(0).setIsEarlyClobber(false);
1465         LIS->createAndComputeVirtRegInterval(Reg);
1466       }
1467       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1468       while (Index >= 0) {
1469         MI->removeOperand(Index);
1470         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1471       }
1472       MI->setDesc(TII->get(AMDGPU::COPY));
1473       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1474     }
1475   }
1476   for (MachineInstr *MI : LowerToCopyInstrs) {
1477     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1478         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1479       assert(MI->getNumExplicitOperands() == 3);
1480       // The only reason we should be here is that V_SET_INACTIVE has an
1481       // undef input, so it is being replaced by a simple copy. There
1482       // should be a second undef source that we should remove.
1483       assert(MI->getOperand(2).isUndef());
1484       MI->removeOperand(2);
1485       MI->untieRegOperand(1);
1486     } else {
1487       assert(MI->getNumExplicitOperands() == 2);
1488     }
1489 
1490     unsigned CopyOp = MI->getOperand(1).isReg()
1491                           ? (unsigned)AMDGPU::COPY
1492                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1493                                 *MRI, MI->getOperand(0)));
1494     MI->setDesc(TII->get(CopyOp));
1495   }
1496 }
1497 
1498 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
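       // Expand the deferred kill/demote pseudos. Lowering may return a split
       // point, in which case the containing block is split there.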
1499   for (MachineInstr *MI : KillInstrs) {
1500     MachineBasicBlock *MBB = MI->getParent();
1501     MachineInstr *SplitPoint = nullptr;
1502     switch (MI->getOpcode()) {
1503     case AMDGPU::SI_DEMOTE_I1:
1504     case AMDGPU::SI_KILL_I1_TERMINATOR:
1505       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1506       break;
1507     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1508       SplitPoint = lowerKillF32(*MBB, *MI);
1509       break;
1510     }
1511     if (SplitPoint)
1512       splitBlock(MBB, SplitPoint);
1513   }
1514 }
1515 
1516 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
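       // SI_INIT_EXEC carries the desired EXEC mask as an immediate; the other
       // form handled here derives the mask from a thread count held in an
       // SGPR input (see below).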
1517   MachineBasicBlock *MBB = MI.getParent();
1518   bool IsWave32 = ST->isWave32();
1519 
1520   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1521     // The new EXEC write should come before all vector instructions.
1522     MachineInstr *InitMI =
1523         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1524                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1525                 Exec)
1526             .addImm(MI.getOperand(0).getImm());
1527     if (LIS) {
1528       LIS->RemoveMachineInstrFromMaps(MI);
1529       LIS->InsertMachineInstrInMaps(*InitMI);
1530     }
1531     MI.eraseFromParent();
1532     return;
1533   }
1534 
1535   // Extract the thread count from an SGPR input and set EXEC accordingly.
1536   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1537   //
1538   // S_BFE_U32 count, input, {shift, 7}
1539   // S_BFM_B64 exec, count, 0
1540   // S_CMP_EQ_U32 count, 64
1541   // S_CMOV_B64 exec, -1
1542   Register InputReg = MI.getOperand(0).getReg();
1543   MachineInstr *FirstMI = &*MBB->begin();
1544   if (InputReg.isVirtual()) {
1545     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1546     assert(DefInstr && DefInstr->isCopy());
1547     if (DefInstr->getParent() == MBB) {
1548       if (DefInstr != FirstMI) {
1549         // If `InputReg` is defined in the current block, its defining
1550         // instruction must also be moved to the beginning of the block.
1551         DefInstr->removeFromParent();
1552         MBB->insert(FirstMI, DefInstr);
1553         if (LIS)
1554           LIS->handleMove(*DefInstr);
1555       } else {
1556         // If the first instruction is the definition, step past it.
1557         FirstMI = &*std::next(FirstMI->getIterator());
1558       }
1559     }
1560   }
1561 
1562   // Insert instruction sequence at block beginning (before vector operations).
1563   const DebugLoc DL = MI.getDebugLoc();
1564   const unsigned WavefrontSize = ST->getWavefrontSize();
1565   const unsigned Mask = (WavefrontSize << 1) - 1;
1566   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
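       // The S_BFE_U32 immediate packs both the bit offset (the masked shift
       // operand, in the low bits) and the field width; 0x70000 requests a
       // 7-bit field, wide enough for any thread count up to 64.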
1567   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1568                    .addReg(InputReg)
1569                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1570   auto BfmMI =
1571       BuildMI(*MBB, FirstMI, DL,
1572               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1573           .addReg(CountReg)
1574           .addImm(0);
1575   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1576                    .addReg(CountReg, RegState::Kill)
1577                    .addImm(WavefrontSize);
1578   auto CmovMI =
1579       BuildMI(*MBB, FirstMI, DL,
1580               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1581               Exec)
1582           .addImm(-1);
1583 
1584   if (!LIS) {
1585     MI.eraseFromParent();
1586     return;
1587   }
1588 
1589   LIS->RemoveMachineInstrFromMaps(MI);
1590   MI.eraseFromParent();
1591 
1592   LIS->InsertMachineInstrInMaps(*BfeMI);
1593   LIS->InsertMachineInstrInMaps(*BfmMI);
1594   LIS->InsertMachineInstrInMaps(*CmpMI);
1595   LIS->InsertMachineInstrInMaps(*CmovMI);
1596 
1597   LIS->removeInterval(InputReg);
1598   LIS->createAndComputeVirtRegInterval(InputReg);
1599   LIS->createAndComputeVirtRegInterval(CountReg);
1600 }
1601 
1602 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1603 /// for instructions that depend on EXEC.
1604 MachineBasicBlock::iterator
1605 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
1606   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1607 
1608   for (MachineInstr *MI : InitExecInstrs) {
1609     // Try to handle undefined cases gracefully:
1610     // - multiple INIT_EXEC instructions
1611     // - INIT_EXEC instructions not in the entry block
1612     if (MI->getParent() == &Entry)
1613       InsertPt = std::next(MI->getIterator());
1614 
1615     lowerInitExec(*MI);
1616   }
1617 
1618   return InsertPt;
1619 }
1620 
1621 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1622   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1623                     << " ------------- \n");
1624   LLVM_DEBUG(MF.dump());
1625 
1626   Instructions.clear();
1627   Blocks.clear();
1628   LiveMaskQueries.clear();
1629   LowerToCopyInstrs.clear();
1630   LowerToMovInstrs.clear();
1631   KillInstrs.clear();
1632   InitExecInstrs.clear();
1633   StateTransition.clear();
1634 
1635   ST = &MF.getSubtarget<GCNSubtarget>();
1636 
1637   TII = ST->getInstrInfo();
1638   TRI = &TII->getRegisterInfo();
1639   MRI = &MF.getRegInfo();
1640   LIS = &getAnalysis<LiveIntervals>();
1641   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1642   MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1643   auto *PDTWrapper =
1644       getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1645   PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1646 
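       // Select the wave-size specific opcodes and EXEC register used by the
       // rest of the pass.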
1647   if (ST->isWave32()) {
1648     AndOpc = AMDGPU::S_AND_B32;
1649     AndTermOpc = AMDGPU::S_AND_B32_term;
1650     AndN2Opc = AMDGPU::S_ANDN2_B32;
1651     XorOpc = AMDGPU::S_XOR_B32;
1652     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1653     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1654     WQMOpc = AMDGPU::S_WQM_B32;
1655     Exec = AMDGPU::EXEC_LO;
1656   } else {
1657     AndOpc = AMDGPU::S_AND_B64;
1658     AndTermOpc = AMDGPU::S_AND_B64_term;
1659     AndN2Opc = AMDGPU::S_ANDN2_B64;
1660     XorOpc = AMDGPU::S_XOR_B64;
1661     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1662     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1663     WQMOpc = AMDGPU::S_WQM_B64;
1664     Exec = AMDGPU::EXEC;
1665   }
1666 
1667   const char GlobalFlags = analyzeFunction(MF);
1668   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1669 
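       // By default the live mask is just EXEC; a separate copy is created
       // below only when kills, live-mask queries, or WQM require one.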
1670   LiveMaskReg = Exec;
1671 
1672   MachineBasicBlock &Entry = MF.front();
1673   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
1674 
1675   // Shader is simple and does not need any state changes or complex lowering.
1676   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1677       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1678     lowerLiveMaskQueries();
1679     return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
1680   }
1681 
1682   // Store a copy of the original live mask when required
1683   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1684     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1685     MachineInstr *MI =
1686         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1687             .addReg(Exec);
1688     LIS->InsertMachineInstrInMaps(*MI);
1689   }
1690 
1691   LLVM_DEBUG(printInfo());
1692 
1693   lowerLiveMaskQueries();
1694   lowerCopyInstrs();
1695 
1696   // Shader only needs WQM
1697   if (GlobalFlags == StateWQM) {
1698     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1699                   .addReg(Exec);
1700     LIS->InsertMachineInstrInMaps(*MI);
1701     lowerKillInstrs(true);
1702   } else {
1703     for (auto BII : Blocks)
1704       processBlock(*BII.first, BII.first == &Entry);
1705     // Lowering blocks causes block splitting, so perform it as a second pass.
1706     for (auto BII : Blocks)
1707       lowerBlock(*BII.first);
1708   }
1709 
1710   // Compute live range for live mask
1711   if (LiveMaskReg != Exec)
1712     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1713 
1714   // Physical registers like SCC aren't tracked by default anyway, so just
1715   // removing the ranges we computed is the simplest option for maintaining
1716   // the analysis results.
1717   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1718 
1719   // If we performed any kills then EXEC changed; discard its cached live ranges.
1720   if (!KillInstrs.empty())
1721     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1722 
1723   return true;
1724 }
1725