1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow: an inactive lane enabled by strict WQM/WWM will always be
/// enabled irrespective of control flow decisions. Conversely, in non-strict
/// WQM inactive lanes may take part in control flow decisions.
17 ///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
21 ///
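/// As an illustrative (not exhaustive) sketch, a pixel shader fragment such as
///
///   %v = IMAGE_SAMPLE ...      ; needs WQM to compute derivatives
///   BUFFER_STORE_DWORD %v ...  ; side effect, must not run on helper lanes
///
/// is rewritten so that EXEC is restricted to the live mask around the store.
///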
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
/// to enter WQM at the top of the function, and surrounds blocks of Exact
/// instructions with
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
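// Execution-state flags. Note that these are bitmask values; combinations
// such as (StateExact | StateWQM) are used below to mean "either state is
// acceptable".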
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
struct InstrInfo {
  char Needs = 0;        // States required by this instruction.
  char Disabled = 0;     // States that must not be applied to it.
  char OutNeeds = 0;     // States required immediately after this instruction.
  char MarkedStates = 0; // All states ever requested, including disabled ones.
};
133 
struct BlockInfo {
  char Needs = 0;             // States required within the block.
  char InNeeds = 0;           // States required on entry to the block.
  char OutNeeds = 0;          // States required on exit from the block.
  char InitialState = 0;      // Execution state at the start of the block.
  bool NeedsLowering = false; // Block contains instructions to be lowered.
};
141 
142 struct WorkItem {
143   MachineBasicBlock *MBB = nullptr;
144   MachineInstr *MI = nullptr;
145 
146   WorkItem() = default;
147   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
148   WorkItem(MachineInstr *MI) : MI(MI) {}
149 };
150 
151 class SIWholeQuadMode : public MachineFunctionPass {
152 private:
153   const SIInstrInfo *TII;
154   const SIRegisterInfo *TRI;
155   const GCNSubtarget *ST;
156   MachineRegisterInfo *MRI;
157   LiveIntervals *LIS;
158   MachineDominatorTree *MDT;
159   MachinePostDominatorTree *PDT;
160 
161   unsigned AndOpc;
162   unsigned AndTermOpc;
163   unsigned AndN2Opc;
164   unsigned XorOpc;
165   unsigned AndSaveExecOpc;
166   unsigned AndSaveExecTermOpc;
167   unsigned WQMOpc;
168   Register Exec;
169   Register LiveMaskReg;
170 
171   DenseMap<const MachineInstr *, InstrInfo> Instructions;
172   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
173 
174   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
175   DenseMap<const MachineInstr *, char> StateTransition;
176 
177   SmallVector<MachineInstr *, 2> LiveMaskQueries;
178   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
179   SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
180   SmallVector<MachineInstr *, 4> KillInstrs;
181   SmallVector<MachineInstr *, 4> InitExecInstrs;
182   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
183 
184   void printInfo();
185 
186   void markInstruction(MachineInstr &MI, char Flag,
187                        std::vector<WorkItem> &Worklist);
188   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
189                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
190   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
191                    std::vector<WorkItem> &Worklist);
192   void markInstructionUses(const MachineInstr &MI, char Flag,
193                            std::vector<WorkItem> &Worklist);
194   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
195   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
196   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
197   char analyzeFunction(MachineFunction &MF);
198 
199   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
200                                       MachineBasicBlock::iterator Before);
201   MachineBasicBlock::iterator
202   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
203                    MachineBasicBlock::iterator Last, bool PreferLast,
204                    bool SaveSCC);
205   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206                Register SaveWQM);
207   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208              Register SavedWQM);
209   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
210                     Register SaveOrig, char StrictStateNeeded);
211   void fromStrictMode(MachineBasicBlock &MBB,
212                       MachineBasicBlock::iterator Before, Register SavedOrig,
213                       char NonStrictState, char CurrentStrictState);
214 
215   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
216 
217   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
218                             bool IsWQM);
219   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
220 
221   void lowerBlock(MachineBasicBlock &MBB);
222   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
223 
224   bool lowerLiveMaskQueries();
225   bool lowerCopyInstrs();
226   bool lowerKillInstrs(bool IsWQM);
227   void lowerInitExec(MachineInstr &MI);
228   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
229                                                   bool &Changed);
230 
231 public:
232   static char ID;
233 
  SIWholeQuadMode() : MachineFunctionPass(ID) {}
236 
237   bool runOnMachineFunction(MachineFunction &MF) override;
238 
239   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
240 
241   void getAnalysisUsage(AnalysisUsage &AU) const override {
242     AU.addRequired<LiveIntervalsWrapperPass>();
243     AU.addPreserved<SlotIndexesWrapperPass>();
244     AU.addPreserved<LiveIntervalsWrapperPass>();
245     AU.addPreserved<MachineDominatorTreeWrapperPass>();
246     AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
247     MachineFunctionPass::getAnalysisUsage(AU);
248   }
249 
250   MachineFunctionProperties getClearedProperties() const override {
251     return MachineFunctionProperties().set(
252         MachineFunctionProperties::Property::IsSSA);
253   }
254 };
255 
256 } // end anonymous namespace
257 
258 char SIWholeQuadMode::ID = 0;
259 
260 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
261                       false)
262 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
263 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
264 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
265 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
266                     false)
267 
268 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
269 
270 FunctionPass *llvm::createSIWholeQuadModePass() {
271   return new SIWholeQuadMode;
272 }
273 
274 #ifndef NDEBUG
275 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
276   for (const auto &BII : Blocks) {
277     dbgs() << "\n"
278            << printMBBReference(*BII.first) << ":\n"
279            << "  InNeeds = " << PrintState(BII.second.InNeeds)
280            << ", Needs = " << PrintState(BII.second.Needs)
281            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
282 
283     for (const MachineInstr &MI : *BII.first) {
284       auto III = Instructions.find(&MI);
285       if (III != Instructions.end()) {
286         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
287                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
288       }
289     }
290   }
291 }
292 #endif
293 
294 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
295                                       std::vector<WorkItem> &Worklist) {
296   InstrInfo &II = Instructions[&MI];
297 
298   assert(!(Flag & StateExact) && Flag != 0);
299 
300   // Capture all states requested in marking including disabled ones.
301   II.MarkedStates |= Flag;
302 
303   // Remove any disabled states from the flag. The user that required it gets
304   // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
306   // ignoring the request for WQM is correct as per the relevant specs.
307   Flag &= ~II.Disabled;
308 
309   // Ignore if the flag is already encompassed by the existing needs, or we
310   // just disabled everything.
311   if ((II.Needs & Flag) == Flag)
312     return;
313 
314   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
315   II.Needs |= Flag;
316   Worklist.emplace_back(&MI);
317 }
318 
319 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
320 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
321                                Register Reg, unsigned SubReg, char Flag,
322                                std::vector<WorkItem> &Worklist) {
323   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
324 
325   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
326   const VNInfo *Value = UseLRQ.valueIn();
327   if (!Value)
328     return;
329 
330   // Note: this code assumes that lane masks on AMDGPU completely
331   // cover registers.
332   const LaneBitmask UseLanes =
333       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
334              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
335                                 : LaneBitmask::getNone());
336 
337   // Perform a depth-first iteration of the LiveRange graph marking defs.
338   // Stop processing of a given branch when all use lanes have been defined.
339   // The first definition stops processing for a physical register.
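  //
  // Illustrative example (hypothetical MIR): for
  //   %a.sub0 = ...
  //   %a.sub1 = ...
  //   ...     = use %a
  // both partial definitions are marked, and the walk along this branch stops
  // once the accumulated DefinedLanes cover all of UseLanes.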
340   struct PhiEntry {
341     const VNInfo *Phi;
342     unsigned PredIdx;
343     LaneBitmask DefinedLanes;
344 
345     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
346         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
347   };
348   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
349   SmallVector<PhiEntry, 2> PhiStack;
350   SmallSet<VisitKey, 4> Visited;
351   LaneBitmask DefinedLanes;
352   unsigned NextPredIdx = 0; // Only used for processing phi nodes
353   do {
354     const VNInfo *NextValue = nullptr;
355     const VisitKey Key(Value, DefinedLanes);
356 
357     if (Visited.insert(Key).second) {
      // On the first visit to a phi, start with its first predecessor
359       NextPredIdx = 0;
360     }
361 
362     if (Value->isPHIDef()) {
363       // Each predecessor node in the phi must be processed as a subgraph
364       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
365       assert(MBB && "Phi-def has no defining MBB");
366 
367       // Find next predecessor to process
368       unsigned Idx = NextPredIdx;
369       auto PI = MBB->pred_begin() + Idx;
370       auto PE = MBB->pred_end();
371       for (; PI != PE && !NextValue; ++PI, ++Idx) {
372         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
373           if (!Visited.count(VisitKey(VN, DefinedLanes)))
374             NextValue = VN;
375         }
376       }
377 
      // If there are more predecessors to process, add the phi to the stack
379       if (PI != PE)
380         PhiStack.emplace_back(Value, Idx, DefinedLanes);
381     } else {
382       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
383       assert(MI && "Def has no defining instruction");
384 
385       if (Reg.isVirtual()) {
386         // Iterate over all operands to find relevant definitions
387         bool HasDef = false;
388         for (const MachineOperand &Op : MI->all_defs()) {
389           if (Op.getReg() != Reg)
390             continue;
391 
392           // Compute lanes defined and overlap with use
393           LaneBitmask OpLanes =
394               Op.isUndef() ? LaneBitmask::getAll()
395                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
396           LaneBitmask Overlap = (UseLanes & OpLanes);
397 
          // Record whether this instruction defined any lanes of the use
399           HasDef |= Overlap.any();
400 
401           // Mark any lanes defined
402           DefinedLanes |= OpLanes;
403         }
404 
405         // Check if all lanes of use have been defined
406         if ((DefinedLanes & UseLanes) != UseLanes) {
407           // Definition not complete; need to process input value
408           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
409           if (const VNInfo *VN = LRQ.valueIn()) {
410             if (!Visited.count(VisitKey(VN, DefinedLanes)))
411               NextValue = VN;
412           }
413         }
414 
415         // Only mark the instruction if it defines some part of the use
416         if (HasDef)
417           markInstruction(*MI, Flag, Worklist);
418       } else {
419         // For physical registers simply mark the defining instruction
420         markInstruction(*MI, Flag, Worklist);
421       }
422     }
423 
424     if (!NextValue && !PhiStack.empty()) {
425       // Reach end of chain; revert to processing last phi
426       PhiEntry &Entry = PhiStack.back();
427       NextValue = Entry.Phi;
428       NextPredIdx = Entry.PredIdx;
429       DefinedLanes = Entry.DefinedLanes;
430       PhiStack.pop_back();
431     }
432 
433     Value = NextValue;
434   } while (Value);
435 }
436 
437 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
438                                   const MachineOperand &Op, char Flag,
439                                   std::vector<WorkItem> &Worklist) {
440   assert(Op.isReg());
441   Register Reg = Op.getReg();
442 
443   // Ignore some hardware registers
444   switch (Reg) {
445   case AMDGPU::EXEC:
446   case AMDGPU::EXEC_LO:
447     return;
448   default:
449     break;
450   }
451 
452   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
453                     << " for " << MI);
454   if (Reg.isVirtual()) {
455     LiveRange &LR = LIS->getInterval(Reg);
456     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
457   } else {
458     // Handle physical registers that we need to track; this is mostly relevant
459     // for VCC, which can appear as the (implicit) input of a uniform branch,
460     // e.g. when a loop counter is stored in a VGPR.
461     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
462       LiveRange &LR = LIS->getRegUnit(Unit);
463       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
464       if (Value)
465         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
466     }
467   }
468 }
469 
470 /// Mark all instructions defining the uses in \p MI with \p Flag.
471 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
472                                           std::vector<WorkItem> &Worklist) {
473   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
474                     << MI);
475 
476   for (const MachineOperand &Use : MI.all_uses())
477     markOperand(MI, Use, Flag, Worklist);
478 }
479 
480 // Scan instructions to determine which ones require an Exact execmask and
481 // which ones seed WQM requirements.
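// Roughly (see the body below for the exact rules): instructions that
// implicitly use derivatives (e.g. image sampling) seed WQM, DisableWQM
// instructions (e.g. stores and atomics) force Exact, and the
// STRICT_WWM/STRICT_WQM intrinsics and LDS param/direct loads seed the
// strict states.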
482 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
483                                        std::vector<WorkItem> &Worklist) {
484   char GlobalFlags = 0;
485   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
486   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
487   bool HasImplicitDerivatives =
488       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
489 
490   // We need to visit the basic blocks in reverse post-order so that we visit
491   // defs before uses, in particular so that we don't accidentally mark an
492   // instruction as needing e.g. WQM before visiting it and realizing it needs
493   // WQM disabled.
494   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
495   for (MachineBasicBlock *MBB : RPOT) {
496     BlockInfo &BBI = Blocks[MBB];
497 
498     for (MachineInstr &MI : *MBB) {
499       InstrInfo &III = Instructions[&MI];
500       unsigned Opcode = MI.getOpcode();
501       char Flags = 0;
502 
503       if (TII->isWQM(Opcode)) {
        // If LOD is not supported, WQM is not needed.
505         // Only generate implicit WQM if implicit derivatives are required.
506         // This avoids inserting unintended WQM if a shader type without
507         // implicit derivatives uses an image sampling instruction.
508         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
509           // Sampling instructions don't need to produce results for all pixels
510           // in a quad, they just require all inputs of a quad to have been
511           // computed for derivatives.
512           markInstructionUses(MI, StateWQM, Worklist);
513           GlobalFlags |= StateWQM;
514         }
515       } else if (Opcode == AMDGPU::WQM) {
516         // The WQM intrinsic requires its output to have all the helper lanes
517         // correct, so we need it to be in WQM.
518         Flags = StateWQM;
519         LowerToCopyInstrs.insert(&MI);
520       } else if (Opcode == AMDGPU::SOFT_WQM) {
521         LowerToCopyInstrs.insert(&MI);
522         SoftWQMInstrs.push_back(&MI);
523       } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
527         markInstructionUses(MI, StateStrictWWM, Worklist);
528         GlobalFlags |= StateStrictWWM;
529         LowerToMovInstrs.push_back(&MI);
530       } else if (Opcode == AMDGPU::STRICT_WQM ||
531                  TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
535         markInstructionUses(MI, StateStrictWQM, Worklist);
536         GlobalFlags |= StateStrictWQM;
537 
538         if (Opcode == AMDGPU::STRICT_WQM) {
539           LowerToMovInstrs.push_back(&MI);
540         } else {
          // A dual source blend export acts as an implicit strict-wqm: its
          // sources need to be shuffled in strict wqm, but the export itself
          // needs to run in exact mode.
544           BBI.Needs |= StateExact;
545           if (!(BBI.InNeeds & StateExact)) {
546             BBI.InNeeds |= StateExact;
547             Worklist.emplace_back(MBB);
548           }
549           GlobalFlags |= StateExact;
550           III.Disabled = StateWQM | StateStrict;
551         }
552       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
553                  Opcode == AMDGPU::DS_PARAM_LOAD ||
554                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
555                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these as StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
558         III.Needs |= StateStrictWQM;
559         GlobalFlags |= StateStrictWQM;
560       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
561         // Disable strict states; StrictWQM will be added as required later.
562         III.Disabled = StateStrict;
563         MachineOperand &Inactive = MI.getOperand(4);
564         if (Inactive.isReg()) {
565           if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
566             LowerToCopyInstrs.insert(&MI);
567           else
568             markOperand(MI, Inactive, StateStrictWWM, Worklist);
569         }
570         SetInactiveInstrs.push_back(&MI);
571         BBI.NeedsLowering = true;
572       } else if (TII->isDisableWQM(MI)) {
573         BBI.Needs |= StateExact;
574         if (!(BBI.InNeeds & StateExact)) {
575           BBI.InNeeds |= StateExact;
576           Worklist.emplace_back(MBB);
577         }
578         GlobalFlags |= StateExact;
579         III.Disabled = StateWQM | StateStrict;
580       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
581                  Opcode == AMDGPU::SI_LIVE_MASK) {
582         LiveMaskQueries.push_back(&MI);
583       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
584                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
585                  Opcode == AMDGPU::SI_DEMOTE_I1) {
586         KillInstrs.push_back(&MI);
587         BBI.NeedsLowering = true;
588       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
589                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
590         InitExecInstrs.push_back(&MI);
591       } else if (WQMOutputs) {
592         // The function is in machine SSA form, which means that physical
593         // VGPRs correspond to shader inputs and outputs. Inputs are
594         // only used, outputs are only defined.
595         // FIXME: is this still valid?
596         for (const MachineOperand &MO : MI.defs()) {
597           Register Reg = MO.getReg();
598           if (Reg.isPhysical() &&
599               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
600             Flags = StateWQM;
601             break;
602           }
603         }
604       }
605 
606       if (Flags) {
607         markInstruction(MI, Flags, Worklist);
608         GlobalFlags |= Flags;
609       }
610     }
611   }
612 
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
614   // ever used anywhere in the function. This implements the corresponding
615   // semantics of @llvm.amdgcn.set.inactive.
616   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
617   if (GlobalFlags & StateWQM) {
618     for (MachineInstr *MI : SetInactiveInstrs)
619       markInstruction(*MI, StateWQM, Worklist);
620     for (MachineInstr *MI : SoftWQMInstrs)
621       markInstruction(*MI, StateWQM, Worklist);
622   }
623 
624   return GlobalFlags;
625 }
626 
627 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
628                                            std::vector<WorkItem>& Worklist) {
629   MachineBasicBlock *MBB = MI.getParent();
  // Take a copy: marking uses below may add entries to the Instructions map
  // and invalidate references into it.
  InstrInfo II = Instructions[&MI];
631   BlockInfo &BI = Blocks[MBB];
632 
633   // Control flow-type instructions and stores to temporary memory that are
634   // followed by WQM computations must themselves be in WQM.
635   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
636       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
637     Instructions[&MI].Needs = StateWQM;
638     II.Needs = StateWQM;
639   }
640 
641   // Propagate to block level
642   if (II.Needs & StateWQM) {
643     BI.Needs |= StateWQM;
644     if (!(BI.InNeeds & StateWQM)) {
645       BI.InNeeds |= StateWQM;
646       Worklist.emplace_back(MBB);
647     }
648   }
649 
650   // Propagate backwards within block
651   if (MachineInstr *PrevMI = MI.getPrevNode()) {
652     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
653     if (!PrevMI->isPHI()) {
654       InstrInfo &PrevII = Instructions[PrevMI];
655       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
656         PrevII.OutNeeds |= InNeeds;
657         Worklist.emplace_back(PrevMI);
658       }
659     }
660   }
661 
662   // Propagate WQM flag to instruction inputs
663   assert(!(II.Needs & StateExact));
664 
665   if (II.Needs != 0)
666     markInstructionUses(MI, II.Needs, Worklist);
667 
668   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
669   // not require any WQM transitions.
670   if (II.Needs & StateStrictWWM)
671     BI.Needs |= StateStrictWWM;
672   if (II.Needs & StateStrictWQM)
673     BI.Needs |= StateStrictWQM;
674 }
675 
676 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
677                                      std::vector<WorkItem>& Worklist) {
678   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
679 
680   // Propagate through instructions
681   if (!MBB.empty()) {
682     MachineInstr *LastMI = &*MBB.rbegin();
683     InstrInfo &LastII = Instructions[LastMI];
684     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
685       LastII.OutNeeds |= BI.OutNeeds;
686       Worklist.emplace_back(LastMI);
687     }
688   }
689 
690   // Predecessor blocks must provide for our WQM/Exact needs.
691   for (MachineBasicBlock *Pred : MBB.predecessors()) {
692     BlockInfo &PredBI = Blocks[Pred];
693     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
694       continue;
695 
696     PredBI.OutNeeds |= BI.InNeeds;
697     PredBI.InNeeds |= BI.InNeeds;
698     Worklist.emplace_back(Pred);
699   }
700 
701   // All successors must be prepared to accept the same set of WQM/Exact data.
702   for (MachineBasicBlock *Succ : MBB.successors()) {
703     BlockInfo &SuccBI = Blocks[Succ];
704     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
705       continue;
706 
707     SuccBI.InNeeds |= BI.OutNeeds;
708     Worklist.emplace_back(Succ);
709   }
710 }
711 
712 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
713   std::vector<WorkItem> Worklist;
714   char GlobalFlags = scanInstructions(MF, Worklist);
715 
716   while (!Worklist.empty()) {
717     WorkItem WI = Worklist.back();
718     Worklist.pop_back();
719 
720     if (WI.MI)
721       propagateInstruction(*WI.MI, Worklist);
722     else
723       propagateBlock(*WI.MBB, Worklist);
724   }
725 
726   return GlobalFlags;
727 }
728 
729 MachineBasicBlock::iterator
730 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
731                          MachineBasicBlock::iterator Before) {
732   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
733 
734   MachineInstr *Save =
735       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
736           .addReg(AMDGPU::SCC);
737   MachineInstr *Restore =
738       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
739           .addReg(SaveReg);
740 
741   LIS->InsertMachineInstrInMaps(*Save);
742   LIS->InsertMachineInstrInMaps(*Restore);
743   LIS->createAndComputeVirtRegInterval(SaveReg);
744 
745   return Restore;
746 }
747 
748 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
749                                                MachineInstr *TermMI) {
750   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
751                     << *TermMI << "\n");
752 
753   MachineBasicBlock *SplitBB =
754       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
755 
756   // Convert last instruction in block to a terminator.
757   // Note: this only covers the expected patterns
758   unsigned NewOpcode = 0;
759   switch (TermMI->getOpcode()) {
760   case AMDGPU::S_AND_B32:
761     NewOpcode = AMDGPU::S_AND_B32_term;
762     break;
763   case AMDGPU::S_AND_B64:
764     NewOpcode = AMDGPU::S_AND_B64_term;
765     break;
766   case AMDGPU::S_MOV_B32:
767     NewOpcode = AMDGPU::S_MOV_B32_term;
768     break;
769   case AMDGPU::S_MOV_B64:
770     NewOpcode = AMDGPU::S_MOV_B64_term;
771     break;
772   default:
773     break;
774   }
775   if (NewOpcode)
776     TermMI->setDesc(TII->get(NewOpcode));
777 
778   if (SplitBB != BB) {
779     // Update dominator trees
780     using DomTreeT = DomTreeBase<MachineBasicBlock>;
781     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
782     for (MachineBasicBlock *Succ : SplitBB->successors()) {
783       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
784       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
785     }
786     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
787     if (MDT)
788       MDT->getBase().applyUpdates(DTUpdates);
789     if (PDT)
790       PDT->applyUpdates(DTUpdates);
791 
792     // Link blocks
793     MachineInstr *MI =
794         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
795             .addMBB(SplitBB);
796     LIS->InsertMachineInstrInMaps(*MI);
797   }
798 
799   return SplitBB;
800 }
801 
802 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
803                                             MachineInstr &MI) {
804   assert(LiveMaskReg.isVirtual());
805 
806   const DebugLoc &DL = MI.getDebugLoc();
807   unsigned Opcode = 0;
808 
809   assert(MI.getOperand(0).isReg());
810 
811   // Comparison is for live lanes; however here we compute the inverse
812   // (killed lanes).  This is because VCMP will always generate 0 bits
813   // for inactive lanes so a mask of live lanes would not be correct
814   // inside control flow.
815   // Invert the comparison by swapping the operands and adjusting
816   // the comparison codes.
817 
818   switch (MI.getOperand(2).getImm()) {
819   case ISD::SETUEQ:
820     Opcode = AMDGPU::V_CMP_LG_F32_e64;
821     break;
822   case ISD::SETUGT:
823     Opcode = AMDGPU::V_CMP_GE_F32_e64;
824     break;
825   case ISD::SETUGE:
826     Opcode = AMDGPU::V_CMP_GT_F32_e64;
827     break;
828   case ISD::SETULT:
829     Opcode = AMDGPU::V_CMP_LE_F32_e64;
830     break;
831   case ISD::SETULE:
832     Opcode = AMDGPU::V_CMP_LT_F32_e64;
833     break;
834   case ISD::SETUNE:
835     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
836     break;
837   case ISD::SETO:
838     Opcode = AMDGPU::V_CMP_O_F32_e64;
839     break;
840   case ISD::SETUO:
841     Opcode = AMDGPU::V_CMP_U_F32_e64;
842     break;
843   case ISD::SETOEQ:
844   case ISD::SETEQ:
845     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
846     break;
847   case ISD::SETOGT:
848   case ISD::SETGT:
849     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
850     break;
851   case ISD::SETOGE:
852   case ISD::SETGE:
853     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
854     break;
855   case ISD::SETOLT:
856   case ISD::SETLT:
857     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
858     break;
859   case ISD::SETOLE:
860   case ISD::SETLE:
861     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
862     break;
863   case ISD::SETONE:
864   case ISD::SETNE:
865     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
866     break;
867   default:
868     llvm_unreachable("invalid ISD:SET cond code");
869   }
870 
871   // Pick opcode based on comparison type.
872   MachineInstr *VcmpMI;
873   const MachineOperand &Op0 = MI.getOperand(0);
874   const MachineOperand &Op1 = MI.getOperand(1);
875 
876   // VCC represents lanes killed.
877   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
878 
879   if (TRI->isVGPR(*MRI, Op0.getReg())) {
880     Opcode = AMDGPU::getVOPe32(Opcode);
881     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
882   } else {
883     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
884                  .addReg(VCC, RegState::Define)
885                  .addImm(0) // src0 modifiers
886                  .add(Op1)
887                  .addImm(0) // src1 modifiers
888                  .add(Op0)
889                  .addImm(0); // omod
890   }
891 
892   MachineInstr *MaskUpdateMI =
893       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
894           .addReg(LiveMaskReg)
895           .addReg(VCC);
896 
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
899   MachineInstr *EarlyTermMI =
900       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
901 
902   MachineInstr *ExecMaskMI =
903       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
904 
905   assert(MBB.succ_size() == 1);
906   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
907                               .addMBB(*MBB.succ_begin());
908 
909   // Update live intervals
910   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
911   MBB.remove(&MI);
912 
913   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
914   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
915   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
916   LIS->InsertMachineInstrInMaps(*NewTerm);
917 
918   return NewTerm;
919 }
920 
921 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
922                                            MachineInstr &MI, bool IsWQM) {
923   assert(LiveMaskReg.isVirtual());
924 
925   const DebugLoc &DL = MI.getDebugLoc();
926   MachineInstr *MaskUpdateMI = nullptr;
927 
928   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
929   const MachineOperand &Op = MI.getOperand(0);
930   int64_t KillVal = MI.getOperand(1).getImm();
931   MachineInstr *ComputeKilledMaskMI = nullptr;
932   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
933   Register TmpReg;
934 
935   // Is this a static or dynamic kill?
936   if (Op.isImm()) {
937     if (Op.getImm() == KillVal) {
938       // Static: all active lanes are killed
939       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
940                          .addReg(LiveMaskReg)
941                          .addReg(Exec);
942     } else {
943       // Static: kill does nothing
944       MachineInstr *NewTerm = nullptr;
945       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
946         LIS->RemoveMachineInstrFromMaps(MI);
947       } else {
948         assert(MBB.succ_size() == 1);
949         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
950                       .addMBB(*MBB.succ_begin());
951         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
952       }
953       MBB.remove(&MI);
954       return NewTerm;
955     }
956   } else {
957     if (!KillVal) {
958       // Op represents live lanes after kill,
959       // so exec mask needs to be factored in.
960       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
961       ComputeKilledMaskMI =
962           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
963       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
964                          .addReg(LiveMaskReg)
965                          .addReg(TmpReg);
966     } else {
967       // Op represents lanes to kill
968       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
969                          .addReg(LiveMaskReg)
970                          .add(Op);
971     }
972   }
973 
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
976   MachineInstr *EarlyTermMI =
977       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
978 
  // If we got this far, some lanes are still live; update EXEC to deactivate
  // lanes as appropriate.
981   MachineInstr *NewTerm;
982   MachineInstr *WQMMaskMI = nullptr;
983   Register LiveMaskWQM;
984   if (IsDemote) {
985     // Demote - deactivate quads with only helper lanes
986     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
987     WQMMaskMI =
988         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
989     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
990                   .addReg(Exec)
991                   .addReg(LiveMaskWQM);
992   } else {
993     // Kill - deactivate lanes no longer in live mask
994     if (Op.isImm()) {
995       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
996       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
997     } else if (!IsWQM) {
998       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
999                     .addReg(Exec)
1000                     .addReg(LiveMaskReg);
1001     } else {
1002       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1003       NewTerm =
1004           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1005     }
1006   }
1007 
1008   // Update live intervals
1009   LIS->RemoveMachineInstrFromMaps(MI);
1010   MBB.remove(&MI);
1011   assert(EarlyTermMI);
1012   assert(MaskUpdateMI);
1013   assert(NewTerm);
1014   if (ComputeKilledMaskMI)
1015     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1016   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1017   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1018   if (WQMMaskMI)
1019     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1020   LIS->InsertMachineInstrInMaps(*NewTerm);
1021 
1022   if (CndReg) {
1023     LIS->removeInterval(CndReg);
1024     LIS->createAndComputeVirtRegInterval(CndReg);
1025   }
1026   if (TmpReg)
1027     LIS->createAndComputeVirtRegInterval(TmpReg);
1028   if (LiveMaskWQM)
1029     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1030 
1031   return NewTerm;
1032 }
1033 
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1037 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1038   auto BII = Blocks.find(&MBB);
1039   if (BII == Blocks.end())
1040     return;
1041 
1042   const BlockInfo &BI = BII->second;
1043   if (!BI.NeedsLowering)
1044     return;
1045 
1046   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1047 
1048   SmallVector<MachineInstr *, 4> SplitPoints;
1049   Register ActiveLanesReg = 0;
1050   char State = BI.InitialState;
1051 
1052   for (MachineInstr &MI : llvm::make_early_inc_range(
1053            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1054     if (StateTransition.count(&MI))
1055       State = StateTransition[&MI];
1056 
1057     MachineInstr *SplitPoint = nullptr;
1058     switch (MI.getOpcode()) {
1059     case AMDGPU::SI_DEMOTE_I1:
1060     case AMDGPU::SI_KILL_I1_TERMINATOR:
1061       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1062       break;
1063     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1064       SplitPoint = lowerKillF32(MBB, MI);
1065       break;
1066     case AMDGPU::ENTER_STRICT_WWM:
1067       ActiveLanesReg = MI.getOperand(0).getReg();
1068       break;
1069     case AMDGPU::EXIT_STRICT_WWM:
1070       ActiveLanesReg = 0;
1071       break;
1072     case AMDGPU::V_SET_INACTIVE_B32:
1073       if (ActiveLanesReg) {
1074         LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
1075         MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
1076         MI.getOperand(5).setReg(ActiveLanesReg);
1077         LIS->shrinkToUses(&LI);
1078       } else {
1079         assert(State == StateExact || State == StateWQM);
1080       }
1081       break;
1082     default:
1083       break;
1084     }
1085     if (SplitPoint)
1086       SplitPoints.push_back(SplitPoint);
1087   }
1088 
1089   // Perform splitting after instruction scan to simplify iteration.
1090   if (!SplitPoints.empty()) {
1091     MachineBasicBlock *BB = &MBB;
1092     for (MachineInstr *MI : SplitPoints) {
1093       BB = splitBlock(BB, MI);
1094     }
1095   }
1096 }
1097 
1098 // Return an iterator in the (inclusive) range [First, Last] at which
1099 // instructions can be safely inserted, keeping in mind that some of the
1100 // instructions we want to add necessarily clobber SCC.
1101 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1102     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1103     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1104   if (!SaveSCC)
1105     return PreferLast ? Last : First;
1106 
1107   LiveRange &LR =
1108       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1109   auto MBBE = MBB.end();
1110   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1111                                      : LIS->getMBBEndIdx(&MBB);
1112   SlotIndex LastIdx =
1113       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1114   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1115   const LiveRange::Segment *S;
1116 
1117   for (;;) {
1118     S = LR.getSegmentContaining(Idx);
1119     if (!S)
1120       break;
1121 
1122     if (PreferLast) {
1123       SlotIndex Next = S->start.getBaseIndex();
1124       if (Next < FirstIdx)
1125         break;
1126       Idx = Next;
1127     } else {
1128       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1129       assert(EndMI && "Segment does not end on valid instruction");
1130       auto NextI = std::next(EndMI->getIterator());
1131       if (NextI == MBB.end())
1132         break;
1133       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1134       if (Next > LastIdx)
1135         break;
1136       Idx = Next;
1137     }
1138   }
1139 
1140   MachineBasicBlock::iterator MBBI;
1141 
1142   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1143     MBBI = MI;
1144   else {
1145     assert(Idx == LIS->getMBBEndIdx(&MBB));
1146     MBBI = MBB.end();
1147   }
1148 
1149   // Move insertion point past any operations modifying EXEC.
1150   // This assumes that the value of SCC defined by any of these operations
1151   // does not need to be preserved.
1152   while (MBBI != Last) {
1153     bool IsExecDef = false;
1154     for (const MachineOperand &MO : MBBI->all_defs()) {
1155       IsExecDef |=
1156           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1157     }
1158     if (!IsExecDef)
1159       break;
1160     MBBI++;
1161     S = nullptr;
1162   }
1163 
1164   if (S)
1165     MBBI = saveSCC(MBB, MBBI);
1166 
1167   return MBBI;
1168 }
1169 
1170 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1171                               MachineBasicBlock::iterator Before,
1172                               Register SaveWQM) {
1173   assert(LiveMaskReg.isVirtual());
1174 
1175   bool IsTerminator = Before == MBB.end();
1176   if (!IsTerminator) {
1177     auto FirstTerm = MBB.getFirstTerminator();
1178     if (FirstTerm != MBB.end()) {
1179       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1180       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1181       IsTerminator = BeforeIdx > FirstTermIdx;
1182     }
1183   }
1184 
1185   MachineInstr *MI;
1186 
1187   if (SaveWQM) {
1188     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1189     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1190              .addReg(LiveMaskReg);
1191   } else {
1192     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1193     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1194              .addReg(Exec)
1195              .addReg(LiveMaskReg);
1196   }
1197 
1198   LIS->InsertMachineInstrInMaps(*MI);
1199   StateTransition[MI] = StateExact;
1200 }
1201 
1202 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1203                             MachineBasicBlock::iterator Before,
1204                             Register SavedWQM) {
1205   MachineInstr *MI;
1206 
1207   if (SavedWQM) {
1208     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1209              .addReg(SavedWQM);
1210   } else {
1211     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1212   }
1213 
1214   LIS->InsertMachineInstrInMaps(*MI);
1215   StateTransition[MI] = StateWQM;
1216 }
1217 
1218 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1219                                    MachineBasicBlock::iterator Before,
1220                                    Register SaveOrig, char StrictStateNeeded) {
1221   MachineInstr *MI;
1222   assert(SaveOrig);
1223   assert(StrictStateNeeded == StateStrictWWM ||
1224          StrictStateNeeded == StateStrictWQM);
1225 
1226   if (StrictStateNeeded == StateStrictWWM) {
1227     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1228                  SaveOrig)
1229              .addImm(-1);
1230   } else {
1231     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1232                  SaveOrig)
1233              .addImm(-1);
1234   }
1235   LIS->InsertMachineInstrInMaps(*MI);
1236   StateTransition[MI] = StrictStateNeeded;
1237 }
1238 
1239 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1240                                      MachineBasicBlock::iterator Before,
1241                                      Register SavedOrig, char NonStrictState,
1242                                      char CurrentStrictState) {
1243   MachineInstr *MI;
1244 
1245   assert(SavedOrig);
1246   assert(CurrentStrictState == StateStrictWWM ||
1247          CurrentStrictState == StateStrictWQM);
1248 
1249   if (CurrentStrictState == StateStrictWWM) {
1250     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1251                  Exec)
1252              .addReg(SavedOrig);
1253   } else {
1254     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1255                  Exec)
1256              .addReg(SavedOrig);
1257   }
1258   LIS->InsertMachineInstrInMaps(*MI);
1259   StateTransition[MI] = NonStrictState;
1260 }
1261 
1262 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1263   auto BII = Blocks.find(&MBB);
1264   if (BII == Blocks.end())
1265     return;
1266 
1267   BlockInfo &BI = BII->second;
1268 
1269   // This is a non-entry block that is WQM throughout, so no need to do
1270   // anything.
1271   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1272     BI.InitialState = StateWQM;
1273     return;
1274   }
1275 
1276   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1277                     << ":\n");
1278 
1279   Register SavedWQMReg;
1280   Register SavedNonStrictReg;
1281   bool WQMFromExec = IsEntry;
1282   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1283   char NonStrictState = 0;
1284   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1285 
1286   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1287   if (IsEntry) {
1288     // Skip the instruction that saves LiveMask
1289     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1290         II->getOperand(1).getReg() == TRI->getExec())
1291       ++II;
1292   }
1293 
1294   // This stores the first instruction where it's safe to switch from WQM to
1295   // Exact or vice versa.
1296   MachineBasicBlock::iterator FirstWQM = IE;
1297 
1298   // This stores the first instruction where it's safe to switch from Strict
1299   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1300   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1301   // be safe to switch to/from WQM as well.
1302   MachineBasicBlock::iterator FirstStrict = IE;
1303 
  // Record the initial state in the block information.
1305   BI.InitialState = State;
1306 
1307   for (;;) {
1308     MachineBasicBlock::iterator Next = II;
1309     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1310     char OutNeeds = 0;
1311 
1312     if (FirstWQM == IE)
1313       FirstWQM = II;
1314 
1315     if (FirstStrict == IE)
1316       FirstStrict = II;
1317 
1318     // First, figure out the allowed states (Needs) based on the propagated
1319     // flags.
1320     if (II != IE) {
1321       MachineInstr &MI = *II;
1322 
1323       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1324         auto III = Instructions.find(&MI);
1325         if (III != Instructions.end()) {
1326           if (III->second.Needs & StateStrictWWM)
1327             Needs = StateStrictWWM;
1328           else if (III->second.Needs & StateStrictWQM)
1329             Needs = StateStrictWQM;
1330           else if (III->second.Needs & StateWQM)
1331             Needs = StateWQM;
1332           else
1333             Needs &= ~III->second.Disabled;
1334           OutNeeds = III->second.OutNeeds;
1335         }
1336       } else {
1337         // If the instruction doesn't actually need a correct EXEC, then we can
1338         // safely leave Strict mode enabled.
1339         Needs = StateExact | StateWQM | StateStrict;
1340       }
1341 
1342       // Exact mode exit can occur in terminators, but must be before branches.
1343       if (MI.isBranch() && OutNeeds == StateExact)
1344         Needs = StateExact;
1345 
1346       ++Next;
1347     } else {
1348       // End of basic block
1349       if (BI.OutNeeds & StateWQM)
1350         Needs = StateWQM;
1351       else if (BI.OutNeeds == StateExact)
1352         Needs = StateExact;
1353       else
1354         Needs = StateWQM | StateExact;
1355     }
1356 
1357     // Now, transition if necessary.
1358     if (!(Needs & State)) {
1359       MachineBasicBlock::iterator First;
1360       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1361           State == StateStrictWQM || Needs == StateStrictWQM) {
1362         // We must switch to or from Strict mode.
1363         First = FirstStrict;
1364       } else {
1365         // We only need to switch to/from WQM, so we can use FirstWQM.
1366         First = FirstWQM;
1367       }
1368 
1369       // Whether we need to save SCC depends on start and end states.
1370       bool SaveSCC = false;
1371       switch (State) {
1372       case StateExact:
1373       case StateStrictWWM:
1374       case StateStrictWQM:
1375         // Exact/Strict -> Strict: save SCC
1376         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1377         // Exact/Strict -> Exact: no save
1378         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1379         break;
1380       case StateWQM:
1381         // WQM -> Exact/Strict: save SCC
1382         SaveSCC = !(Needs & StateWQM);
1383         break;
1384       default:
1385         llvm_unreachable("Unknown state");
1386         break;
1387       }
1388       char StartState = State & StateStrict ? NonStrictState : State;
1389       bool WQMToExact =
1390           StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1391       bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1392                         !(Needs & StateExact);
1393       bool PreferLast = Needs == StateWQM;
1394       // Exact regions in divergent control flow may run at EXEC=0, so try to
1395       // exclude instructions with unexpected effects from them.
1396       // FIXME: ideally we would branch over these when EXEC=0,
1397       // but this requires updating implicit values, live intervals and CFG.
1398       if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1399         for (MachineBasicBlock::iterator I = First; I != II; ++I) {
1400           if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
1401             PreferLast = WQMToExact;
1402             break;
1403           }
1404         }
1405       }
1406       MachineBasicBlock::iterator Before =
1407           prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
1408 
1409       if (State & StateStrict) {
1410         assert(State == StateStrictWWM || State == StateStrictWQM);
1411         assert(SavedNonStrictReg);
1412         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1413 
1414         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1415         SavedNonStrictReg = 0;
1416         State = NonStrictState;
1417       }
1418 
1419       if (Needs & StateStrict) {
1420         NonStrictState = State;
1421         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1422         assert(!SavedNonStrictReg);
1423         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1424 
1425         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1426         State = Needs;
1427       } else {
1428         if (WQMToExact) {
1429           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1430             assert(!SavedWQMReg);
1431             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1432           }
1433 
1434           toExact(MBB, Before, SavedWQMReg);
1435           State = StateExact;
1436         } else if (ExactToWQM) {
1437           assert(WQMFromExec == (SavedWQMReg == 0));
1438 
1439           toWQM(MBB, Before, SavedWQMReg);
1440 
1441           if (SavedWQMReg) {
1442             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1443             SavedWQMReg = 0;
1444           }
1445           State = StateWQM;
1446         } else {
1447           // We can get here if we transitioned from StrictWWM to a
1448           // non-StrictWWM state that already matches our needs, but we
1449           // shouldn't need to do anything.
1450           assert(Needs & State);
1451         }
1452       }
1453     }
1454 
1455     if (Needs != (StateExact | StateWQM | StateStrict)) {
1456       if (Needs != (StateExact | StateWQM))
1457         FirstWQM = IE;
1458       FirstStrict = IE;
1459     }
1460 
1461     if (II == IE)
1462       break;
1463 
1464     II = Next;
1465   }
1466   assert(!SavedWQMReg);
1467   assert(!SavedNonStrictReg);
1468 }
1469 
1470 bool SIWholeQuadMode::lowerLiveMaskQueries() {
1471   for (MachineInstr *MI : LiveMaskQueries) {
1472     const DebugLoc &DL = MI->getDebugLoc();
1473     Register Dest = MI->getOperand(0).getReg();
1474 
1475     MachineInstr *Copy =
1476         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1477             .addReg(LiveMaskReg);
1478 
1479     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1480     MI->eraseFromParent();
1481   }
1482   return !LiveMaskQueries.empty();
1483 }
1484 
1485 bool SIWholeQuadMode::lowerCopyInstrs() {
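       // Instructions marked for lowering to moves: VGPR destinations become
       // real (EXEC-dependent) V_MOVs, while SGPR destinations are simplified
       // to plain COPYs with the EXEC dependence removed.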
1486   for (MachineInstr *MI : LowerToMovInstrs) {
1487     assert(MI->getNumExplicitOperands() == 2);
1488 
1489     const Register Reg = MI->getOperand(0).getReg();
1490 
1491     const TargetRegisterClass *regClass =
1492         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1493     if (TRI->isVGPRClass(regClass)) {
1494       const unsigned MovOp = TII->getMovOpcode(regClass);
1495       MI->setDesc(TII->get(MovOp));
1496 
1497       // Check that it already implicitly depends on exec (like all VALU movs
1498       // should do).
1499       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1500         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1501       }));
1502     } else {
1503       // Remove early-clobber and exec dependency from simple SGPR copies.
1504       // This allows some to be eliminated during/post RA.
1505       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1506       if (MI->getOperand(0).isEarlyClobber()) {
1507         LIS->removeInterval(Reg);
1508         MI->getOperand(0).setIsEarlyClobber(false);
1509         LIS->createAndComputeVirtRegInterval(Reg);
1510       }
1511       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1512       while (Index >= 0) {
1513         MI->removeOperand(Index);
1514         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1515       }
1516       MI->setDesc(TII->get(AMDGPU::COPY));
1517       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1518     }
1519   }
1520   for (MachineInstr *MI : LowerToCopyInstrs) {
1521     LLVM_DEBUG(dbgs() << "simplify: " << *MI);
1522 
1523     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1524       assert(MI->getNumExplicitOperands() == 6);
1525 
1526       LiveInterval *RecomputeLI = nullptr;
1527       if (MI->getOperand(4).isReg())
1528         RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
1529 
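           // Keep only the destination and operand 2 (the value for active
           // lanes); the other explicit source operands are removed, and the
           // live interval of the old operand 4 register, if any, is shrunk.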
1530       MI->removeOperand(5);
1531       MI->removeOperand(4);
1532       MI->removeOperand(3);
1533       MI->removeOperand(1);
1534 
1535       if (RecomputeLI)
1536         LIS->shrinkToUses(RecomputeLI);
1537     } else {
1538       assert(MI->getNumExplicitOperands() == 2);
1539     }
1540 
1541     unsigned CopyOp = MI->getOperand(1).isReg()
1542                           ? (unsigned)AMDGPU::COPY
1543                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1544                                 *MRI, MI->getOperand(0)));
1545     MI->setDesc(TII->get(CopyOp));
1546     LLVM_DEBUG(dbgs() << " -> " << *MI);
1547   }
1548   return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1549 }
1550 
1551 bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1552   for (MachineInstr *MI : KillInstrs) {
1553     MachineBasicBlock *MBB = MI->getParent();
1554     MachineInstr *SplitPoint = nullptr;
1555     switch (MI->getOpcode()) {
1556     case AMDGPU::SI_DEMOTE_I1:
1557     case AMDGPU::SI_KILL_I1_TERMINATOR:
1558       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1559       break;
1560     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1561       SplitPoint = lowerKillF32(*MBB, *MI);
1562       break;
1563     }
1564     if (SplitPoint)
1565       splitBlock(MBB, SplitPoint);
1566   }
1567   return !KillInstrs.empty();
1568 }
1569 
1570 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1571   MachineBasicBlock *MBB = MI.getParent();
1572   bool IsWave32 = ST->isWave32();
1573 
1574   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1575     // This should be before all vector instructions.
1576     MachineInstr *InitMI =
1577         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1578                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1579                 Exec)
1580             .addImm(MI.getOperand(0).getImm());
1581     if (LIS) {
1582       LIS->RemoveMachineInstrFromMaps(MI);
1583       LIS->InsertMachineInstrInMaps(*InitMI);
1584     }
1585     MI.eraseFromParent();
1586     return;
1587   }
1588 
1589   // Extract the thread count from an SGPR input and set EXEC accordingly.
1590   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1591   //
1592   // S_BFE_U32 count, input, {shift, 7}
1593   // S_BFM_B64 exec, count, 0
1594   // S_CMP_EQ_U32 count, 64
1595   // S_CMOV_B64 exec, -1
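       // For example, count == 10 yields exec = (1 << 10) - 1 = 0x3FF. A
       // count of 64 wraps to 0 in the BFM operand and would produce an empty
       // mask, so the CMP/CMOV pair sets exec to all ones instead.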
1596   Register InputReg = MI.getOperand(0).getReg();
1597   MachineInstr *FirstMI = &*MBB->begin();
1598   if (InputReg.isVirtual()) {
1599     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1600     assert(DefInstr && DefInstr->isCopy());
1601     if (DefInstr->getParent() == MBB) {
1602       if (DefInstr != FirstMI) {
1603         // If `InputReg` is defined in the current block, its defining
1604         // instruction also has to be moved to the beginning of the block.
1605         DefInstr->removeFromParent();
1606         MBB->insert(FirstMI, DefInstr);
1607         if (LIS)
1608           LIS->handleMove(*DefInstr);
1609       } else {
1610         // The definition is already first; move the insertion point past it.
1611         FirstMI = &*std::next(FirstMI->getIterator());
1612       }
1613     }
1614   }
1615 
1616   // Insert instruction sequence at block beginning (before vector operations).
1617   const DebugLoc DL = MI.getDebugLoc();
1618   const unsigned WavefrontSize = ST->getWavefrontSize();
1619   const unsigned Mask = (WavefrontSize << 1) - 1;
1620   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
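       // S_BFE_U32's src1 immediate packs the bit offset in its low bits and
       // the field width in bits [22:16]; 0x70000 requests a 7-bit field,
       // wide enough to represent a count of 64.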
1621   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1622                    .addReg(InputReg)
1623                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1624   auto BfmMI =
1625       BuildMI(*MBB, FirstMI, DL,
1626               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1627           .addReg(CountReg)
1628           .addImm(0);
1629   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1630                    .addReg(CountReg, RegState::Kill)
1631                    .addImm(WavefrontSize);
1632   auto CmovMI =
1633       BuildMI(*MBB, FirstMI, DL,
1634               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1635               Exec)
1636           .addImm(-1);
1637 
1638   if (!LIS) {
1639     MI.eraseFromParent();
1640     return;
1641   }
1642 
1643   LIS->RemoveMachineInstrFromMaps(MI);
1644   MI.eraseFromParent();
1645 
1646   LIS->InsertMachineInstrInMaps(*BfeMI);
1647   LIS->InsertMachineInstrInMaps(*BfmMI);
1648   LIS->InsertMachineInstrInMaps(*CmpMI);
1649   LIS->InsertMachineInstrInMaps(*CmovMI);
1650 
1651   LIS->removeInterval(InputReg);
1652   LIS->createAndComputeVirtRegInterval(InputReg);
1653   LIS->createAndComputeVirtRegInterval(CountReg);
1654 }
1655 
1656 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1657 /// for instructions that depend on EXEC.
1658 MachineBasicBlock::iterator
1659 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1660   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1661 
1662   for (MachineInstr *MI : InitExecInstrs) {
1663     // Try to handle undefined cases gracefully:
1664     // - multiple INIT_EXEC instructions
1665     // - INIT_EXEC instructions not in the entry block
1666     if (MI->getParent() == &Entry)
1667       InsertPt = std::next(MI->getIterator());
1668 
1669     lowerInitExec(*MI);
1670     Changed = true;
1671   }
1672 
1673   return InsertPt;
1674 }
1675 
1676 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1677   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1678                     << " ------------- \n");
1679   LLVM_DEBUG(MF.dump(););
1680 
1681   Instructions.clear();
1682   Blocks.clear();
1683   LiveMaskQueries.clear();
1684   LowerToCopyInstrs.clear();
1685   LowerToMovInstrs.clear();
1686   KillInstrs.clear();
1687   InitExecInstrs.clear();
1688   SetInactiveInstrs.clear();
1689   StateTransition.clear();
1690 
1691   ST = &MF.getSubtarget<GCNSubtarget>();
1692 
1693   TII = ST->getInstrInfo();
1694   TRI = &TII->getRegisterInfo();
1695   MRI = &MF.getRegInfo();
1696   LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1697   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1698   MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1699   auto *PDTWrapper =
1700       getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1701   PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1702 
1703   if (ST->isWave32()) {
1704     AndOpc = AMDGPU::S_AND_B32;
1705     AndTermOpc = AMDGPU::S_AND_B32_term;
1706     AndN2Opc = AMDGPU::S_ANDN2_B32;
1707     XorOpc = AMDGPU::S_XOR_B32;
1708     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1709     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1710     WQMOpc = AMDGPU::S_WQM_B32;
1711     Exec = AMDGPU::EXEC_LO;
1712   } else {
1713     AndOpc = AMDGPU::S_AND_B64;
1714     AndTermOpc = AMDGPU::S_AND_B64_term;
1715     AndN2Opc = AMDGPU::S_ANDN2_B64;
1716     XorOpc = AMDGPU::S_XOR_B64;
1717     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1718     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1719     WQMOpc = AMDGPU::S_WQM_B64;
1720     Exec = AMDGPU::EXEC;
1721   }
1722 
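       // GlobalFlags is the union of execution modes (Exact, WQM, StrictWWM,
       // StrictWQM) required anywhere in the function.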
1723   const char GlobalFlags = analyzeFunction(MF);
1724   bool Changed = false;
1725 
1726   LiveMaskReg = Exec;
1727 
1728   MachineBasicBlock &Entry = MF.front();
1729   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1730 
1731   // Store a copy of the live mask (EXEC as of function entry) when required.
1732   const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1733   const bool HasWaveModes = GlobalFlags & ~StateExact;
1734   const bool HasKills = !KillInstrs.empty();
1735   const bool UsesWQM = GlobalFlags & StateWQM;
1736   if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1737     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1738     MachineInstr *MI =
1739         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1740             .addReg(Exec);
1741     LIS->InsertMachineInstrInMaps(*MI);
1742     Changed = true;
1743   }
1744 
1745   // Check if V_SET_INACTIVE was touched by a strict state mode.
1746   // If so, promote to WWM; otherwise lower to COPY.
1747   for (MachineInstr *MI : SetInactiveInstrs) {
1748     if (LowerToCopyInstrs.contains(MI))
1749       continue;
1750     if (Instructions[MI].MarkedStates & StateStrict) {
1751       Instructions[MI].Needs |= StateStrictWWM;
1752       Instructions[MI].Disabled &= ~StateStrictWWM;
1753       Blocks[MI->getParent()].Needs |= StateStrictWWM;
1754     } else {
1755       LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
1756       LowerToCopyInstrs.insert(MI);
1757     }
1758   }
1759 
1760   LLVM_DEBUG(printInfo());
1761 
1762   Changed |= lowerLiveMaskQueries();
1763   Changed |= lowerCopyInstrs();
1764 
1765   if (!HasWaveModes) {
1766     // No wave mode execution
1767     Changed |= lowerKillInstrs(false);
1768   } else if (GlobalFlags == StateWQM) {
1769     // Shader only needs WQM
1770     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1771                   .addReg(Exec);
1772     LIS->InsertMachineInstrInMaps(*MI);
1773     lowerKillInstrs(true);
1774     Changed = true;
1775   } else {
1776     // Wave mode switching requires full lowering pass.
1777     for (auto BII : Blocks)
1778       processBlock(*BII.first, BII.first == &Entry);
1779     // Lowering blocks can split them, so perform lowering as a second pass.
1780     for (auto BII : Blocks)
1781       lowerBlock(*BII.first);
1782     Changed = true;
1783   }
1784 
1785   // Compute live range for live mask
1786   if (LiveMaskReg != Exec)
1787     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1788 
1789   // Physical registers like SCC aren't tracked by default anyway, so just
1790   // removing the ranges we computed is the simplest option for maintaining
1791   // the analysis results.
1792   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1793 
1794   // If we lowered any kills or INIT_EXEC instructions then recompute EXEC
1795   if (!KillInstrs.empty() || !InitExecInstrs.empty())
1796     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1797 
1798   return Changed;
1799 }
1800