1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely, in
16 /// non-strict WQM inactive lanes may affect control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131   char MarkedStates = 0;
132 };
133 
134 struct BlockInfo {
135   char Needs = 0;
136   char InNeeds = 0;
137   char OutNeeds = 0;
138   char InitialState = 0;
139   bool NeedsLowering = false;
140 };
141 
142 struct WorkItem {
143   MachineBasicBlock *MBB = nullptr;
144   MachineInstr *MI = nullptr;
145 
146   WorkItem() = default;
147   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
148   WorkItem(MachineInstr *MI) : MI(MI) {}
149 };
150 
151 class SIWholeQuadMode : public MachineFunctionPass {
152 private:
153   const SIInstrInfo *TII;
154   const SIRegisterInfo *TRI;
155   const GCNSubtarget *ST;
156   MachineRegisterInfo *MRI;
157   LiveIntervals *LIS;
158   MachineDominatorTree *MDT;
159   MachinePostDominatorTree *PDT;
160 
161   unsigned AndOpc;
162   unsigned AndTermOpc;
163   unsigned AndN2Opc;
164   unsigned XorOpc;
165   unsigned AndSaveExecOpc;
166   unsigned AndSaveExecTermOpc;
167   unsigned WQMOpc;
168   Register Exec;
169   Register LiveMaskReg;
170 
171   DenseMap<const MachineInstr *, InstrInfo> Instructions;
172   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
173 
174   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
175   DenseMap<const MachineInstr *, char> StateTransition;
176 
177   SmallVector<MachineInstr *, 2> LiveMaskQueries;
178   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
179   SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
180   SmallVector<MachineInstr *, 4> KillInstrs;
181   SmallVector<MachineInstr *, 4> InitExecInstrs;
182   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
183 
184   void printInfo();
185 
186   void markInstruction(MachineInstr &MI, char Flag,
187                        std::vector<WorkItem> &Worklist);
188   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
189                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
190   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
191                    std::vector<WorkItem> &Worklist);
192   void markInstructionUses(const MachineInstr &MI, char Flag,
193                            std::vector<WorkItem> &Worklist);
194   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
195   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
196   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
197   char analyzeFunction(MachineFunction &MF);
198 
199   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
200                                       MachineBasicBlock::iterator Before);
201   MachineBasicBlock::iterator
202   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
203                    MachineBasicBlock::iterator Last, bool PreferLast,
204                    bool SaveSCC);
205   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206                Register SaveWQM);
207   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208              Register SavedWQM);
209   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
210                     Register SaveOrig, char StrictStateNeeded);
211   void fromStrictMode(MachineBasicBlock &MBB,
212                       MachineBasicBlock::iterator Before, Register SavedOrig,
213                       char NonStrictState, char CurrentStrictState);
214 
215   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
216 
217   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
218                             bool IsWQM);
219   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
220 
221   void lowerBlock(MachineBasicBlock &MBB);
222   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
223 
224   bool lowerLiveMaskQueries();
225   bool lowerCopyInstrs();
226   bool lowerKillInstrs(bool IsWQM);
227   void lowerInitExec(MachineInstr &MI);
228   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
229                                                   bool &Changed);
230 
231 public:
232   static char ID;
233 
234   SIWholeQuadMode() :
235     MachineFunctionPass(ID) { }
236 
237   bool runOnMachineFunction(MachineFunction &MF) override;
238 
239   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
240 
241   void getAnalysisUsage(AnalysisUsage &AU) const override {
242     AU.addRequired<LiveIntervalsWrapperPass>();
243     AU.addPreserved<SlotIndexesWrapperPass>();
244     AU.addPreserved<LiveIntervalsWrapperPass>();
245     AU.addPreserved<MachineDominatorTreeWrapperPass>();
246     AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
247     MachineFunctionPass::getAnalysisUsage(AU);
248   }
249 
250   MachineFunctionProperties getClearedProperties() const override {
251     return MachineFunctionProperties().set(
252         MachineFunctionProperties::Property::IsSSA);
253   }
254 };
255 
256 } // end anonymous namespace
257 
258 char SIWholeQuadMode::ID = 0;
259 
260 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
261                       false)
262 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
263 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
264 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
265 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
266                     false)
267 
268 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
269 
270 FunctionPass *llvm::createSIWholeQuadModePass() {
271   return new SIWholeQuadMode;
272 }
273 
274 #ifndef NDEBUG
275 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
276   for (const auto &BII : Blocks) {
277     dbgs() << "\n"
278            << printMBBReference(*BII.first) << ":\n"
279            << "  InNeeds = " << PrintState(BII.second.InNeeds)
280            << ", Needs = " << PrintState(BII.second.Needs)
281            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
282 
283     for (const MachineInstr &MI : *BII.first) {
284       auto III = Instructions.find(&MI);
285       if (III != Instructions.end()) {
286         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
287                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
288       }
289     }
290   }
291 }
292 #endif
293 
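/// Mark \p MI as needing the states in \p Flag; if this adds new requirements,
/// queue the instruction on \p Worklist so they can be propagated.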
294 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
295                                       std::vector<WorkItem> &Worklist) {
296   InstrInfo &II = Instructions[&MI];
297 
298   assert(!(Flag & StateExact) && Flag != 0);
299 
300   // Capture all states requested in marking including disabled ones.
301   II.MarkedStates |= Flag;
302 
303   // Remove any disabled states from the flag. The user that required it gets
304   // an undefined value in the helper lanes. For example, this can happen if
305   // the result of an atomic is used by an instruction that requires WQM, where
306   // ignoring the request for WQM is correct as per the relevant specs.
307   Flag &= ~II.Disabled;
308 
309   // Ignore if the flag is already encompassed by the existing needs, or we
310   // just disabled everything.
311   if ((II.Needs & Flag) == Flag)
312     return;
313 
314   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
315   II.Needs |= Flag;
316   Worklist.emplace_back(&MI);
317 }
318 
319 /// Mark all relevant definitions of register \p Reg used by \p UseMI.
320 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
321                                Register Reg, unsigned SubReg, char Flag,
322                                std::vector<WorkItem> &Worklist) {
323   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
324 
325   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
326   const VNInfo *Value = UseLRQ.valueIn();
327   if (!Value)
328     return;
329 
330   // Note: this code assumes that lane masks on AMDGPU completely
331   // cover registers.
332   const LaneBitmask UseLanes =
333       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
334              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
335                                 : LaneBitmask::getNone());
336 
337   // Perform a depth-first iteration of the LiveRange graph marking defs.
338   // Stop processing of a given branch when all use lanes have been defined.
339   // The first definition stops processing for a physical register.
340   struct PhiEntry {
341     const VNInfo *Phi;
342     unsigned PredIdx;
343     LaneBitmask DefinedLanes;
344 
345     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
346         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
347   };
348   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
349   SmallVector<PhiEntry, 2> PhiStack;
350   SmallSet<VisitKey, 4> Visited;
351   LaneBitmask DefinedLanes;
352   unsigned NextPredIdx = 0; // Only used for processing phi nodes
353   do {
354     const VNInfo *NextValue = nullptr;
355     const VisitKey Key(Value, DefinedLanes);
356 
357     if (Visited.insert(Key).second) {
358       // On the first visit to a phi, start processing from its first predecessor.
359       NextPredIdx = 0;
360     }
361 
362     if (Value->isPHIDef()) {
363       // Each predecessor node in the phi must be processed as a subgraph
364       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
365       assert(MBB && "Phi-def has no defining MBB");
366 
367       // Find next predecessor to process
368       unsigned Idx = NextPredIdx;
369       const auto *PI = MBB->pred_begin() + Idx;
370       const auto *PE = MBB->pred_end();
371       for (; PI != PE && !NextValue; ++PI, ++Idx) {
372         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
373           if (!Visited.count(VisitKey(VN, DefinedLanes)))
374             NextValue = VN;
375         }
376       }
377 
378       // If there are more predecessors to process, add the phi to the stack.
379       if (PI != PE)
380         PhiStack.emplace_back(Value, Idx, DefinedLanes);
381     } else {
382       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
383       assert(MI && "Def has no defining instruction");
384 
385       if (Reg.isVirtual()) {
386         // Iterate over all operands to find relevant definitions
387         bool HasDef = false;
388         for (const MachineOperand &Op : MI->all_defs()) {
389           if (Op.getReg() != Reg)
390             continue;
391 
392           // Compute lanes defined and overlap with use
393           LaneBitmask OpLanes =
394               Op.isUndef() ? LaneBitmask::getAll()
395                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
396           LaneBitmask Overlap = (UseLanes & OpLanes);
397 
398           // Record if this instruction defined any lanes of the use.
399           HasDef |= Overlap.any();
400 
401           // Mark any lanes defined
402           DefinedLanes |= OpLanes;
403         }
404 
405         // Check if all lanes of use have been defined
406         if ((DefinedLanes & UseLanes) != UseLanes) {
407           // Definition not complete; need to process input value
408           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
409           if (const VNInfo *VN = LRQ.valueIn()) {
410             if (!Visited.count(VisitKey(VN, DefinedLanes)))
411               NextValue = VN;
412           }
413         }
414 
415         // Only mark the instruction if it defines some part of the use
416         if (HasDef)
417           markInstruction(*MI, Flag, Worklist);
418       } else {
419         // For physical registers simply mark the defining instruction
420         markInstruction(*MI, Flag, Worklist);
421       }
422     }
423 
424     if (!NextValue && !PhiStack.empty()) {
425       // Reached the end of a chain; revert to processing the last phi.
426       PhiEntry &Entry = PhiStack.back();
427       NextValue = Entry.Phi;
428       NextPredIdx = Entry.PredIdx;
429       DefinedLanes = Entry.DefinedLanes;
430       PhiStack.pop_back();
431     }
432 
433     Value = NextValue;
434   } while (Value);
435 }
436 
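/// Mark all instructions defining the register operand \p Op of \p MI with
/// \p Flag, handling both virtual registers and tracked physical registers.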
437 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
438                                   const MachineOperand &Op, char Flag,
439                                   std::vector<WorkItem> &Worklist) {
440   assert(Op.isReg());
441   Register Reg = Op.getReg();
442 
443   // Ignore some hardware registers
444   switch (Reg) {
445   case AMDGPU::EXEC:
446   case AMDGPU::EXEC_LO:
447     return;
448   default:
449     break;
450   }
451 
452   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
453                     << " for " << MI);
454   if (Reg.isVirtual()) {
455     LiveRange &LR = LIS->getInterval(Reg);
456     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
457   } else {
458     // Handle physical registers that we need to track; this is mostly relevant
459     // for VCC, which can appear as the (implicit) input of a uniform branch,
460     // e.g. when a loop counter is stored in a VGPR.
461     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
462       LiveRange &LR = LIS->getRegUnit(Unit);
463       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
464       if (Value)
465         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
466     }
467   }
468 }
469 
470 /// Mark all instructions defining the uses in \p MI with \p Flag.
471 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
472                                           std::vector<WorkItem> &Worklist) {
473   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
474                     << MI);
475 
476   for (const MachineOperand &Use : MI.all_uses())
477     markOperand(MI, Use, Flag, Worklist);
478 }
479 
480 // Scan instructions to determine which ones require an Exact execmask and
481 // which ones seed WQM requirements.
482 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
483                                        std::vector<WorkItem> &Worklist) {
484   char GlobalFlags = 0;
485   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
486   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
487   bool HasImplicitDerivatives =
488       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
489 
490   // We need to visit the basic blocks in reverse post-order so that we visit
491   // defs before uses, in particular so that we don't accidentally mark an
492   // instruction as needing e.g. WQM before visiting it and realizing it needs
493   // WQM disabled.
494   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
495   for (MachineBasicBlock *MBB : RPOT) {
496     BlockInfo &BBI = Blocks[MBB];
497 
498     for (MachineInstr &MI : *MBB) {
499       InstrInfo &III = Instructions[&MI];
500       unsigned Opcode = MI.getOpcode();
501       char Flags = 0;
502 
503       if (TII->isWQM(Opcode)) {
504         // If LOD is not supported, WQM is not needed.
505         // Only generate implicit WQM if implicit derivatives are required.
506         // This avoids inserting unintended WQM if a shader type without
507         // implicit derivatives uses an image sampling instruction.
508         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
509           // Sampling instructions don't need to produce results for all pixels
510           // in a quad; they just require all inputs of a quad to have been
511           // computed for derivatives.
512           markInstructionUses(MI, StateWQM, Worklist);
513           GlobalFlags |= StateWQM;
514         }
515       } else if (Opcode == AMDGPU::WQM) {
516         // The WQM intrinsic requires its output to have all the helper lanes
517         // correct, so we need it to be in WQM.
518         Flags = StateWQM;
519         LowerToCopyInstrs.insert(&MI);
520       } else if (Opcode == AMDGPU::SOFT_WQM) {
521         LowerToCopyInstrs.insert(&MI);
522         SoftWQMInstrs.push_back(&MI);
523       } else if (Opcode == AMDGPU::STRICT_WWM) {
524         // The STRICT_WWM intrinsic doesn't make the same guarantee, and also
525         // it needs to be executed in WQM or Exact so that its copy doesn't
526         // clobber inactive lanes.
527         markInstructionUses(MI, StateStrictWWM, Worklist);
528         GlobalFlags |= StateStrictWWM;
529         LowerToMovInstrs.push_back(&MI);
530       } else if (Opcode == AMDGPU::STRICT_WQM ||
531                  TII->isDualSourceBlendEXP(MI)) {
532         // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
533         // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
534         // quads that have at least one active thread.
535         markInstructionUses(MI, StateStrictWQM, Worklist);
536         GlobalFlags |= StateStrictWQM;
537 
538         if (Opcode == AMDGPU::STRICT_WQM) {
539           LowerToMovInstrs.push_back(&MI);
540         } else {
541           // Dual source blend export acts as implicit strict-wqm; its sources
542           // need to be shuffled in strict wqm, but the export itself needs to
543           // run in exact mode.
544           BBI.Needs |= StateExact;
545           if (!(BBI.InNeeds & StateExact)) {
546             BBI.InNeeds |= StateExact;
547             Worklist.emplace_back(MBB);
548           }
549           GlobalFlags |= StateExact;
550           III.Disabled = StateWQM | StateStrict;
551         }
552       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
553                  Opcode == AMDGPU::DS_PARAM_LOAD ||
554                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
555                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
556         // Mark these STRICT_WQM, but only for the instruction, not its operands.
557         // This avoids unnecessarily marking M0 as requiring WQM.
558         III.Needs |= StateStrictWQM;
559         GlobalFlags |= StateStrictWQM;
560       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
561         // Disable strict states; StrictWQM will be added as required later.
562         III.Disabled = StateStrict;
563         MachineOperand &Inactive = MI.getOperand(4);
564         if (Inactive.isReg()) {
565           if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
566             LowerToCopyInstrs.insert(&MI);
567           else
568             markOperand(MI, Inactive, StateStrictWWM, Worklist);
569         }
570         SetInactiveInstrs.push_back(&MI);
571         BBI.NeedsLowering = true;
572       } else if (TII->isDisableWQM(MI)) {
573         BBI.Needs |= StateExact;
574         if (!(BBI.InNeeds & StateExact)) {
575           BBI.InNeeds |= StateExact;
576           Worklist.emplace_back(MBB);
577         }
578         GlobalFlags |= StateExact;
579         III.Disabled = StateWQM | StateStrict;
580       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
581                  Opcode == AMDGPU::SI_LIVE_MASK) {
582         LiveMaskQueries.push_back(&MI);
583       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
584                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
585                  Opcode == AMDGPU::SI_DEMOTE_I1) {
586         KillInstrs.push_back(&MI);
587         BBI.NeedsLowering = true;
588       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
589                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
590                  Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
591         InitExecInstrs.push_back(&MI);
592       } else if (WQMOutputs) {
593         // The function is in machine SSA form, which means that physical
594         // VGPRs correspond to shader inputs and outputs. Inputs are
595         // only used, outputs are only defined.
596         // FIXME: is this still valid?
597         for (const MachineOperand &MO : MI.defs()) {
598           Register Reg = MO.getReg();
599           if (Reg.isPhysical() &&
600               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
601             Flags = StateWQM;
602             break;
603           }
604         }
605       }
606 
607       if (Flags) {
608         markInstruction(MI, Flags, Worklist);
609         GlobalFlags |= Flags;
610       }
611     }
612   }
613 
614   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
615   // ever used anywhere in the function. This implements the corresponding
616   // semantics of @llvm.amdgcn.set.inactive.
617   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
618   if (GlobalFlags & StateWQM) {
619     for (MachineInstr *MI : SetInactiveInstrs)
620       markInstruction(*MI, StateWQM, Worklist);
621     for (MachineInstr *MI : SoftWQMInstrs)
622       markInstruction(*MI, StateWQM, Worklist);
623   }
624 
625   return GlobalFlags;
626 }
627 
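// Propagate the needs of \p MI backwards to the preceding instruction and up
// to the block level, and mark the instructions defining its inputs.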
628 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
629                                            std::vector<WorkItem>& Worklist) {
630   MachineBasicBlock *MBB = MI.getParent();
631   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
632   BlockInfo &BI = Blocks[MBB];
633 
634   // Control flow-type instructions and stores to temporary memory that are
635   // followed by WQM computations must themselves be in WQM.
636   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
637       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
638     Instructions[&MI].Needs = StateWQM;
639     II.Needs = StateWQM;
640   }
641 
642   // Propagate to block level
643   if (II.Needs & StateWQM) {
644     BI.Needs |= StateWQM;
645     if (!(BI.InNeeds & StateWQM)) {
646       BI.InNeeds |= StateWQM;
647       Worklist.emplace_back(MBB);
648     }
649   }
650 
651   // Propagate backwards within block
652   if (MachineInstr *PrevMI = MI.getPrevNode()) {
653     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
654     if (!PrevMI->isPHI()) {
655       InstrInfo &PrevII = Instructions[PrevMI];
656       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
657         PrevII.OutNeeds |= InNeeds;
658         Worklist.emplace_back(PrevMI);
659       }
660     }
661   }
662 
663   // Propagate WQM flag to instruction inputs
664   assert(!(II.Needs & StateExact));
665 
666   if (II.Needs != 0)
667     markInstructionUses(MI, II.Needs, Worklist);
668 
669   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
670   // not require any WQM transitions.
671   if (II.Needs & StateStrictWWM)
672     BI.Needs |= StateStrictWWM;
673   if (II.Needs & StateStrictWQM)
674     BI.Needs |= StateStrictWQM;
675 }
676 
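// Propagate block-level needs: feed OutNeeds into the block's last
// instruction and push InNeeds/OutNeeds to predecessors and successors.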
677 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
678                                      std::vector<WorkItem>& Worklist) {
679   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
680 
681   // Propagate through instructions
682   if (!MBB.empty()) {
683     MachineInstr *LastMI = &*MBB.rbegin();
684     InstrInfo &LastII = Instructions[LastMI];
685     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
686       LastII.OutNeeds |= BI.OutNeeds;
687       Worklist.emplace_back(LastMI);
688     }
689   }
690 
691   // Predecessor blocks must provide for our WQM/Exact needs.
692   for (MachineBasicBlock *Pred : MBB.predecessors()) {
693     BlockInfo &PredBI = Blocks[Pred];
694     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
695       continue;
696 
697     PredBI.OutNeeds |= BI.InNeeds;
698     PredBI.InNeeds |= BI.InNeeds;
699     Worklist.emplace_back(Pred);
700   }
701 
702   // All successors must be prepared to accept the same set of WQM/Exact data.
703   for (MachineBasicBlock *Succ : MBB.successors()) {
704     BlockInfo &SuccBI = Blocks[Succ];
705     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
706       continue;
707 
708     SuccBI.InNeeds |= BI.OutNeeds;
709     Worklist.emplace_back(Succ);
710   }
711 }
712 
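// Run the scan and the propagation worklist to a fixed point; returns the
// combined set of execution states used anywhere in the function.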
713 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
714   std::vector<WorkItem> Worklist;
715   char GlobalFlags = scanInstructions(MF, Worklist);
716 
717   while (!Worklist.empty()) {
718     WorkItem WI = Worklist.back();
719     Worklist.pop_back();
720 
721     if (WI.MI)
722       propagateInstruction(*WI.MI, Worklist);
723     else
724       propagateBlock(*WI.MBB, Worklist);
725   }
726 
727   return GlobalFlags;
728 }
729 
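// Save SCC to a temporary SGPR and restore it again, both before \p Before;
// returns the restore copy so that SCC-clobbering code can be inserted
// between the save and the restore.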
730 MachineBasicBlock::iterator
731 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
732                          MachineBasicBlock::iterator Before) {
733   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
734 
735   MachineInstr *Save =
736       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
737           .addReg(AMDGPU::SCC);
738   MachineInstr *Restore =
739       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
740           .addReg(SaveReg);
741 
742   LIS->InsertMachineInstrInMaps(*Save);
743   LIS->InsertMachineInstrInMaps(*Restore);
744   LIS->createAndComputeVirtRegInterval(SaveReg);
745 
746   return Restore;
747 }
748 
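// Split \p BB after \p TermMI, converting \p TermMI to its terminator form
// where one exists, and update the (post)dominator trees and live intervals.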
749 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
750                                                MachineInstr *TermMI) {
751   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
752                     << *TermMI << "\n");
753 
754   MachineBasicBlock *SplitBB =
755       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
756 
757   // Convert last instruction in block to a terminator.
758   // Note: this only covers the expected patterns
759   unsigned NewOpcode = 0;
760   switch (TermMI->getOpcode()) {
761   case AMDGPU::S_AND_B32:
762     NewOpcode = AMDGPU::S_AND_B32_term;
763     break;
764   case AMDGPU::S_AND_B64:
765     NewOpcode = AMDGPU::S_AND_B64_term;
766     break;
767   case AMDGPU::S_MOV_B32:
768     NewOpcode = AMDGPU::S_MOV_B32_term;
769     break;
770   case AMDGPU::S_MOV_B64:
771     NewOpcode = AMDGPU::S_MOV_B64_term;
772     break;
773   default:
774     break;
775   }
776   if (NewOpcode)
777     TermMI->setDesc(TII->get(NewOpcode));
778 
779   if (SplitBB != BB) {
780     // Update dominator trees
781     using DomTreeT = DomTreeBase<MachineBasicBlock>;
782     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
783     for (MachineBasicBlock *Succ : SplitBB->successors()) {
784       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
785       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
786     }
787     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
788     if (MDT)
789       MDT->applyUpdates(DTUpdates);
790     if (PDT)
791       PDT->applyUpdates(DTUpdates);
792 
793     // Link blocks
794     MachineInstr *MI =
795         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
796             .addMBB(SplitBB);
797     LIS->InsertMachineInstrInMaps(*MI);
798   }
799 
800   return SplitBB;
801 }
802 
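// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit an inverted V_CMP into VCC,
// clear the killed lanes from the live mask and EXEC, and terminate the
// block with a branch to its single successor.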
803 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
804                                             MachineInstr &MI) {
805   assert(LiveMaskReg.isVirtual());
806 
807   const DebugLoc &DL = MI.getDebugLoc();
808   unsigned Opcode = 0;
809 
810   assert(MI.getOperand(0).isReg());
811 
812   // Comparison is for live lanes; however here we compute the inverse
813   // (killed lanes).  This is because VCMP will always generate 0 bits
814   // for inactive lanes so a mask of live lanes would not be correct
815   // inside control flow.
816   // Invert the comparison by swapping the operands and adjusting
817   // the comparison codes.
818 
819   switch (MI.getOperand(2).getImm()) {
820   case ISD::SETUEQ:
821     Opcode = AMDGPU::V_CMP_LG_F32_e64;
822     break;
823   case ISD::SETUGT:
824     Opcode = AMDGPU::V_CMP_GE_F32_e64;
825     break;
826   case ISD::SETUGE:
827     Opcode = AMDGPU::V_CMP_GT_F32_e64;
828     break;
829   case ISD::SETULT:
830     Opcode = AMDGPU::V_CMP_LE_F32_e64;
831     break;
832   case ISD::SETULE:
833     Opcode = AMDGPU::V_CMP_LT_F32_e64;
834     break;
835   case ISD::SETUNE:
836     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
837     break;
838   case ISD::SETO:
839     Opcode = AMDGPU::V_CMP_O_F32_e64;
840     break;
841   case ISD::SETUO:
842     Opcode = AMDGPU::V_CMP_U_F32_e64;
843     break;
844   case ISD::SETOEQ:
845   case ISD::SETEQ:
846     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
847     break;
848   case ISD::SETOGT:
849   case ISD::SETGT:
850     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
851     break;
852   case ISD::SETOGE:
853   case ISD::SETGE:
854     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
855     break;
856   case ISD::SETOLT:
857   case ISD::SETLT:
858     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
859     break;
860   case ISD::SETOLE:
861   case ISD::SETLE:
862     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
863     break;
864   case ISD::SETONE:
865   case ISD::SETNE:
866     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
867     break;
868   default:
869     llvm_unreachable("invalid ISD:SET cond code");
870   }
871 
872   // Pick opcode based on comparison type.
873   MachineInstr *VcmpMI;
874   const MachineOperand &Op0 = MI.getOperand(0);
875   const MachineOperand &Op1 = MI.getOperand(1);
876 
877   // VCC represents lanes killed.
878   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
879 
880   if (TRI->isVGPR(*MRI, Op0.getReg())) {
881     Opcode = AMDGPU::getVOPe32(Opcode);
882     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
883   } else {
884     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
885                  .addReg(VCC, RegState::Define)
886                  .addImm(0) // src0 modifiers
887                  .add(Op1)
888                  .addImm(0) // src1 modifiers
889                  .add(Op0)
890                  .addImm(0); // omod
891   }
892 
893   MachineInstr *MaskUpdateMI =
894       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
895           .addReg(LiveMaskReg)
896           .addReg(VCC);
897 
898   // The state of SCC represents whether any lanes are live in the mask;
899   // if SCC is 0 then no lanes will be alive anymore.
900   MachineInstr *EarlyTermMI =
901       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
902 
903   MachineInstr *ExecMaskMI =
904       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
905 
906   assert(MBB.succ_size() == 1);
907   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
908                               .addMBB(*MBB.succ_begin());
909 
910   // Update live intervals
911   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
912   MBB.remove(&MI);
913 
914   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
915   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
916   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
917   LIS->InsertMachineInstrInMaps(*NewTerm);
918 
919   return NewTerm;
920 }
921 
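// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1: remove the killed lanes from
// the live mask, then update EXEC (for a demote, keep whole quads that still
// contain live lanes).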
922 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
923                                            MachineInstr &MI, bool IsWQM) {
924   assert(LiveMaskReg.isVirtual());
925 
926   const DebugLoc &DL = MI.getDebugLoc();
927   MachineInstr *MaskUpdateMI = nullptr;
928 
929   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
930   const MachineOperand &Op = MI.getOperand(0);
931   int64_t KillVal = MI.getOperand(1).getImm();
932   MachineInstr *ComputeKilledMaskMI = nullptr;
933   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
934   Register TmpReg;
935 
936   // Is this a static or dynamic kill?
937   if (Op.isImm()) {
938     if (Op.getImm() == KillVal) {
939       // Static: all active lanes are killed
940       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
941                          .addReg(LiveMaskReg)
942                          .addReg(Exec);
943     } else {
944       // Static: kill does nothing
945       MachineInstr *NewTerm = nullptr;
946       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
947         LIS->RemoveMachineInstrFromMaps(MI);
948       } else {
949         assert(MBB.succ_size() == 1);
950         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
951                       .addMBB(*MBB.succ_begin());
952         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
953       }
954       MBB.remove(&MI);
955       return NewTerm;
956     }
957   } else {
958     if (!KillVal) {
959       // Op represents live lanes after kill,
960       // so exec mask needs to be factored in.
961       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
962       ComputeKilledMaskMI =
963           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
964       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
965                          .addReg(LiveMaskReg)
966                          .addReg(TmpReg);
967     } else {
968       // Op represents lanes to kill
969       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
970                          .addReg(LiveMaskReg)
971                          .add(Op);
972     }
973   }
974 
975   // The state of SCC represents whether any lanes are live in the mask;
976   // if SCC is 0 then no lanes will be alive anymore.
977   MachineInstr *EarlyTermMI =
978       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
979 
980   // If we got this far, some lanes are still live;
981   // update EXEC to deactivate lanes as appropriate.
982   MachineInstr *NewTerm;
983   MachineInstr *WQMMaskMI = nullptr;
984   Register LiveMaskWQM;
985   if (IsDemote) {
986     // Demote - deactivate quads with only helper lanes
987     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
988     WQMMaskMI =
989         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
990     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
991                   .addReg(Exec)
992                   .addReg(LiveMaskWQM);
993   } else {
994     // Kill - deactivate lanes no longer in live mask
995     if (Op.isImm()) {
996       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
997       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
998     } else if (!IsWQM) {
999       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1000                     .addReg(Exec)
1001                     .addReg(LiveMaskReg);
1002     } else {
1003       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1004       NewTerm =
1005           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1006     }
1007   }
1008 
1009   // Update live intervals
1010   LIS->RemoveMachineInstrFromMaps(MI);
1011   MBB.remove(&MI);
1012   assert(EarlyTermMI);
1013   assert(MaskUpdateMI);
1014   assert(NewTerm);
1015   if (ComputeKilledMaskMI)
1016     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1017   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1018   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1019   if (WQMMaskMI)
1020     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1021   LIS->InsertMachineInstrInMaps(*NewTerm);
1022 
1023   if (CndReg) {
1024     LIS->removeInterval(CndReg);
1025     LIS->createAndComputeVirtRegInterval(CndReg);
1026   }
1027   if (TmpReg)
1028     LIS->createAndComputeVirtRegInterval(TmpReg);
1029   if (LiveMaskWQM)
1030     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1031 
1032   return NewTerm;
1033 }
1034 
1035 // Replace (or supplement) instructions accessing the live mask.
1036 // This can only happen once all the live mask registers have been created
1037 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1038 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1039   auto *BII = Blocks.find(&MBB);
1040   if (BII == Blocks.end())
1041     return;
1042 
1043   const BlockInfo &BI = BII->second;
1044   if (!BI.NeedsLowering)
1045     return;
1046 
1047   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1048 
1049   SmallVector<MachineInstr *, 4> SplitPoints;
1050   Register ActiveLanesReg = 0;
1051   char State = BI.InitialState;
1052 
1053   for (MachineInstr &MI : llvm::make_early_inc_range(
1054            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1055     if (StateTransition.count(&MI))
1056       State = StateTransition[&MI];
1057 
1058     MachineInstr *SplitPoint = nullptr;
1059     switch (MI.getOpcode()) {
1060     case AMDGPU::SI_DEMOTE_I1:
1061     case AMDGPU::SI_KILL_I1_TERMINATOR:
1062       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1063       break;
1064     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1065       SplitPoint = lowerKillF32(MBB, MI);
1066       break;
1067     case AMDGPU::ENTER_STRICT_WWM:
1068       ActiveLanesReg = MI.getOperand(0).getReg();
1069       break;
1070     case AMDGPU::EXIT_STRICT_WWM:
1071       ActiveLanesReg = 0;
1072       break;
1073     case AMDGPU::V_SET_INACTIVE_B32:
1074       if (ActiveLanesReg) {
1075         LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
1076         MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
1077         MI.getOperand(5).setReg(ActiveLanesReg);
1078         LIS->shrinkToUses(&LI);
1079       } else {
1080         assert(State == StateExact || State == StateWQM);
1081       }
1082       break;
1083     default:
1084       break;
1085     }
1086     if (SplitPoint)
1087       SplitPoints.push_back(SplitPoint);
1088   }
1089 
1090   // Perform splitting after instruction scan to simplify iteration.
1091   if (!SplitPoints.empty()) {
1092     MachineBasicBlock *BB = &MBB;
1093     for (MachineInstr *MI : SplitPoints) {
1094       BB = splitBlock(BB, MI);
1095     }
1096   }
1097 }
1098 
1099 // Return an iterator in the (inclusive) range [First, Last] at which
1100 // instructions can be safely inserted, keeping in mind that some of the
1101 // instructions we want to add necessarily clobber SCC.
1102 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1103     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1104     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1105   if (!SaveSCC)
1106     return PreferLast ? Last : First;
1107 
1108   LiveRange &LR =
1109       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1110   auto MBBE = MBB.end();
1111   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1112                                      : LIS->getMBBEndIdx(&MBB);
1113   SlotIndex LastIdx =
1114       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1115   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1116   const LiveRange::Segment *S;
1117 
1118   for (;;) {
1119     S = LR.getSegmentContaining(Idx);
1120     if (!S)
1121       break;
1122 
1123     if (PreferLast) {
1124       SlotIndex Next = S->start.getBaseIndex();
1125       if (Next < FirstIdx)
1126         break;
1127       Idx = Next;
1128     } else {
1129       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1130       assert(EndMI && "Segment does not end on valid instruction");
1131       auto NextI = std::next(EndMI->getIterator());
1132       if (NextI == MBB.end())
1133         break;
1134       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1135       if (Next > LastIdx)
1136         break;
1137       Idx = Next;
1138     }
1139   }
1140 
1141   MachineBasicBlock::iterator MBBI;
1142 
1143   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1144     MBBI = MI;
1145   else {
1146     assert(Idx == LIS->getMBBEndIdx(&MBB));
1147     MBBI = MBB.end();
1148   }
1149 
1150   // Move insertion point past any operations modifying EXEC.
1151   // This assumes that the value of SCC defined by any of these operations
1152   // does not need to be preserved.
1153   while (MBBI != Last) {
1154     bool IsExecDef = false;
1155     for (const MachineOperand &MO : MBBI->all_defs()) {
1156       IsExecDef |=
1157           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1158     }
1159     if (!IsExecDef)
1160       break;
1161     MBBI++;
1162     S = nullptr;
1163   }
1164 
1165   if (S)
1166     MBBI = saveSCC(MBB, MBBI);
1167 
1168   return MBBI;
1169 }
1170 
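// Switch to Exact mode by ANDing EXEC with the live mask; if \p SaveWQM is
// given, the previous EXEC is saved into it. Terminator opcodes are used when
// the insertion point lies among the block's terminators.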
1171 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1172                               MachineBasicBlock::iterator Before,
1173                               Register SaveWQM) {
1174   assert(LiveMaskReg.isVirtual());
1175 
1176   bool IsTerminator = Before == MBB.end();
1177   if (!IsTerminator) {
1178     auto FirstTerm = MBB.getFirstTerminator();
1179     if (FirstTerm != MBB.end()) {
1180       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1181       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1182       IsTerminator = BeforeIdx > FirstTermIdx;
1183     }
1184   }
1185 
1186   MachineInstr *MI;
1187 
1188   if (SaveWQM) {
1189     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1190     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1191              .addReg(LiveMaskReg);
1192   } else {
1193     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1194     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1195              .addReg(Exec)
1196              .addReg(LiveMaskReg);
1197   }
1198 
1199   LIS->InsertMachineInstrInMaps(*MI);
1200   StateTransition[MI] = StateExact;
1201 }
1202 
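// Switch to WQM, either by restoring a previously saved EXEC from \p SavedWQM
// or by recomputing WQM from the current EXEC.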
1203 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1204                             MachineBasicBlock::iterator Before,
1205                             Register SavedWQM) {
1206   MachineInstr *MI;
1207 
1208   if (SavedWQM) {
1209     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1210              .addReg(SavedWQM);
1211   } else {
1212     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1213   }
1214 
1215   LIS->InsertMachineInstrInMaps(*MI);
1216   StateTransition[MI] = StateWQM;
1217 }
1218 
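// Enter StrictWWM or StrictWQM as requested by \p StrictStateNeeded, saving
// the current EXEC into \p SaveOrig.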
1219 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1220                                    MachineBasicBlock::iterator Before,
1221                                    Register SaveOrig, char StrictStateNeeded) {
1222   MachineInstr *MI;
1223   assert(SaveOrig);
1224   assert(StrictStateNeeded == StateStrictWWM ||
1225          StrictStateNeeded == StateStrictWQM);
1226 
1227   if (StrictStateNeeded == StateStrictWWM) {
1228     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1229                  SaveOrig)
1230              .addImm(-1);
1231   } else {
1232     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1233                  SaveOrig)
1234              .addImm(-1);
1235   }
1236   LIS->InsertMachineInstrInMaps(*MI);
1237   StateTransition[MI] = StrictStateNeeded;
1238 }
1239 
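// Leave strict mode, restoring EXEC from \p SavedOrig and recording the
// resulting non-strict state.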
1240 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1241                                      MachineBasicBlock::iterator Before,
1242                                      Register SavedOrig, char NonStrictState,
1243                                      char CurrentStrictState) {
1244   MachineInstr *MI;
1245 
1246   assert(SavedOrig);
1247   assert(CurrentStrictState == StateStrictWWM ||
1248          CurrentStrictState == StateStrictWQM);
1249 
1250   if (CurrentStrictState == StateStrictWWM) {
1251     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1252                  Exec)
1253              .addReg(SavedOrig);
1254   } else {
1255     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1256                  Exec)
1257              .addReg(SavedOrig);
1258   }
1259   LIS->InsertMachineInstrInMaps(*MI);
1260   StateTransition[MI] = NonStrictState;
1261 }
1262 
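// Insert the mode transitions required within \p MBB, switching between
// Exact, WQM and the strict modes based on the analysis results.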
1263 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1264   auto *BII = Blocks.find(&MBB);
1265   if (BII == Blocks.end())
1266     return;
1267 
1268   BlockInfo &BI = BII->second;
1269 
1270   // This is a non-entry block that is WQM throughout, so no need to do
1271   // anything.
1272   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1273     BI.InitialState = StateWQM;
1274     return;
1275   }
1276 
1277   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1278                     << ":\n");
1279 
1280   Register SavedWQMReg;
1281   Register SavedNonStrictReg;
1282   bool WQMFromExec = IsEntry;
1283   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1284   char NonStrictState = 0;
1285   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1286 
1287   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1288   if (IsEntry) {
1289     // Skip the instruction that saves LiveMask
1290     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1291         II->getOperand(1).getReg() == TRI->getExec())
1292       ++II;
1293   }
1294 
1295   // This stores the first instruction where it's safe to switch from WQM to
1296   // Exact or vice versa.
1297   MachineBasicBlock::iterator FirstWQM = IE;
1298 
1299   // This stores the first instruction where it's safe to switch from Strict
1300   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1301   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1302   // be safe to switch to/from WQM as well.
1303   MachineBasicBlock::iterator FirstStrict = IE;
1304 
1305   // Record the initial state in the block information.
1306   BI.InitialState = State;
1307 
1308   for (;;) {
1309     MachineBasicBlock::iterator Next = II;
1310     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1311     char OutNeeds = 0;
1312 
1313     if (FirstWQM == IE)
1314       FirstWQM = II;
1315 
1316     if (FirstStrict == IE)
1317       FirstStrict = II;
1318 
1319     // First, figure out the allowed states (Needs) based on the propagated
1320     // flags.
1321     if (II != IE) {
1322       MachineInstr &MI = *II;
1323 
1324       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1325         auto III = Instructions.find(&MI);
1326         if (III != Instructions.end()) {
1327           if (III->second.Needs & StateStrictWWM)
1328             Needs = StateStrictWWM;
1329           else if (III->second.Needs & StateStrictWQM)
1330             Needs = StateStrictWQM;
1331           else if (III->second.Needs & StateWQM)
1332             Needs = StateWQM;
1333           else
1334             Needs &= ~III->second.Disabled;
1335           OutNeeds = III->second.OutNeeds;
1336         }
1337       } else {
1338         // If the instruction doesn't actually need a correct EXEC, then we can
1339         // safely leave Strict mode enabled.
1340         Needs = StateExact | StateWQM | StateStrict;
1341       }
1342 
1343       // Exact mode exit can occur in terminators, but must be before branches.
1344       if (MI.isBranch() && OutNeeds == StateExact)
1345         Needs = StateExact;
1346 
1347       ++Next;
1348     } else {
1349       // End of basic block
1350       if (BI.OutNeeds & StateWQM)
1351         Needs = StateWQM;
1352       else if (BI.OutNeeds == StateExact)
1353         Needs = StateExact;
1354       else
1355         Needs = StateWQM | StateExact;
1356     }
1357 
1358     // Now, transition if necessary.
1359     if (!(Needs & State)) {
1360       MachineBasicBlock::iterator First;
1361       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1362           State == StateStrictWQM || Needs == StateStrictWQM) {
1363         // We must switch to or from Strict mode.
1364         First = FirstStrict;
1365       } else {
1366         // We only need to switch to/from WQM, so we can use FirstWQM.
1367         First = FirstWQM;
1368       }
1369 
1370       // Whether we need to save SCC depends on start and end states.
1371       bool SaveSCC = false;
1372       switch (State) {
1373       case StateExact:
1374       case StateStrictWWM:
1375       case StateStrictWQM:
1376         // Exact/Strict -> Strict: save SCC
1377         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1378         // Exact/Strict -> Exact: no save
1379         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1380         break;
1381       case StateWQM:
1382         // WQM -> Exact/Strict: save SCC
1383         SaveSCC = !(Needs & StateWQM);
1384         break;
1385       default:
1386         llvm_unreachable("Unknown state");
1387         break;
1388       }
1389       char StartState = State & StateStrict ? NonStrictState : State;
1390       bool WQMToExact =
1391           StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1392       bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1393                         !(Needs & StateExact);
1394       bool PreferLast = Needs == StateWQM;
1395       // Exact regions in divergent control flow may run at EXEC=0, so try to
1396       // exclude instructions with unexpected effects from them.
1397       // FIXME: ideally we would branch over these when EXEC=0,
1398       // but this requires updating implicit values, live intervals and CFG.
1399       if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1400         for (MachineBasicBlock::iterator I = First; I != II; ++I) {
1401           if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
1402             PreferLast = WQMToExact;
1403             break;
1404           }
1405         }
1406       }
1407       MachineBasicBlock::iterator Before =
1408           prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
1409 
1410       if (State & StateStrict) {
1411         assert(State == StateStrictWWM || State == StateStrictWQM);
1412         assert(SavedNonStrictReg);
1413         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1414 
1415         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1416         SavedNonStrictReg = 0;
1417         State = NonStrictState;
1418       }
1419 
1420       if (Needs & StateStrict) {
1421         NonStrictState = State;
1422         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1423         assert(!SavedNonStrictReg);
1424         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1425 
1426         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1427         State = Needs;
1428       } else {
1429         if (WQMToExact) {
1430           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1431             assert(!SavedWQMReg);
1432             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1433           }
1434 
1435           toExact(MBB, Before, SavedWQMReg);
1436           State = StateExact;
1437         } else if (ExactToWQM) {
1438           assert(WQMFromExec == (SavedWQMReg == 0));
1439 
1440           toWQM(MBB, Before, SavedWQMReg);
1441 
1442           if (SavedWQMReg) {
1443             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1444             SavedWQMReg = 0;
1445           }
1446           State = StateWQM;
1447         } else {
1448           // We can get here if we transitioned from StrictWWM to a
1449           // non-StrictWWM state that already matches our needs, in which
1450           // case there is nothing further to do.
1451           assert(Needs & State);
1452         }
1453       }
1454     }
1455 
1456     if (Needs != (StateExact | StateWQM | StateStrict)) {
1457       if (Needs != (StateExact | StateWQM))
1458         FirstWQM = IE;
1459       FirstStrict = IE;
1460     }
1461 
1462     if (II == IE)
1463       break;
1464 
1465     II = Next;
1466   }
1467   assert(!SavedWQMReg);
1468   assert(!SavedNonStrictReg);
1469 }
1470 
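// Replace SI_PS_LIVE and SI_LIVE_MASK pseudos with copies of the live mask.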
1471 bool SIWholeQuadMode::lowerLiveMaskQueries() {
1472   for (MachineInstr *MI : LiveMaskQueries) {
1473     const DebugLoc &DL = MI->getDebugLoc();
1474     Register Dest = MI->getOperand(0).getReg();
1475 
1476     MachineInstr *Copy =
1477         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1478             .addReg(LiveMaskReg);
1479 
1480     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1481     MI->eraseFromParent();
1482   }
1483   return !LiveMaskQueries.empty();
1484 }
1485 
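/// Rewrite the recorded WQM/WWM marker pseudos, and any V_SET_INACTIVE that
/// needed no WWM handling, into plain copies or moves now that the mode
/// regions have been fixed.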
1486 bool SIWholeQuadMode::lowerCopyInstrs() {
1487   for (MachineInstr *MI : LowerToMovInstrs) {
1488     assert(MI->getNumExplicitOperands() == 2);
1489 
1490     const Register Reg = MI->getOperand(0).getReg();
1491 
1492     const TargetRegisterClass *regClass =
1493         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1494     if (TRI->isVGPRClass(regClass)) {
1495       const unsigned MovOp = TII->getMovOpcode(regClass);
1496       MI->setDesc(TII->get(MovOp));
1497 
1498       // Check that it already implicitly depends on exec (as all VALU movs
1499       // should).
1500       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1501         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1502       }));
1503     } else {
1504       // Remove the early-clobber flag and exec dependency from simple SGPR
1505       // copies, which allows some of them to be eliminated during or after RA.
1506       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1507       if (MI->getOperand(0).isEarlyClobber()) {
1508         LIS->removeInterval(Reg);
1509         MI->getOperand(0).setIsEarlyClobber(false);
1510         LIS->createAndComputeVirtRegInterval(Reg);
1511       }
1512       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1513       while (Index >= 0) {
1514         MI->removeOperand(Index);
1515         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1516       }
1517       MI->setDesc(TII->get(AMDGPU::COPY));
1518       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1519     }
1520   }
1521   for (MachineInstr *MI : LowerToCopyInstrs) {
1522     LLVM_DEBUG(dbgs() << "simplify: " << *MI);
1523 
1524     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1525       assert(MI->getNumExplicitOperands() == 6);
1526 
1527       LiveInterval *RecomputeLI = nullptr;
1528       if (MI->getOperand(4).isReg())
1529         RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
1530 
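      // Strip everything except the destination and the active-lane source
      // (operands 0 and 2); the surviving pair is rewritten into a COPY or
      // mov below.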
1531       MI->removeOperand(5);
1532       MI->removeOperand(4);
1533       MI->removeOperand(3);
1534       MI->removeOperand(1);
1535 
1536       if (RecomputeLI)
1537         LIS->shrinkToUses(RecomputeLI);
1538     } else {
1539       assert(MI->getNumExplicitOperands() == 2);
1540     }
1541 
1542     unsigned CopyOp = MI->getOperand(1).isReg()
1543                           ? (unsigned)AMDGPU::COPY
1544                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1545                                 *MRI, MI->getOperand(0)));
1546     MI->setDesc(TII->get(CopyOp));
1547     LLVM_DEBUG(dbgs() << " -> " << *MI);
1548   }
1549   return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1550 }
1551 
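/// Lower the collected kill and demote pseudos. \p IsWQM tells the lowering
/// whether the shader runs in whole quad mode. Kill lowering may produce a
/// split point, at which the containing block is then split.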
1552 bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1553   for (MachineInstr *MI : KillInstrs) {
1554     MachineBasicBlock *MBB = MI->getParent();
1555     MachineInstr *SplitPoint = nullptr;
1556     switch (MI->getOpcode()) {
1557     case AMDGPU::SI_DEMOTE_I1:
1558     case AMDGPU::SI_KILL_I1_TERMINATOR:
1559       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1560       break;
1561     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1562       SplitPoint = lowerKillF32(*MBB, *MI);
1563       break;
1564     }
1565     if (SplitPoint)
1566       splitBlock(MBB, SplitPoint);
1567   }
1568   return !KillInstrs.empty();
1569 }
1570 
1571 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1572   MachineBasicBlock *MBB = MI.getParent();
1573   bool IsWave32 = ST->isWave32();
1574 
1575   if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1576     assert(MBB == &MBB->getParent()->front() &&
1577            "init whole wave not in entry block");
1578     Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
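    // S_OR_SAVEEXEC with an all-ones source copies the incoming EXEC into
    // EntryExec and then enables every lane, which is what
    // SI_INIT_WHOLE_WAVE expects at function entry.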
1579     MachineInstr *SaveExec =
1580         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1581                 TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1582                                   : AMDGPU::S_OR_SAVEEXEC_B64),
1583                 EntryExec)
1584             .addImm(-1);
1585 
1586     // Replace all uses of MI's destination reg with EntryExec.
1587     MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
1588 
1589     if (LIS) {
1590       LIS->RemoveMachineInstrFromMaps(MI);
1591     }
1592 
1593     MI.eraseFromParent();
1594 
1595     if (LIS) {
1596       LIS->InsertMachineInstrInMaps(*SaveExec);
1597       LIS->createAndComputeVirtRegInterval(EntryExec);
1598     }
1599     return;
1600   }
1601 
1602   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1603     // This should be before all vector instructions.
1604     MachineInstr *InitMI =
1605         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1606                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1607                 Exec)
1608             .addImm(MI.getOperand(0).getImm());
1609     if (LIS) {
1610       LIS->RemoveMachineInstrFromMaps(MI);
1611       LIS->InsertMachineInstrInMaps(*InitMI);
1612     }
1613     MI.eraseFromParent();
1614     return;
1615   }
1616 
1617   // Extract the thread count from an SGPR input and set EXEC accordingly.
1618   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1619   //
1620   // S_BFE_U32 count, input, {shift, 7}
1621   // S_BFM_B64 exec, count, 0
1622   // S_CMP_EQ_U32 count, 64
1623   // S_CMOV_B64 exec, -1
1624   Register InputReg = MI.getOperand(0).getReg();
1625   MachineInstr *FirstMI = &*MBB->begin();
1626   if (InputReg.isVirtual()) {
1627     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1628     assert(DefInstr && DefInstr->isCopy());
1629     if (DefInstr->getParent() == MBB) {
1630       if (DefInstr != FirstMI) {
1631         // If `InputReg` is defined in the current block, its defining
1632         // instruction must also be moved to the beginning of the block.
1633         DefInstr->removeFromParent();
1634         MBB->insert(FirstMI, DefInstr);
1635         if (LIS)
1636           LIS->handleMove(*DefInstr);
1637       } else {
1638         // If the first instruction is the definition, insert after it.
1639         FirstMI = &*std::next(FirstMI->getIterator());
1640       }
1641     }
1642   }
1643 
1644   // Insert instruction sequence at block beginning (before vector operations).
1645   const DebugLoc DL = MI.getDebugLoc();
1646   const unsigned WavefrontSize = ST->getWavefrontSize();
1647   const unsigned Mask = (WavefrontSize << 1) - 1;
1648   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
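  // The S_BFE immediate packs the bitfield offset in its low bits and the
  // field width in bits [22:16]; 0x70000 selects a 7-bit field, wide enough
  // to hold thread counts up to the wavefront size.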
1649   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1650                    .addReg(InputReg)
1651                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1652   auto BfmMI =
1653       BuildMI(*MBB, FirstMI, DL,
1654               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1655           .addReg(CountReg)
1656           .addImm(0);
1657   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1658                    .addReg(CountReg, RegState::Kill)
1659                    .addImm(WavefrontSize);
1660   auto CmovMI =
1661       BuildMI(*MBB, FirstMI, DL,
1662               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1663               Exec)
1664           .addImm(-1);
1665 
1666   if (!LIS) {
1667     MI.eraseFromParent();
1668     return;
1669   }
1670 
1671   LIS->RemoveMachineInstrFromMaps(MI);
1672   MI.eraseFromParent();
1673 
1674   LIS->InsertMachineInstrInMaps(*BfeMI);
1675   LIS->InsertMachineInstrInMaps(*BfmMI);
1676   LIS->InsertMachineInstrInMaps(*CmpMI);
1677   LIS->InsertMachineInstrInMaps(*CmovMI);
1678 
1679   LIS->removeInterval(InputReg);
1680   LIS->createAndComputeVirtRegInterval(InputReg);
1681   LIS->createAndComputeVirtRegInterval(CountReg);
1682 }
1683 
1684 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1685 /// for instructions that depend on EXEC.
1686 MachineBasicBlock::iterator
1687 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1688   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1689 
1690   for (MachineInstr *MI : InitExecInstrs) {
1691     // Try to handle undefined cases gracefully:
1692     // - multiple INIT_EXEC instructions
1693     // - INIT_EXEC instructions not in the entry block
1694     if (MI->getParent() == &Entry)
1695       InsertPt = std::next(MI->getIterator());
1696 
1697     lowerInitExec(*MI);
1698     Changed = true;
1699   }
1700 
1701   return InsertPt;
1702 }
1703 
1704 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1705   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1706                     << " ------------- \n");
1707   LLVM_DEBUG(MF.dump(););
1708 
1709   Instructions.clear();
1710   Blocks.clear();
1711   LiveMaskQueries.clear();
1712   LowerToCopyInstrs.clear();
1713   LowerToMovInstrs.clear();
1714   KillInstrs.clear();
1715   InitExecInstrs.clear();
1716   SetInactiveInstrs.clear();
1717   StateTransition.clear();
1718 
1719   ST = &MF.getSubtarget<GCNSubtarget>();
1720 
1721   TII = ST->getInstrInfo();
1722   TRI = &TII->getRegisterInfo();
1723   MRI = &MF.getRegInfo();
1724   LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1725   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1726   MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1727   auto *PDTWrapper =
1728       getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1729   PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1730 
1731   if (ST->isWave32()) {
1732     AndOpc = AMDGPU::S_AND_B32;
1733     AndTermOpc = AMDGPU::S_AND_B32_term;
1734     AndN2Opc = AMDGPU::S_ANDN2_B32;
1735     XorOpc = AMDGPU::S_XOR_B32;
1736     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1737     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1738     WQMOpc = AMDGPU::S_WQM_B32;
1739     Exec = AMDGPU::EXEC_LO;
1740   } else {
1741     AndOpc = AMDGPU::S_AND_B64;
1742     AndTermOpc = AMDGPU::S_AND_B64_term;
1743     AndN2Opc = AMDGPU::S_ANDN2_B64;
1744     XorOpc = AMDGPU::S_XOR_B64;
1745     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1746     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1747     WQMOpc = AMDGPU::S_WQM_B64;
1748     Exec = AMDGPU::EXEC;
1749   }
1750 
1751   const char GlobalFlags = analyzeFunction(MF);
1752   bool Changed = false;
1753 
1754   LiveMaskReg = Exec;
1755 
1756   MachineBasicBlock &Entry = MF.front();
1757   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1758 
1759   // Store a copy of the original live mask when required
1760   const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1761   const bool HasWaveModes = GlobalFlags & ~StateExact;
1762   const bool HasKills = !KillInstrs.empty();
1763   const bool UsesWQM = GlobalFlags & StateWQM;
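  // A live-mask copy is needed when kills must be evaluated against the
  // original set of live lanes, when WQM regions must later be narrowed back
  // to exact execution, or when live-mask queries coexist with wave-mode
  // switching.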
1764   if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1765     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1766     MachineInstr *MI =
1767         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1768             .addReg(Exec);
1769     LIS->InsertMachineInstrInMaps(*MI);
1770     Changed = true;
1771   }
1772 
1773   // Check if V_SET_INACTIVE was touched by a strict state mode.
1774   // If so, promote to WWM; otherwise lower to COPY.
1775   for (MachineInstr *MI : SetInactiveInstrs) {
1776     if (LowerToCopyInstrs.contains(MI))
1777       continue;
1778     if (Instructions[MI].MarkedStates & StateStrict) {
1779       Instructions[MI].Needs |= StateStrictWWM;
1780       Instructions[MI].Disabled &= ~StateStrictWWM;
1781       Blocks[MI->getParent()].Needs |= StateStrictWWM;
1782     } else {
1783       LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
1784       LowerToCopyInstrs.insert(MI);
1785     }
1786   }
1787 
1788   LLVM_DEBUG(printInfo());
1789 
1790   Changed |= lowerLiveMaskQueries();
1791   Changed |= lowerCopyInstrs();
1792 
1793   if (!HasWaveModes) {
1794     // No wave mode execution
1795     Changed |= lowerKillInstrs(false);
1796   } else if (GlobalFlags == StateWQM) {
1797     // Shader only needs WQM
1798     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1799                   .addReg(Exec);
1800     LIS->InsertMachineInstrInMaps(*MI);
1801     lowerKillInstrs(true);
1802     Changed = true;
1803   } else {
1804     // Wave mode switching requires full lowering pass.
1805     for (auto BII : Blocks)
1806       processBlock(*BII.first, BII.first == &Entry);
1807     // Lowering blocks can split them, so run the lowering as a second pass.
1808     for (auto BII : Blocks)
1809       lowerBlock(*BII.first);
1810     Changed = true;
1811   }
1812 
1813   // Compute live range for live mask
1814   if (LiveMaskReg != Exec)
1815     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1816 
1817   // Physical registers like SCC aren't tracked by default anyway, so just
1818   // removing the ranges we computed is the simplest option for maintaining
1819   // the analysis results.
1820   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1821 
1822   // If we lowered any kills or INIT_EXEC sequences then recompute EXEC
1823   if (!KillInstrs.empty() || !InitExecInstrs.empty())
1824     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1825 
1826   return Changed;
1827 }
1828