1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
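/// The examples above show the wave64 opcodes; on wave32 subtargets the pass
/// uses the corresponding 32-bit forms (e.g. S_WQM_B32, S_AND_SAVEEXEC_B32)
/// and operates on EXEC_LO instead of EXEC.
///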
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
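// Execution states used by the analysis. These are bitmask values, so the
// Needs/Disabled/InNeeds/OutNeeds fields below may hold a combination of
// states.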
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
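// Per-instruction analysis results: the states the instruction requires
// (Needs), the states it must not execute in (Disabled), and the states
// required after it executes (OutNeeds).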
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131 };
132 
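// Per-block analysis results, plus the state selected on entry to the block
// (InitialState) and whether the block contains instructions that need a
// separate lowering step (NeedsLowering).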
133 struct BlockInfo {
134   char Needs = 0;
135   char InNeeds = 0;
136   char OutNeeds = 0;
137   char InitialState = 0;
138   bool NeedsLowering = false;
139 };
140 
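// A worklist entry for the marking phase: either a single instruction or a
// whole basic block whose needs have changed and must be (re)propagated.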
141 struct WorkItem {
142   MachineBasicBlock *MBB = nullptr;
143   MachineInstr *MI = nullptr;
144 
145   WorkItem() = default;
146   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147   WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149 
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152   const SIInstrInfo *TII;
153   const SIRegisterInfo *TRI;
154   const GCNSubtarget *ST;
155   MachineRegisterInfo *MRI;
156   LiveIntervals *LIS;
157   MachineDominatorTree *MDT;
158   MachinePostDominatorTree *PDT;
159 
160   unsigned AndOpc;
161   unsigned AndTermOpc;
162   unsigned AndN2Opc;
163   unsigned XorOpc;
164   unsigned AndSaveExecOpc;
165   unsigned AndSaveExecTermOpc;
166   unsigned WQMOpc;
167   Register Exec;
168   Register LiveMaskReg;
169 
170   DenseMap<const MachineInstr *, InstrInfo> Instructions;
171   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172 
173   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174   DenseMap<const MachineInstr *, char> StateTransition;
175 
176   SmallVector<MachineInstr *, 2> LiveMaskQueries;
177   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179   SmallVector<MachineInstr *, 4> KillInstrs;
180   SmallVector<MachineInstr *, 4> InitExecInstrs;
181 
182   void printInfo();
183 
184   void markInstruction(MachineInstr &MI, char Flag,
185                        std::vector<WorkItem> &Worklist);
186   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
187                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
188   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
189                    std::vector<WorkItem> &Worklist);
190   void markInstructionUses(const MachineInstr &MI, char Flag,
191                            std::vector<WorkItem> &Worklist);
192   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
193   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
194   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
195   char analyzeFunction(MachineFunction &MF);
196 
197   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
198                                       MachineBasicBlock::iterator Before);
199   MachineBasicBlock::iterator
200   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
201                    MachineBasicBlock::iterator Last, bool PreferLast,
202                    bool SaveSCC);
203   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204                Register SaveWQM);
205   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206              Register SavedWQM);
207   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208                     Register SaveOrig, char StrictStateNeeded);
209   void fromStrictMode(MachineBasicBlock &MBB,
210                       MachineBasicBlock::iterator Before, Register SavedOrig,
211                       char NonStrictState, char CurrentStrictState);
212 
213   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
214 
215   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
216                             bool IsWQM);
217   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
218   void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
219                              MachineInstr *Exit);
220 
221   void lowerBlock(MachineBasicBlock &MBB);
222   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
223 
224   void lowerLiveMaskQueries();
225   void lowerCopyInstrs();
226   void lowerKillInstrs(bool IsWQM);
227   void lowerInitExec(MachineInstr &MI);
228   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
229 
230 public:
231   static char ID;
232 
  SIWholeQuadMode() : MachineFunctionPass(ID) {}
235 
236   bool runOnMachineFunction(MachineFunction &MF) override;
237 
238   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
239 
240   void getAnalysisUsage(AnalysisUsage &AU) const override {
241     AU.addRequired<LiveIntervals>();
242     AU.addPreserved<SlotIndexes>();
243     AU.addPreserved<LiveIntervals>();
244     AU.addPreserved<MachineDominatorTree>();
245     AU.addPreserved<MachinePostDominatorTree>();
246     MachineFunctionPass::getAnalysisUsage(AU);
247   }
248 
249   MachineFunctionProperties getClearedProperties() const override {
250     return MachineFunctionProperties().set(
251         MachineFunctionProperties::Property::IsSSA);
252   }
253 };
254 
255 } // end anonymous namespace
256 
257 char SIWholeQuadMode::ID = 0;
258 
259 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
260                       false)
261 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
262 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
263 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
264 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
265                     false)
266 
267 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
268 
269 FunctionPass *llvm::createSIWholeQuadModePass() {
270   return new SIWholeQuadMode;
271 }
272 
273 #ifndef NDEBUG
274 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
275   for (const auto &BII : Blocks) {
276     dbgs() << "\n"
277            << printMBBReference(*BII.first) << ":\n"
278            << "  InNeeds = " << PrintState(BII.second.InNeeds)
279            << ", Needs = " << PrintState(BII.second.Needs)
280            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
281 
282     for (const MachineInstr &MI : *BII.first) {
283       auto III = Instructions.find(&MI);
284       if (III != Instructions.end()) {
285         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
286                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
287       }
288     }
289   }
290 }
291 #endif
292 
293 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
294                                       std::vector<WorkItem> &Worklist) {
295   InstrInfo &II = Instructions[&MI];
296 
297   assert(!(Flag & StateExact) && Flag != 0);
298 
  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM,
  // where ignoring the request for WQM is correct as per the relevant specs.
303   Flag &= ~II.Disabled;
304 
305   // Ignore if the flag is already encompassed by the existing needs, or we
306   // just disabled everything.
307   if ((II.Needs & Flag) == Flag)
308     return;
309 
310   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
311   II.Needs |= Flag;
312   Worklist.push_back(&MI);
313 }
314 
315 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
316 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
317                                Register Reg, unsigned SubReg, char Flag,
318                                std::vector<WorkItem> &Worklist) {
319   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
320 
321   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
322   const VNInfo *Value = UseLRQ.valueIn();
323   if (!Value)
324     return;
325 
326   // Note: this code assumes that lane masks on AMDGPU completely
327   // cover registers.
328   const LaneBitmask UseLanes =
329       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
330              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
331                                 : LaneBitmask::getNone());
332 
333   // Perform a depth-first iteration of the LiveRange graph marking defs.
334   // Stop processing of a given branch when all use lanes have been defined.
335   // The first definition stops processing for a physical register.
336   struct PhiEntry {
337     const VNInfo *Phi;
338     unsigned PredIdx;
339     LaneBitmask DefinedLanes;
340 
341     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
342         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
343   };
344   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
345   SmallVector<PhiEntry, 2> PhiStack;
346   SmallSet<VisitKey, 4> Visited;
347   LaneBitmask DefinedLanes;
348   unsigned NextPredIdx = 0; // Only used for processing phi nodes
349   do {
350     const VNInfo *NextValue = nullptr;
351     const VisitKey Key(Value, DefinedLanes);
352 
353     if (Visited.insert(Key).second) {
      // On the first visit to a phi, start processing its first predecessor
355       NextPredIdx = 0;
356     }
357 
358     if (Value->isPHIDef()) {
359       // Each predecessor node in the phi must be processed as a subgraph
360       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
361       assert(MBB && "Phi-def has no defining MBB");
362 
363       // Find next predecessor to process
364       unsigned Idx = NextPredIdx;
365       auto PI = MBB->pred_begin() + Idx;
366       auto PE = MBB->pred_end();
367       for (; PI != PE && !NextValue; ++PI, ++Idx) {
368         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
369           if (!Visited.count(VisitKey(VN, DefinedLanes)))
370             NextValue = VN;
371         }
372       }
373 
      // If there are more predecessors to process, add the phi to the stack
375       if (PI != PE)
376         PhiStack.emplace_back(Value, Idx, DefinedLanes);
377     } else {
378       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
379       assert(MI && "Def has no defining instruction");
380 
381       if (Reg.isVirtual()) {
382         // Iterate over all operands to find relevant definitions
383         bool HasDef = false;
384         for (const MachineOperand &Op : MI->all_defs()) {
385           if (Op.getReg() != Reg)
386             continue;
387 
388           // Compute lanes defined and overlap with use
389           LaneBitmask OpLanes =
390               Op.isUndef() ? LaneBitmask::getAll()
391                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
392           LaneBitmask Overlap = (UseLanes & OpLanes);
393 
          // Record whether this instruction defined any lanes of the use
395           HasDef |= Overlap.any();
396 
397           // Mark any lanes defined
398           DefinedLanes |= OpLanes;
399         }
400 
401         // Check if all lanes of use have been defined
402         if ((DefinedLanes & UseLanes) != UseLanes) {
403           // Definition not complete; need to process input value
404           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
405           if (const VNInfo *VN = LRQ.valueIn()) {
406             if (!Visited.count(VisitKey(VN, DefinedLanes)))
407               NextValue = VN;
408           }
409         }
410 
411         // Only mark the instruction if it defines some part of the use
412         if (HasDef)
413           markInstruction(*MI, Flag, Worklist);
414       } else {
415         // For physical registers simply mark the defining instruction
416         markInstruction(*MI, Flag, Worklist);
417       }
418     }
419 
420     if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
422       PhiEntry &Entry = PhiStack.back();
423       NextValue = Entry.Phi;
424       NextPredIdx = Entry.PredIdx;
425       DefinedLanes = Entry.DefinedLanes;
426       PhiStack.pop_back();
427     }
428 
429     Value = NextValue;
430   } while (Value);
431 }
432 
433 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
434                                   const MachineOperand &Op, char Flag,
435                                   std::vector<WorkItem> &Worklist) {
436   assert(Op.isReg());
437   Register Reg = Op.getReg();
438 
439   // Ignore some hardware registers
440   switch (Reg) {
441   case AMDGPU::EXEC:
442   case AMDGPU::EXEC_LO:
443     return;
444   default:
445     break;
446   }
447 
448   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
449                     << " for " << MI);
450   if (Reg.isVirtual()) {
451     LiveRange &LR = LIS->getInterval(Reg);
452     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
453   } else {
454     // Handle physical registers that we need to track; this is mostly relevant
455     // for VCC, which can appear as the (implicit) input of a uniform branch,
456     // e.g. when a loop counter is stored in a VGPR.
457     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
458       LiveRange &LR = LIS->getRegUnit(Unit);
459       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
460       if (Value)
461         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
462     }
463   }
464 }
465 
466 /// Mark all instructions defining the uses in \p MI with \p Flag.
467 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
468                                           std::vector<WorkItem> &Worklist) {
469   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
470                     << MI);
471 
472   for (const MachineOperand &Use : MI.all_uses())
473     markOperand(MI, Use, Flag, Worklist);
474 }
475 
476 // Scan instructions to determine which ones require an Exact execmask and
477 // which ones seed WQM requirements.
478 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
479                                        std::vector<WorkItem> &Worklist) {
480   char GlobalFlags = 0;
481   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
482   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
483   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
484   bool HasImplicitDerivatives =
485       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
486 
487   // We need to visit the basic blocks in reverse post-order so that we visit
488   // defs before uses, in particular so that we don't accidentally mark an
489   // instruction as needing e.g. WQM before visiting it and realizing it needs
490   // WQM disabled.
491   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
492   for (MachineBasicBlock *MBB : RPOT) {
493     BlockInfo &BBI = Blocks[MBB];
494 
495     for (MachineInstr &MI : *MBB) {
496       InstrInfo &III = Instructions[&MI];
497       unsigned Opcode = MI.getOpcode();
498       char Flags = 0;
499 
500       if (TII->isWQM(Opcode)) {
501         // If LOD is not supported WQM is not needed.
502         // Only generate implicit WQM if implicit derivatives are required.
503         // This avoids inserting unintended WQM if a shader type without
504         // implicit derivatives uses an image sampling instruction.
505         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
506           // Sampling instructions don't need to produce results for all pixels
507           // in a quad, they just require all inputs of a quad to have been
508           // computed for derivatives.
509           markInstructionUses(MI, StateWQM, Worklist);
510           GlobalFlags |= StateWQM;
511         }
512       } else if (Opcode == AMDGPU::WQM) {
513         // The WQM intrinsic requires its output to have all the helper lanes
514         // correct, so we need it to be in WQM.
515         Flags = StateWQM;
516         LowerToCopyInstrs.push_back(&MI);
517       } else if (Opcode == AMDGPU::SOFT_WQM) {
518         LowerToCopyInstrs.push_back(&MI);
519         SoftWQMInstrs.push_back(&MI);
520       } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
524         markInstructionUses(MI, StateStrictWWM, Worklist);
525         GlobalFlags |= StateStrictWWM;
526         LowerToMovInstrs.push_back(&MI);
527       } else if (Opcode == AMDGPU::STRICT_WQM ||
528                  TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
532         markInstructionUses(MI, StateStrictWQM, Worklist);
533         GlobalFlags |= StateStrictWQM;
534 
535         if (Opcode == AMDGPU::STRICT_WQM) {
536           LowerToMovInstrs.push_back(&MI);
537         } else {
          // Dual source blend export acts as an implicit strict-wqm; its
          // sources need to be shuffled in strict wqm, but the export itself
          // needs to run in exact mode.
541           BBI.Needs |= StateExact;
542           if (!(BBI.InNeeds & StateExact)) {
543             BBI.InNeeds |= StateExact;
544             Worklist.push_back(MBB);
545           }
546           GlobalFlags |= StateExact;
547           III.Disabled = StateWQM | StateStrict;
548         }
549       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
550                  Opcode == AMDGPU::DS_PARAM_LOAD ||
551                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
552                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these as StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
555         InstrInfo &II = Instructions[&MI];
556         II.Needs |= StateStrictWQM;
557         GlobalFlags |= StateStrictWQM;
558       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
559                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
560         III.Disabled = StateStrict;
561         MachineOperand &Inactive = MI.getOperand(2);
562         if (Inactive.isReg()) {
563           if (Inactive.isUndef()) {
564             LowerToCopyInstrs.push_back(&MI);
565           } else {
566             markOperand(MI, Inactive, StateStrictWWM, Worklist);
567           }
568         }
569         SetInactiveInstrs.push_back(&MI);
570       } else if (TII->isDisableWQM(MI)) {
571         BBI.Needs |= StateExact;
572         if (!(BBI.InNeeds & StateExact)) {
573           BBI.InNeeds |= StateExact;
574           Worklist.push_back(MBB);
575         }
576         GlobalFlags |= StateExact;
577         III.Disabled = StateWQM | StateStrict;
578       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
579                  Opcode == AMDGPU::SI_LIVE_MASK) {
580         LiveMaskQueries.push_back(&MI);
581       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
582                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
583                  Opcode == AMDGPU::SI_DEMOTE_I1) {
584         KillInstrs.push_back(&MI);
585         BBI.NeedsLowering = true;
586       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
587                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
588         InitExecInstrs.push_back(&MI);
589       } else if (WQMOutputs) {
590         // The function is in machine SSA form, which means that physical
591         // VGPRs correspond to shader inputs and outputs. Inputs are
592         // only used, outputs are only defined.
593         // FIXME: is this still valid?
594         for (const MachineOperand &MO : MI.defs()) {
595           Register Reg = MO.getReg();
596           if (Reg.isPhysical() &&
597               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
598             Flags = StateWQM;
599             break;
600           }
601         }
602       }
603 
604       if (Flags) {
605         markInstruction(MI, Flags, Worklist);
606         GlobalFlags |= Flags;
607       }
608     }
609   }
610 
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
612   // ever used anywhere in the function. This implements the corresponding
613   // semantics of @llvm.amdgcn.set.inactive.
614   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
615   if (GlobalFlags & StateWQM) {
616     for (MachineInstr *MI : SetInactiveInstrs)
617       markInstruction(*MI, StateWQM, Worklist);
618     for (MachineInstr *MI : SoftWQMInstrs)
619       markInstruction(*MI, StateWQM, Worklist);
620   }
621 
622   return GlobalFlags;
623 }
624 
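// Propagate the updated needs of a single instruction: force WQM on
// terminators and VMEM stores that feed later WQM computations, push the
// requirement up to the containing block, propagate backwards to the
// previous instruction's OutNeeds, and mark the instruction's inputs.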
625 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
626                                            std::vector<WorkItem>& Worklist) {
627   MachineBasicBlock *MBB = MI.getParent();
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
629   BlockInfo &BI = Blocks[MBB];
630 
631   // Control flow-type instructions and stores to temporary memory that are
632   // followed by WQM computations must themselves be in WQM.
633   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
634       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
635     Instructions[&MI].Needs = StateWQM;
636     II.Needs = StateWQM;
637   }
638 
639   // Propagate to block level
640   if (II.Needs & StateWQM) {
641     BI.Needs |= StateWQM;
642     if (!(BI.InNeeds & StateWQM)) {
643       BI.InNeeds |= StateWQM;
644       Worklist.push_back(MBB);
645     }
646   }
647 
648   // Propagate backwards within block
649   if (MachineInstr *PrevMI = MI.getPrevNode()) {
650     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
651     if (!PrevMI->isPHI()) {
652       InstrInfo &PrevII = Instructions[PrevMI];
653       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
654         PrevII.OutNeeds |= InNeeds;
655         Worklist.push_back(PrevMI);
656       }
657     }
658   }
659 
660   // Propagate WQM flag to instruction inputs
661   assert(!(II.Needs & StateExact));
662 
663   if (II.Needs != 0)
664     markInstructionUses(MI, II.Needs, Worklist);
665 
666   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
667   // not require any WQM transitions.
668   if (II.Needs & StateStrictWWM)
669     BI.Needs |= StateStrictWWM;
670   if (II.Needs & StateStrictWQM)
671     BI.Needs |= StateStrictWQM;
672 }
673 
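// Propagate a block's needs to the last instruction in the block, to its
// predecessors (which must provide BI.InNeeds on exit), and to its
// successors (which must accept BI.OutNeeds on entry).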
674 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
675                                      std::vector<WorkItem>& Worklist) {
676   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
677 
678   // Propagate through instructions
679   if (!MBB.empty()) {
680     MachineInstr *LastMI = &*MBB.rbegin();
681     InstrInfo &LastII = Instructions[LastMI];
682     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
683       LastII.OutNeeds |= BI.OutNeeds;
684       Worklist.push_back(LastMI);
685     }
686   }
687 
688   // Predecessor blocks must provide for our WQM/Exact needs.
689   for (MachineBasicBlock *Pred : MBB.predecessors()) {
690     BlockInfo &PredBI = Blocks[Pred];
691     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
692       continue;
693 
694     PredBI.OutNeeds |= BI.InNeeds;
695     PredBI.InNeeds |= BI.InNeeds;
696     Worklist.push_back(Pred);
697   }
698 
699   // All successors must be prepared to accept the same set of WQM/Exact data.
700   for (MachineBasicBlock *Succ : MBB.successors()) {
701     BlockInfo &SuccBI = Blocks[Succ];
702     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
703       continue;
704 
705     SuccBI.InNeeds |= BI.OutNeeds;
706     Worklist.push_back(Succ);
707   }
708 }
709 
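// Run the marking phase to a fixed point: seed the worklist by scanning all
// instructions, then propagate needs between instructions and blocks until
// nothing changes.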
710 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
711   std::vector<WorkItem> Worklist;
712   char GlobalFlags = scanInstructions(MF, Worklist);
713 
714   while (!Worklist.empty()) {
715     WorkItem WI = Worklist.back();
716     Worklist.pop_back();
717 
718     if (WI.MI)
719       propagateInstruction(*WI.MI, Worklist);
720     else
721       propagateBlock(*WI.MBB, Worklist);
722   }
723 
724   return GlobalFlags;
725 }
726 
727 MachineBasicBlock::iterator
728 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
729                          MachineBasicBlock::iterator Before) {
730   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
731 
732   MachineInstr *Save =
733       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
734           .addReg(AMDGPU::SCC);
735   MachineInstr *Restore =
736       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
737           .addReg(SaveReg);
738 
739   LIS->InsertMachineInstrInMaps(*Save);
740   LIS->InsertMachineInstrInMaps(*Restore);
741   LIS->createAndComputeVirtRegInterval(SaveReg);
742 
743   return Restore;
744 }
745 
746 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
747                                                MachineInstr *TermMI) {
748   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
749                     << *TermMI << "\n");
750 
751   MachineBasicBlock *SplitBB =
752       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
753 
754   // Convert last instruction in block to a terminator.
755   // Note: this only covers the expected patterns
756   unsigned NewOpcode = 0;
757   switch (TermMI->getOpcode()) {
758   case AMDGPU::S_AND_B32:
759     NewOpcode = AMDGPU::S_AND_B32_term;
760     break;
761   case AMDGPU::S_AND_B64:
762     NewOpcode = AMDGPU::S_AND_B64_term;
763     break;
764   case AMDGPU::S_MOV_B32:
765     NewOpcode = AMDGPU::S_MOV_B32_term;
766     break;
767   case AMDGPU::S_MOV_B64:
768     NewOpcode = AMDGPU::S_MOV_B64_term;
769     break;
770   default:
771     break;
772   }
773   if (NewOpcode)
774     TermMI->setDesc(TII->get(NewOpcode));
775 
776   if (SplitBB != BB) {
777     // Update dominator trees
778     using DomTreeT = DomTreeBase<MachineBasicBlock>;
779     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
780     for (MachineBasicBlock *Succ : SplitBB->successors()) {
781       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
782       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
783     }
784     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
785     if (MDT)
786       MDT->getBase().applyUpdates(DTUpdates);
787     if (PDT)
788       PDT->getBase().applyUpdates(DTUpdates);
789 
790     // Link blocks
791     MachineInstr *MI =
792         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
793             .addMBB(SplitBB);
794     LIS->InsertMachineInstrInMaps(*MI);
795   }
796 
797   return SplitBB;
798 }
799 
800 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
801                                             MachineInstr &MI) {
802   const DebugLoc &DL = MI.getDebugLoc();
803   unsigned Opcode = 0;
804 
805   assert(MI.getOperand(0).isReg());
806 
807   // Comparison is for live lanes; however here we compute the inverse
808   // (killed lanes).  This is because VCMP will always generate 0 bits
809   // for inactive lanes so a mask of live lanes would not be correct
810   // inside control flow.
811   // Invert the comparison by swapping the operands and adjusting
812   // the comparison codes.
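  // For example, for SETOLT the live lanes satisfy Op0 < Op1, so the killed
  // lanes satisfy !(Op0 < Op1); this is computed below as V_CMP_NGT_F32 with
  // the operands swapped (i.e. comparing Op1 against Op0).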
813 
814   switch (MI.getOperand(2).getImm()) {
815   case ISD::SETUEQ:
816     Opcode = AMDGPU::V_CMP_LG_F32_e64;
817     break;
818   case ISD::SETUGT:
819     Opcode = AMDGPU::V_CMP_GE_F32_e64;
820     break;
821   case ISD::SETUGE:
822     Opcode = AMDGPU::V_CMP_GT_F32_e64;
823     break;
824   case ISD::SETULT:
825     Opcode = AMDGPU::V_CMP_LE_F32_e64;
826     break;
827   case ISD::SETULE:
828     Opcode = AMDGPU::V_CMP_LT_F32_e64;
829     break;
830   case ISD::SETUNE:
831     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
832     break;
833   case ISD::SETO:
834     Opcode = AMDGPU::V_CMP_O_F32_e64;
835     break;
836   case ISD::SETUO:
837     Opcode = AMDGPU::V_CMP_U_F32_e64;
838     break;
839   case ISD::SETOEQ:
840   case ISD::SETEQ:
841     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
842     break;
843   case ISD::SETOGT:
844   case ISD::SETGT:
845     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
846     break;
847   case ISD::SETOGE:
848   case ISD::SETGE:
849     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
850     break;
851   case ISD::SETOLT:
852   case ISD::SETLT:
853     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
854     break;
855   case ISD::SETOLE:
856   case ISD::SETLE:
857     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
858     break;
859   case ISD::SETONE:
860   case ISD::SETNE:
861     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
862     break;
863   default:
864     llvm_unreachable("invalid ISD:SET cond code");
865   }
866 
867   // Pick opcode based on comparison type.
868   MachineInstr *VcmpMI;
869   const MachineOperand &Op0 = MI.getOperand(0);
870   const MachineOperand &Op1 = MI.getOperand(1);
871 
872   // VCC represents lanes killed.
873   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
874 
875   if (TRI->isVGPR(*MRI, Op0.getReg())) {
876     Opcode = AMDGPU::getVOPe32(Opcode);
877     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
878   } else {
879     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
880                  .addReg(VCC, RegState::Define)
881                  .addImm(0) // src0 modifiers
882                  .add(Op1)
883                  .addImm(0) // src1 modifiers
884                  .add(Op0)
885                  .addImm(0); // omod
886   }
887 
888   MachineInstr *MaskUpdateMI =
889       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
890           .addReg(LiveMaskReg)
891           .addReg(VCC);
892 
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
895   MachineInstr *EarlyTermMI =
896       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
897 
898   MachineInstr *ExecMaskMI =
899       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
900 
901   assert(MBB.succ_size() == 1);
902   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
903                               .addMBB(*MBB.succ_begin());
904 
905   // Update live intervals
906   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
907   MBB.remove(&MI);
908 
909   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
910   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
911   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
912   LIS->InsertMachineInstrInMaps(*NewTerm);
913 
914   return NewTerm;
915 }
916 
917 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
918                                            MachineInstr &MI, bool IsWQM) {
919   const DebugLoc &DL = MI.getDebugLoc();
920   MachineInstr *MaskUpdateMI = nullptr;
921 
922   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
923   const MachineOperand &Op = MI.getOperand(0);
924   int64_t KillVal = MI.getOperand(1).getImm();
925   MachineInstr *ComputeKilledMaskMI = nullptr;
926   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
927   Register TmpReg;
928 
929   // Is this a static or dynamic kill?
930   if (Op.isImm()) {
931     if (Op.getImm() == KillVal) {
932       // Static: all active lanes are killed
933       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
934                          .addReg(LiveMaskReg)
935                          .addReg(Exec);
936     } else {
937       // Static: kill does nothing
938       MachineInstr *NewTerm = nullptr;
939       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
940         LIS->RemoveMachineInstrFromMaps(MI);
941       } else {
942         assert(MBB.succ_size() == 1);
943         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
944                       .addMBB(*MBB.succ_begin());
945         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
946       }
947       MBB.remove(&MI);
948       return NewTerm;
949     }
950   } else {
951     if (!KillVal) {
952       // Op represents live lanes after kill,
953       // so exec mask needs to be factored in.
954       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
955       ComputeKilledMaskMI =
956           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
957       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
958                          .addReg(LiveMaskReg)
959                          .addReg(TmpReg);
960     } else {
961       // Op represents lanes to kill
962       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
963                          .addReg(LiveMaskReg)
964                          .add(Op);
965     }
966   }
967 
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
970   MachineInstr *EarlyTermMI =
971       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
972 
  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
975   MachineInstr *NewTerm;
976   MachineInstr *WQMMaskMI = nullptr;
977   Register LiveMaskWQM;
978   if (IsDemote) {
979     // Demote - deactivate quads with only helper lanes
980     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
981     WQMMaskMI =
982         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
983     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
984                   .addReg(Exec)
985                   .addReg(LiveMaskWQM);
986   } else {
987     // Kill - deactivate lanes no longer in live mask
988     if (Op.isImm()) {
989       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
990       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
991     } else if (!IsWQM) {
992       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
993                     .addReg(Exec)
994                     .addReg(LiveMaskReg);
995     } else {
996       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
997       NewTerm =
998           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
999     }
1000   }
1001 
1002   // Update live intervals
1003   LIS->RemoveMachineInstrFromMaps(MI);
1004   MBB.remove(&MI);
1005   assert(EarlyTermMI);
1006   assert(MaskUpdateMI);
1007   assert(NewTerm);
1008   if (ComputeKilledMaskMI)
1009     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1010   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1011   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1012   if (WQMMaskMI)
1013     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1014   LIS->InsertMachineInstrInMaps(*NewTerm);
1015 
1016   if (CndReg) {
1017     LIS->removeInterval(CndReg);
1018     LIS->createAndComputeVirtRegInterval(CndReg);
1019   }
1020   if (TmpReg)
1021     LIS->createAndComputeVirtRegInterval(TmpReg);
1022   if (LiveMaskWQM)
1023     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1024 
1025   return NewTerm;
1026 }
1027 
1028 // Convert a strict mode transition to a pseudo transition.
1029 // This still pre-allocates registers to prevent clobbering,
1030 // but avoids any EXEC mask changes.
1031 void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1032                                             MachineInstr *Entry,
1033                                             MachineInstr *Exit) {
1034   assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1035   assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1036 
1037   Register SaveOrig = Entry->getOperand(0).getReg();
1038 
1039   MachineInstr *NewEntry =
1040     BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1041   MachineInstr *NewExit =
1042     BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1043 
1044   LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1045   Exit->eraseFromParent();
1046 
1047   LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1048   Entry->eraseFromParent();
1049 
1050   LIS->removeInterval(SaveOrig);
1051 }
1052 
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1056 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1057   auto BII = Blocks.find(&MBB);
1058   if (BII == Blocks.end())
1059     return;
1060 
1061   const BlockInfo &BI = BII->second;
1062   if (!BI.NeedsLowering)
1063     return;
1064 
1065   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1066 
1067   SmallVector<MachineInstr *, 4> SplitPoints;
1068   char State = BI.InitialState;
1069   MachineInstr *StrictEntry = nullptr;
1070 
1071   for (MachineInstr &MI : llvm::make_early_inc_range(
1072            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1073     char PreviousState = State;
1074 
1075     if (StateTransition.count(&MI))
1076       State = StateTransition[&MI];
1077 
1078     MachineInstr *SplitPoint = nullptr;
1079     switch (MI.getOpcode()) {
1080     case AMDGPU::SI_DEMOTE_I1:
1081     case AMDGPU::SI_KILL_I1_TERMINATOR:
1082       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1083       break;
1084     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1085       SplitPoint = lowerKillF32(MBB, MI);
1086       break;
1087     case AMDGPU::ENTER_STRICT_WQM:
1088       StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1089       break;
1090     case AMDGPU::EXIT_STRICT_WQM:
1091       if (State == StateWQM && StrictEntry) {
1092         // Transition WQM -> StrictWQM -> WQM detected.
1093         lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1094       }
1095       StrictEntry = nullptr;
1096       break;
1097     case AMDGPU::ENTER_STRICT_WWM:
1098     case AMDGPU::EXIT_STRICT_WWM:
1099       StrictEntry = nullptr;
1100       break;
1101     default:
1102       break;
1103     }
1104     if (SplitPoint)
1105       SplitPoints.push_back(SplitPoint);
1106   }
1107 
1108   // Perform splitting after instruction scan to simplify iteration.
1109   if (!SplitPoints.empty()) {
1110     MachineBasicBlock *BB = &MBB;
1111     for (MachineInstr *MI : SplitPoints) {
1112       BB = splitBlock(BB, MI);
1113     }
1114   }
1115 }
1116 
1117 // Return an iterator in the (inclusive) range [First, Last] at which
1118 // instructions can be safely inserted, keeping in mind that some of the
1119 // instructions we want to add necessarily clobber SCC.
1120 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1121     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1122     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1123   if (!SaveSCC)
1124     return PreferLast ? Last : First;
1125 
1126   LiveRange &LR =
1127       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1128   auto MBBE = MBB.end();
1129   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1130                                      : LIS->getMBBEndIdx(&MBB);
1131   SlotIndex LastIdx =
1132       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1133   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1134   const LiveRange::Segment *S;
1135 
1136   for (;;) {
1137     S = LR.getSegmentContaining(Idx);
1138     if (!S)
1139       break;
1140 
1141     if (PreferLast) {
1142       SlotIndex Next = S->start.getBaseIndex();
1143       if (Next < FirstIdx)
1144         break;
1145       Idx = Next;
1146     } else {
1147       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1148       assert(EndMI && "Segment does not end on valid instruction");
1149       auto NextI = std::next(EndMI->getIterator());
1150       if (NextI == MBB.end())
1151         break;
1152       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1153       if (Next > LastIdx)
1154         break;
1155       Idx = Next;
1156     }
1157   }
1158 
1159   MachineBasicBlock::iterator MBBI;
1160 
1161   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1162     MBBI = MI;
1163   else {
1164     assert(Idx == LIS->getMBBEndIdx(&MBB));
1165     MBBI = MBB.end();
1166   }
1167 
1168   // Move insertion point past any operations modifying EXEC.
1169   // This assumes that the value of SCC defined by any of these operations
1170   // does not need to be preserved.
1171   while (MBBI != Last) {
1172     bool IsExecDef = false;
1173     for (const MachineOperand &MO : MBBI->all_defs()) {
1174       IsExecDef |=
1175           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1176     }
1177     if (!IsExecDef)
1178       break;
1179     MBBI++;
1180     S = nullptr;
1181   }
1182 
1183   if (S)
1184     MBBI = saveSCC(MBB, MBBI);
1185 
1186   return MBBI;
1187 }
1188 
1189 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1190                               MachineBasicBlock::iterator Before,
1191                               Register SaveWQM) {
1192   bool IsTerminator = Before == MBB.end();
1193   if (!IsTerminator) {
1194     auto FirstTerm = MBB.getFirstTerminator();
1195     if (FirstTerm != MBB.end()) {
1196       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1197       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1198       IsTerminator = BeforeIdx > FirstTermIdx;
1199     }
1200   }
1201 
1202   MachineInstr *MI;
1203 
1204   if (SaveWQM) {
1205     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1206     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1207              .addReg(LiveMaskReg);
1208   } else {
1209     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1210     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1211              .addReg(Exec)
1212              .addReg(LiveMaskReg);
1213   }
1214 
1215   LIS->InsertMachineInstrInMaps(*MI);
1216   StateTransition[MI] = StateExact;
1217 }
1218 
1219 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1220                             MachineBasicBlock::iterator Before,
1221                             Register SavedWQM) {
1222   MachineInstr *MI;
1223 
1224   if (SavedWQM) {
1225     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1226              .addReg(SavedWQM);
1227   } else {
1228     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1229   }
1230 
1231   LIS->InsertMachineInstrInMaps(*MI);
1232   StateTransition[MI] = StateWQM;
1233 }
1234 
1235 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1236                                    MachineBasicBlock::iterator Before,
1237                                    Register SaveOrig, char StrictStateNeeded) {
1238   MachineInstr *MI;
1239   assert(SaveOrig);
1240   assert(StrictStateNeeded == StateStrictWWM ||
1241          StrictStateNeeded == StateStrictWQM);
1242 
1243   if (StrictStateNeeded == StateStrictWWM) {
1244     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1245                  SaveOrig)
1246              .addImm(-1);
1247   } else {
1248     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1249                  SaveOrig)
1250              .addImm(-1);
1251   }
1252   LIS->InsertMachineInstrInMaps(*MI);
1253   StateTransition[MI] = StrictStateNeeded;
1254 
  // Mark the block as needing lowering so it will be checked for unnecessary
  // transitions.
1256   auto BII = Blocks.find(&MBB);
1257   if (BII != Blocks.end())
1258     BII->second.NeedsLowering = true;
1259 }
1260 
1261 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1262                                      MachineBasicBlock::iterator Before,
1263                                      Register SavedOrig, char NonStrictState,
1264                                      char CurrentStrictState) {
1265   MachineInstr *MI;
1266 
1267   assert(SavedOrig);
1268   assert(CurrentStrictState == StateStrictWWM ||
1269          CurrentStrictState == StateStrictWQM);
1270 
1271   if (CurrentStrictState == StateStrictWWM) {
1272     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1273                  Exec)
1274              .addReg(SavedOrig);
1275   } else {
1276     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1277                  Exec)
1278              .addReg(SavedOrig);
1279   }
1280   LIS->InsertMachineInstrInMaps(*MI);
1281   StateTransition[MI] = NonStrictState;
1282 }
1283 
1284 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1285   auto BII = Blocks.find(&MBB);
1286   if (BII == Blocks.end())
1287     return;
1288 
1289   BlockInfo &BI = BII->second;
1290 
1291   // This is a non-entry block that is WQM throughout, so no need to do
1292   // anything.
1293   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1294     BI.InitialState = StateWQM;
1295     return;
1296   }
1297 
1298   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1299                     << ":\n");
1300 
1301   Register SavedWQMReg;
1302   Register SavedNonStrictReg;
1303   bool WQMFromExec = IsEntry;
1304   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1305   char NonStrictState = 0;
1306   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1307 
1308   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1309   if (IsEntry) {
1310     // Skip the instruction that saves LiveMask
1311     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1312         II->getOperand(1).getReg() == TRI->getExec())
1313       ++II;
1314   }
1315 
1316   // This stores the first instruction where it's safe to switch from WQM to
1317   // Exact or vice versa.
1318   MachineBasicBlock::iterator FirstWQM = IE;
1319 
1320   // This stores the first instruction where it's safe to switch from Strict
1321   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1322   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1323   // be safe to switch to/from WQM as well.
1324   MachineBasicBlock::iterator FirstStrict = IE;
1325 
  // Record the initial state in the block information.
1327   BI.InitialState = State;
1328 
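  // Walk the block forward. For each instruction, compute the set of states
  // it may execute in (Needs); whenever the current state is not in that set,
  // insert a transition somewhere in the preceding range where it is safe to
  // do so (see prepareInsertion).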
1329   for (;;) {
1330     MachineBasicBlock::iterator Next = II;
1331     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1332     char OutNeeds = 0;
1333 
1334     if (FirstWQM == IE)
1335       FirstWQM = II;
1336 
1337     if (FirstStrict == IE)
1338       FirstStrict = II;
1339 
1340     // First, figure out the allowed states (Needs) based on the propagated
1341     // flags.
1342     if (II != IE) {
1343       MachineInstr &MI = *II;
1344 
1345       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1346         auto III = Instructions.find(&MI);
1347         if (III != Instructions.end()) {
1348           if (III->second.Needs & StateStrictWWM)
1349             Needs = StateStrictWWM;
1350           else if (III->second.Needs & StateStrictWQM)
1351             Needs = StateStrictWQM;
1352           else if (III->second.Needs & StateWQM)
1353             Needs = StateWQM;
1354           else
1355             Needs &= ~III->second.Disabled;
1356           OutNeeds = III->second.OutNeeds;
1357         }
1358       } else {
1359         // If the instruction doesn't actually need a correct EXEC, then we can
1360         // safely leave Strict mode enabled.
1361         Needs = StateExact | StateWQM | StateStrict;
1362       }
1363 
1364       // Exact mode exit can occur in terminators, but must be before branches.
1365       if (MI.isBranch() && OutNeeds == StateExact)
1366         Needs = StateExact;
1367 
1368       ++Next;
1369     } else {
1370       // End of basic block
1371       if (BI.OutNeeds & StateWQM)
1372         Needs = StateWQM;
1373       else if (BI.OutNeeds == StateExact)
1374         Needs = StateExact;
1375       else
1376         Needs = StateWQM | StateExact;
1377     }
1378 
1379     // Now, transition if necessary.
1380     if (!(Needs & State)) {
1381       MachineBasicBlock::iterator First;
1382       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1383           State == StateStrictWQM || Needs == StateStrictWQM) {
1384         // We must switch to or from Strict mode.
1385         First = FirstStrict;
1386       } else {
1387         // We only need to switch to/from WQM, so we can use FirstWQM.
1388         First = FirstWQM;
1389       }
1390 
1391       // Whether we need to save SCC depends on start and end states.
1392       bool SaveSCC = false;
1393       switch (State) {
1394       case StateExact:
1395       case StateStrictWWM:
1396       case StateStrictWQM:
1397         // Exact/Strict -> Strict: save SCC
1398         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1399         // Exact/Strict -> Exact: no save
1400         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1401         break;
1402       case StateWQM:
1403         // WQM -> Exact/Strict: save SCC
1404         SaveSCC = !(Needs & StateWQM);
1405         break;
1406       default:
1407         llvm_unreachable("Unknown state");
1408         break;
1409       }
1410       MachineBasicBlock::iterator Before =
1411           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1412 
1413       if (State & StateStrict) {
1414         assert(State == StateStrictWWM || State == StateStrictWQM);
1415         assert(SavedNonStrictReg);
1416         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1417 
1418         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1419         SavedNonStrictReg = 0;
1420         State = NonStrictState;
1421       }
1422 
1423       if (Needs & StateStrict) {
1424         NonStrictState = State;
1425         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1426         assert(!SavedNonStrictReg);
1427         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1428 
1429         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1430         State = Needs;
1431 
1432       } else {
1433         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1434           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1435             assert(!SavedWQMReg);
1436             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1437           }
1438 
1439           toExact(MBB, Before, SavedWQMReg);
1440           State = StateExact;
1441         } else if (State == StateExact && (Needs & StateWQM) &&
1442                    !(Needs & StateExact)) {
1443           assert(WQMFromExec == (SavedWQMReg == 0));
1444 
1445           toWQM(MBB, Before, SavedWQMReg);
1446 
1447           if (SavedWQMReg) {
1448             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1449             SavedWQMReg = 0;
1450           }
1451           State = StateWQM;
1452         } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case there is nothing more to do.
1456           assert(Needs & State);
1457         }
1458       }
1459     }
1460 
1461     if (Needs != (StateExact | StateWQM | StateStrict)) {
1462       if (Needs != (StateExact | StateWQM))
1463         FirstWQM = IE;
1464       FirstStrict = IE;
1465     }
1466 
1467     if (II == IE)
1468       break;
1469 
1470     II = Next;
1471   }
1472   assert(!SavedWQMReg);
1473   assert(!SavedNonStrictReg);
1474 }
1475 
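// Replace live mask queries (SI_PS_LIVE / SI_LIVE_MASK) with copies of the
// computed live mask register.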
1476 void SIWholeQuadMode::lowerLiveMaskQueries() {
1477   for (MachineInstr *MI : LiveMaskQueries) {
1478     const DebugLoc &DL = MI->getDebugLoc();
1479     Register Dest = MI->getOperand(0).getReg();
1480 
1481     MachineInstr *Copy =
1482         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1483             .addReg(LiveMaskReg);
1484 
1485     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1486     MI->eraseFromParent();
1487   }
1488 }
1489 
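/// Rewrite the pseudo copies recorded during analysis into real moves or
/// plain COPYs; the exec-mode requirements they marked have already been
/// captured by the analysis.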
1490 void SIWholeQuadMode::lowerCopyInstrs() {
1491   for (MachineInstr *MI : LowerToMovInstrs) {
1492     assert(MI->getNumExplicitOperands() == 2);
1493 
1494     const Register Reg = MI->getOperand(0).getReg();
1495 
1496     const TargetRegisterClass *regClass =
1497         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1498     if (TRI->isVGPRClass(regClass)) {
1499       const unsigned MovOp = TII->getMovOpcode(regClass);
1500       MI->setDesc(TII->get(MovOp));
1501 
1502       // Check that it already implicitly depends on exec (like all VALU movs
1503       // should do).
1504       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1505         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1506       }));
1507     } else {
1508       // Remove early-clobber and exec dependency from simple SGPR copies.
1509       // This allows some to be eliminated during/post RA.
1510       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1511       if (MI->getOperand(0).isEarlyClobber()) {
1512         LIS->removeInterval(Reg);
1513         MI->getOperand(0).setIsEarlyClobber(false);
1514         LIS->createAndComputeVirtRegInterval(Reg);
1515       }
1516       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1517       while (Index >= 0) {
1518         MI->removeOperand(Index);
1519         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1520       }
1521       MI->setDesc(TII->get(AMDGPU::COPY));
1522       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1523     }
1524   }
1525   for (MachineInstr *MI : LowerToCopyInstrs) {
1526     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1527         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1528       assert(MI->getNumExplicitOperands() == 3);
1529       // The only reason we should be here is that V_SET_INACTIVE has an
1530       // undef input, so it is being replaced by a simple copy. The second
1531       // source operand should also be undef and is removed here.
1532       assert(MI->getOperand(2).isUndef());
1533       MI->removeOperand(2);
1534       MI->untieRegOperand(1);
1535     } else {
1536       assert(MI->getNumExplicitOperands() == 2);
1537     }
1538 
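    // A register source can be forwarded with a plain COPY; an immediate
    // source needs a real move of the destination's register class.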
1539     unsigned CopyOp = MI->getOperand(1).isReg()
1540                           ? (unsigned)AMDGPU::COPY
1541                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1542                                 *MRI, MI->getOperand(0)));
1543     MI->setDesc(TII->get(CopyOp));
1544   }
1545 }
1546 
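/// Lower the deferred kill and demote pseudos. Each lowering may end the
/// block with new terminators, so the block is split at the returned point.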
1547 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1548   for (MachineInstr *MI : KillInstrs) {
1549     MachineBasicBlock *MBB = MI->getParent();
1550     MachineInstr *SplitPoint = nullptr;
1551     switch (MI->getOpcode()) {
1552     case AMDGPU::SI_DEMOTE_I1:
1553     case AMDGPU::SI_KILL_I1_TERMINATOR:
1554       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1555       break;
1556     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1557       SplitPoint = lowerKillF32(*MBB, *MI);
1558       break;
1559     }
1560     if (SplitPoint)
1561       splitBlock(MBB, SplitPoint);
1562   }
1563 }
1564 
1565 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1566   MachineBasicBlock *MBB = MI.getParent();
1567   bool IsWave32 = ST->isWave32();
1568 
1569   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1570     // This should be before all vector instructions.
1571     MachineInstr *InitMI =
1572         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1573                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1574                 Exec)
1575             .addImm(MI.getOperand(0).getImm());
1576     if (LIS) {
1577       LIS->RemoveMachineInstrFromMaps(MI);
1578       LIS->InsertMachineInstrInMaps(*InitMI);
1579     }
1580     MI.eraseFromParent();
1581     return;
1582   }
1583 
1584   // Extract the thread count from an SGPR input and set EXEC accordingly.
1585   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1586   //
1587   // S_BFE_U32 count, input, {shift, 7}
1588   // S_BFM_B64 exec, count, 0
1589   // S_CMP_EQ_U32 count, 64
1590   // S_CMOV_B64 exec, -1
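  // (On wave32 the *_B32 forms of BFM and CMOV are used and the compare is
  // against 32.)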
1591   Register InputReg = MI.getOperand(0).getReg();
1592   MachineInstr *FirstMI = &*MBB->begin();
1593   if (InputReg.isVirtual()) {
1594     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1595     assert(DefInstr && DefInstr->isCopy());
1596     if (DefInstr->getParent() == MBB) {
1597       if (DefInstr != FirstMI) {
1598         // If `InputReg` is defined in the current block, we also need to
1599         // move its defining instruction to the beginning of the block.
1600         DefInstr->removeFromParent();
1601         MBB->insert(FirstMI, DefInstr);
1602         if (LIS)
1603           LIS->handleMove(*DefInstr);
1604       } else {
1605         // If first instruction is definition then move pointer after it.
1606         FirstMI = &*std::next(FirstMI->getIterator());
1607       }
1608     }
1609   }
1610 
1611   // Insert instruction sequence at block beginning (before vector operations).
1612   const DebugLoc DL = MI.getDebugLoc();
1613   const unsigned WavefrontSize = ST->getWavefrontSize();
1614   const unsigned Mask = (WavefrontSize << 1) - 1;
1615   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
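  // The BFE immediate packs the field offset in its low bits and the field
  // width in its upper half; 0x70000 (7 << 16) selects a 7-bit count,
  // matching the {shift, 7} operands sketched above.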
1616   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1617                    .addReg(InputReg)
1618                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1619   auto BfmMI =
1620       BuildMI(*MBB, FirstMI, DL,
1621               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1622           .addReg(CountReg)
1623           .addImm(0);
1624   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1625                    .addReg(CountReg, RegState::Kill)
1626                    .addImm(WavefrontSize);
1627   auto CmovMI =
1628       BuildMI(*MBB, FirstMI, DL,
1629               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1630               Exec)
1631           .addImm(-1);
1632 
1633   if (!LIS) {
1634     MI.eraseFromParent();
1635     return;
1636   }
1637 
1638   LIS->RemoveMachineInstrFromMaps(MI);
1639   MI.eraseFromParent();
1640 
1641   LIS->InsertMachineInstrInMaps(*BfeMI);
1642   LIS->InsertMachineInstrInMaps(*BfmMI);
1643   LIS->InsertMachineInstrInMaps(*CmpMI);
1644   LIS->InsertMachineInstrInMaps(*CmovMI);
1645 
1646   LIS->removeInterval(InputReg);
1647   LIS->createAndComputeVirtRegInterval(InputReg);
1648   LIS->createAndComputeVirtRegInterval(CountReg);
1649 }
1650 
1651 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1652 /// for instructions that depend on EXEC.
1653 MachineBasicBlock::iterator
1654 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
1655   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1656 
1657   for (MachineInstr *MI : InitExecInstrs) {
1658     // Try to handle undefined cases gracefully:
1659     // - multiple INIT_EXEC instructions
1660     // - INIT_EXEC instructions not in the entry block
1661     if (MI->getParent() == &Entry)
1662       InsertPt = std::next(MI->getIterator());
1663 
1664     lowerInitExec(*MI);
1665   }
1666 
1667   return InsertPt;
1668 }
1669 
1670 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1671   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1672                     << " ------------- \n");
1673   LLVM_DEBUG(MF.dump());
1674 
1675   Instructions.clear();
1676   Blocks.clear();
1677   LiveMaskQueries.clear();
1678   LowerToCopyInstrs.clear();
1679   LowerToMovInstrs.clear();
1680   KillInstrs.clear();
1681   InitExecInstrs.clear();
1682   StateTransition.clear();
1683 
1684   ST = &MF.getSubtarget<GCNSubtarget>();
1685 
1686   TII = ST->getInstrInfo();
1687   TRI = &TII->getRegisterInfo();
1688   MRI = &MF.getRegInfo();
1689   LIS = &getAnalysis<LiveIntervals>();
1690   MDT = getAnalysisIfAvailable<MachineDominatorTree>();
1691   PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();
1692 
1693   if (ST->isWave32()) {
1694     AndOpc = AMDGPU::S_AND_B32;
1695     AndTermOpc = AMDGPU::S_AND_B32_term;
1696     AndN2Opc = AMDGPU::S_ANDN2_B32;
1697     XorOpc = AMDGPU::S_XOR_B32;
1698     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1699     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1700     WQMOpc = AMDGPU::S_WQM_B32;
1701     Exec = AMDGPU::EXEC_LO;
1702   } else {
1703     AndOpc = AMDGPU::S_AND_B64;
1704     AndTermOpc = AMDGPU::S_AND_B64_term;
1705     AndN2Opc = AMDGPU::S_ANDN2_B64;
1706     XorOpc = AMDGPU::S_XOR_B64;
1707     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1708     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1709     WQMOpc = AMDGPU::S_WQM_B64;
1710     Exec = AMDGPU::EXEC;
1711   }
1712 
1713   const char GlobalFlags = analyzeFunction(MF);
1714   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1715 
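  // By default the live mask is just EXEC; a dedicated copy is only created
  // below when kills, live mask queries, or WQM require the entry mask.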
1716   LiveMaskReg = Exec;
1717 
1718   MachineBasicBlock &Entry = MF.front();
1719   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
1720 
1721   // The shader is simple and needs no state changes or complex lowering.
1722   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1723       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1724     lowerLiveMaskQueries();
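    // Report whether anything was rewritten (INIT_EXEC or live mask queries).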
1725     return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
1726   }
1727 
1728   // Store a copy of the original live mask when required
1729   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1730     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1731     MachineInstr *MI =
1732         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1733             .addReg(Exec);
1734     LIS->InsertMachineInstrInMaps(*MI);
1735   }
1736 
1737   LLVM_DEBUG(printInfo());
1738 
1739   lowerLiveMaskQueries();
1740   lowerCopyInstrs();
1741 
1742   // Shader only needs WQM
1743   if (GlobalFlags == StateWQM) {
1744     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1745                   .addReg(Exec);
1746     LIS->InsertMachineInstrInMaps(*MI);
1747     lowerKillInstrs(true);
1748   } else {
1749     for (auto BII : Blocks)
1750       processBlock(*BII.first, BII.first == &Entry);
1751     // Lowering blocks causes block splitting, so perform it as a second pass.
1752     for (auto BII : Blocks)
1753       lowerBlock(*BII.first);
1754   }
1755 
1756   // Compute live range for live mask
1757   if (LiveMaskReg != Exec)
1758     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1759 
1760   // Physical registers like SCC aren't tracked by default anyway, so just
1761   // removing the ranges we computed is the simplest option for maintaining
1762   // the analysis results.
1763   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1764 
1765   // If we performed any kills then remove the cached ranges for EXEC as well
1766   if (!KillInstrs.empty())
1767     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1768 
1769   return true;
1770 }
1771