1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow: specifically, an inactive lane enabled by strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely, in
16 /// non-strict WQM, inactive lanes may be affected by control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function, and surrounds blocks of Exact
28 /// instructions with
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
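// Execution-mask states tracked per instruction and per block. These are
// bitmask flags, so several states may be combined when more than one mode is
// acceptable (e.g. StateExact | StateWQM).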
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
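// Per-instruction analysis results: Needs holds the states the instruction
// must execute in, Disabled the states it must not be placed in, OutNeeds the
// states required after it, and MarkedStates every state that was requested
// for it (including disabled ones).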
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131   char MarkedStates = 0;
132 };
133 
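// Per-block analysis results: Needs/InNeeds/OutNeeds are the states required
// somewhere in, on entry to, and on exit from the block; InitialState is the
// state selected for the block start by processBlock; NeedsLowering marks
// blocks containing kills or V_SET_INACTIVE that lowerBlock must revisit.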
134 struct BlockInfo {
135   char Needs = 0;
136   char InNeeds = 0;
137   char OutNeeds = 0;
138   char InitialState = 0;
139   bool NeedsLowering = false;
140 };
141 
142 struct WorkItem {
143   MachineBasicBlock *MBB = nullptr;
144   MachineInstr *MI = nullptr;
145 
146   WorkItem() = default;
147   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
148   WorkItem(MachineInstr *MI) : MI(MI) {}
149 };
150 
151 class SIWholeQuadMode : public MachineFunctionPass {
152 private:
153   const SIInstrInfo *TII;
154   const SIRegisterInfo *TRI;
155   const GCNSubtarget *ST;
156   MachineRegisterInfo *MRI;
157   LiveIntervals *LIS;
158   MachineDominatorTree *MDT;
159   MachinePostDominatorTree *PDT;
160 
161   unsigned AndOpc;
162   unsigned AndTermOpc;
163   unsigned AndN2Opc;
164   unsigned XorOpc;
165   unsigned AndSaveExecOpc;
166   unsigned AndSaveExecTermOpc;
167   unsigned WQMOpc;
168   Register Exec;
169   Register LiveMaskReg;
170 
171   DenseMap<const MachineInstr *, InstrInfo> Instructions;
172   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
173 
174   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
175   DenseMap<const MachineInstr *, char> StateTransition;
176 
177   SmallVector<MachineInstr *, 2> LiveMaskQueries;
178   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
179   SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
180   SmallVector<MachineInstr *, 4> KillInstrs;
181   SmallVector<MachineInstr *, 4> InitExecInstrs;
182   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
183 
184   void printInfo();
185 
186   void markInstruction(MachineInstr &MI, char Flag,
187                        std::vector<WorkItem> &Worklist);
188   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
189                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
190   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
191                    std::vector<WorkItem> &Worklist);
192   void markInstructionUses(const MachineInstr &MI, char Flag,
193                            std::vector<WorkItem> &Worklist);
194   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
195   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
196   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
197   char analyzeFunction(MachineFunction &MF);
198 
199   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
200                                       MachineBasicBlock::iterator Before);
201   MachineBasicBlock::iterator
202   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
203                    MachineBasicBlock::iterator Last, bool PreferLast,
204                    bool SaveSCC);
205   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206                Register SaveWQM);
207   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
208              Register SavedWQM);
209   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
210                     Register SaveOrig, char StrictStateNeeded);
211   void fromStrictMode(MachineBasicBlock &MBB,
212                       MachineBasicBlock::iterator Before, Register SavedOrig,
213                       char NonStrictState, char CurrentStrictState);
214 
215   void splitBlock(MachineInstr *TermMI);
216   MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
217   MachineInstr *lowerKillF32(MachineInstr &MI);
218 
219   void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
220   void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);
221 
222   bool lowerLiveMaskQueries();
223   bool lowerCopyInstrs();
224   bool lowerKillInstrs(bool IsWQM);
225   void lowerInitExec(MachineInstr &MI);
226   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
227                                                   bool &Changed);
228 
229 public:
230   static char ID;
231 
232   SIWholeQuadMode() :
233     MachineFunctionPass(ID) { }
234 
235   bool runOnMachineFunction(MachineFunction &MF) override;
236 
237   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
238 
239   void getAnalysisUsage(AnalysisUsage &AU) const override {
240     AU.addRequired<LiveIntervalsWrapperPass>();
241     AU.addPreserved<SlotIndexesWrapperPass>();
242     AU.addPreserved<LiveIntervalsWrapperPass>();
243     AU.addPreserved<MachineDominatorTreeWrapperPass>();
244     AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
245     MachineFunctionPass::getAnalysisUsage(AU);
246   }
247 
248   MachineFunctionProperties getClearedProperties() const override {
249     return MachineFunctionProperties().set(
250         MachineFunctionProperties::Property::IsSSA);
251   }
252 };
253 
254 } // end anonymous namespace
255 
256 char SIWholeQuadMode::ID = 0;
257 
258 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
259                       false)
260 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
261 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
262 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
263 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
264                     false)
265 
266 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267 
268 FunctionPass *llvm::createSIWholeQuadModePass() {
269   return new SIWholeQuadMode;
270 }
271 
272 #ifndef NDEBUG
273 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
274   for (const auto &BII : Blocks) {
275     dbgs() << "\n"
276            << printMBBReference(*BII.first) << ":\n"
277            << "  InNeeds = " << PrintState(BII.second.InNeeds)
278            << ", Needs = " << PrintState(BII.second.Needs)
279            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280 
281     for (const MachineInstr &MI : *BII.first) {
282       auto III = Instructions.find(&MI);
283       if (III != Instructions.end()) {
284         dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
285                << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
286       }
287     }
288   }
289 }
290 #endif
291 
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293                                       std::vector<WorkItem> &Worklist) {
294   InstrInfo &II = Instructions[&MI];
295 
296   assert(!(Flag & StateExact) && Flag != 0);
297 
298   // Capture all states requested in marking including disabled ones.
299   II.MarkedStates |= Flag;
300 
301   // Remove any disabled states from the flag. The user that required it gets
302   // an undefined value in the helper lanes. For example, this can happen if
303   // the result of an atomic is used by an instruction that requires WQM, where
304   // ignoring the request for WQM is correct as per the relevant specs.
305   Flag &= ~II.Disabled;
306 
307   // Ignore if the flag is already encompassed by the existing needs, or we
308   // just disabled everything.
309   if ((II.Needs & Flag) == Flag)
310     return;
311 
312   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
313   II.Needs |= Flag;
314   Worklist.emplace_back(&MI);
315 }
316 
317 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
318 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
319                                Register Reg, unsigned SubReg, char Flag,
320                                std::vector<WorkItem> &Worklist) {
321   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
322 
323   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
324   const VNInfo *Value = UseLRQ.valueIn();
325   if (!Value)
326     return;
327 
328   // Note: this code assumes that lane masks on AMDGPU completely
329   // cover registers.
330   const LaneBitmask UseLanes =
331       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
332              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
333                                 : LaneBitmask::getNone());
334 
335   // Perform a depth-first iteration of the LiveRange graph marking defs.
336   // Stop processing of a given branch when all use lanes have been defined.
337   // The first definition stops processing for a physical register.
338   struct PhiEntry {
339     const VNInfo *Phi;
340     unsigned PredIdx;
341     LaneBitmask DefinedLanes;
342 
343     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
344         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
345   };
346   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
347   SmallVector<PhiEntry, 2> PhiStack;
348   SmallSet<VisitKey, 4> Visited;
349   LaneBitmask DefinedLanes;
350   unsigned NextPredIdx = 0; // Only used for processing phi nodes
351   do {
352     const VNInfo *NextValue = nullptr;
353     const VisitKey Key(Value, DefinedLanes);
354 
355     if (Visited.insert(Key).second) {
356       // On the first visit to a phi, start processing from its first predecessor
357       NextPredIdx = 0;
358     }
359 
360     if (Value->isPHIDef()) {
361       // Each predecessor node in the phi must be processed as a subgraph
362       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
363       assert(MBB && "Phi-def has no defining MBB");
364 
365       // Find next predecessor to process
366       unsigned Idx = NextPredIdx;
367       const auto *PI = MBB->pred_begin() + Idx;
368       const auto *PE = MBB->pred_end();
369       for (; PI != PE && !NextValue; ++PI, ++Idx) {
370         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
371           if (!Visited.count(VisitKey(VN, DefinedLanes)))
372             NextValue = VN;
373         }
374       }
375 
376       // If there are more predecessors to process, add the phi to the stack
377       if (PI != PE)
378         PhiStack.emplace_back(Value, Idx, DefinedLanes);
379     } else {
380       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
381       assert(MI && "Def has no defining instruction");
382 
383       if (Reg.isVirtual()) {
384         // Iterate over all operands to find relevant definitions
385         bool HasDef = false;
386         for (const MachineOperand &Op : MI->all_defs()) {
387           if (Op.getReg() != Reg)
388             continue;
389 
390           // Compute lanes defined and overlap with use
391           LaneBitmask OpLanes =
392               Op.isUndef() ? LaneBitmask::getAll()
393                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
394           LaneBitmask Overlap = (UseLanes & OpLanes);
395 
396           // Record if this instruction defined any lanes of the use
397           HasDef |= Overlap.any();
398 
399           // Mark any lanes defined
400           DefinedLanes |= OpLanes;
401         }
402 
403         // Check if all lanes of use have been defined
404         if ((DefinedLanes & UseLanes) != UseLanes) {
405           // Definition not complete; need to process input value
406           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
407           if (const VNInfo *VN = LRQ.valueIn()) {
408             if (!Visited.count(VisitKey(VN, DefinedLanes)))
409               NextValue = VN;
410           }
411         }
412 
413         // Only mark the instruction if it defines some part of the use
414         if (HasDef)
415           markInstruction(*MI, Flag, Worklist);
416       } else {
417         // For physical registers simply mark the defining instruction
418         markInstruction(*MI, Flag, Worklist);
419       }
420     }
421 
422     if (!NextValue && !PhiStack.empty()) {
423       // Reached the end of a chain; revert to processing the last phi
424       PhiEntry &Entry = PhiStack.back();
425       NextValue = Entry.Phi;
426       NextPredIdx = Entry.PredIdx;
427       DefinedLanes = Entry.DefinedLanes;
428       PhiStack.pop_back();
429     }
430 
431     Value = NextValue;
432   } while (Value);
433 }
434 
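/// Mark all definitions reaching the register operand \p Op of \p MI with
/// \p Flag.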
435 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
436                                   const MachineOperand &Op, char Flag,
437                                   std::vector<WorkItem> &Worklist) {
438   assert(Op.isReg());
439   Register Reg = Op.getReg();
440 
441   // Ignore some hardware registers
442   switch (Reg) {
443   case AMDGPU::EXEC:
444   case AMDGPU::EXEC_LO:
445     return;
446   default:
447     break;
448   }
449 
450   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
451                     << " for " << MI);
452   if (Reg.isVirtual()) {
453     LiveRange &LR = LIS->getInterval(Reg);
454     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
455   } else {
456     // Handle physical registers that we need to track; this is mostly relevant
457     // for VCC, which can appear as the (implicit) input of a uniform branch,
458     // e.g. when a loop counter is stored in a VGPR.
459     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
460       LiveRange &LR = LIS->getRegUnit(Unit);
461       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
462       if (Value)
463         markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
464     }
465   }
466 }
467 
468 /// Mark all instructions defining the uses in \p MI with \p Flag.
469 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
470                                           std::vector<WorkItem> &Worklist) {
471   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
472                     << MI);
473 
474   for (const MachineOperand &Use : MI.all_uses())
475     markOperand(MI, Use, Flag, Worklist);
476 }
477 
478 // Scan instructions to determine which ones require an Exact execmask and
479 // which ones seed WQM requirements.
480 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
481                                        std::vector<WorkItem> &Worklist) {
482   char GlobalFlags = 0;
483   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
484   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
485   bool HasImplicitDerivatives =
486       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
487 
488   // We need to visit the basic blocks in reverse post-order so that we visit
489   // defs before uses, in particular so that we don't accidentally mark an
490   // instruction as needing e.g. WQM before visiting it and realizing it needs
491   // WQM disabled.
492   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
493   for (MachineBasicBlock *MBB : RPOT) {
494     BlockInfo &BBI = Blocks[MBB];
495 
496     for (MachineInstr &MI : *MBB) {
497       InstrInfo &III = Instructions[&MI];
498       unsigned Opcode = MI.getOpcode();
499       char Flags = 0;
500 
501       if (TII->isWQM(Opcode)) {
502         // If LOD is not supported, WQM is not needed.
503         // Only generate implicit WQM if implicit derivatives are required.
504         // This avoids inserting unintended WQM if a shader type without
505         // implicit derivatives uses an image sampling instruction.
506         if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
507           // Sampling instructions don't need to produce results for all pixels
508           // in a quad, they just require all inputs of a quad to have been
509           // computed for derivatives.
510           markInstructionUses(MI, StateWQM, Worklist);
511           GlobalFlags |= StateWQM;
512         }
513       } else if (Opcode == AMDGPU::WQM) {
514         // The WQM intrinsic requires its output to have all the helper lanes
515         // correct, so we need it to be in WQM.
516         Flags = StateWQM;
517         LowerToCopyInstrs.insert(&MI);
518       } else if (Opcode == AMDGPU::SOFT_WQM) {
519         LowerToCopyInstrs.insert(&MI);
520         SoftWQMInstrs.push_back(&MI);
521       } else if (Opcode == AMDGPU::STRICT_WWM) {
522         // The STRICT_WWM intrinsic doesn't make the same guarantee, and in addition
523         // it needs to be executed in WQM or Exact so that its copy doesn't
524         // clobber inactive lanes.
525         markInstructionUses(MI, StateStrictWWM, Worklist);
526         GlobalFlags |= StateStrictWWM;
527         LowerToMovInstrs.push_back(&MI);
528       } else if (Opcode == AMDGPU::STRICT_WQM ||
529                  TII->isDualSourceBlendEXP(MI)) {
530         // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
531         // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
532         // quads that have at least one active thread.
533         markInstructionUses(MI, StateStrictWQM, Worklist);
534         GlobalFlags |= StateStrictWQM;
535 
536         if (Opcode == AMDGPU::STRICT_WQM) {
537           LowerToMovInstrs.push_back(&MI);
538         } else {
539           // Dual source blend export acts as an implicit strict-wqm; its sources
540           // need to be shuffled in strict wqm, but the export itself needs to
541           // run in exact mode.
542           BBI.Needs |= StateExact;
543           if (!(BBI.InNeeds & StateExact)) {
544             BBI.InNeeds |= StateExact;
545             Worklist.emplace_back(MBB);
546           }
547           GlobalFlags |= StateExact;
548           III.Disabled = StateWQM | StateStrict;
549         }
550       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
551                  Opcode == AMDGPU::DS_PARAM_LOAD ||
552                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
553                  Opcode == AMDGPU::DS_DIRECT_LOAD) {
554         // Mark these StrictWQM, but only for the instruction, not its operands.
555         // This avoids unnecessarily marking M0 as requiring WQM.
556         III.Needs |= StateStrictWQM;
557         GlobalFlags |= StateStrictWQM;
558       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
559         // Disable strict states; StrictWQM will be added as required later.
560         III.Disabled = StateStrict;
561         MachineOperand &Inactive = MI.getOperand(4);
562         if (Inactive.isReg()) {
563           if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
564             LowerToCopyInstrs.insert(&MI);
565           else
566             markOperand(MI, Inactive, StateStrictWWM, Worklist);
567         }
568         SetInactiveInstrs.push_back(&MI);
569         BBI.NeedsLowering = true;
570       } else if (TII->isDisableWQM(MI)) {
571         BBI.Needs |= StateExact;
572         if (!(BBI.InNeeds & StateExact)) {
573           BBI.InNeeds |= StateExact;
574           Worklist.emplace_back(MBB);
575         }
576         GlobalFlags |= StateExact;
577         III.Disabled = StateWQM | StateStrict;
578       } else if (Opcode == AMDGPU::SI_PS_LIVE ||
579                  Opcode == AMDGPU::SI_LIVE_MASK) {
580         LiveMaskQueries.push_back(&MI);
581       } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
582                  Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
583                  Opcode == AMDGPU::SI_DEMOTE_I1) {
584         KillInstrs.push_back(&MI);
585         BBI.NeedsLowering = true;
586       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
587                  Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
588                  Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
589         InitExecInstrs.push_back(&MI);
590       } else if (WQMOutputs) {
591         // The function is in machine SSA form, which means that physical
592         // VGPRs correspond to shader inputs and outputs. Inputs are
593         // only used, outputs are only defined.
594         // FIXME: is this still valid?
595         for (const MachineOperand &MO : MI.defs()) {
596           Register Reg = MO.getReg();
597           if (Reg.isPhysical() &&
598               TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
599             Flags = StateWQM;
600             break;
601           }
602         }
603       }
604 
605       if (Flags) {
606         markInstruction(MI, Flags, Worklist);
607         GlobalFlags |= Flags;
608       }
609     }
610   }
611 
612   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
613   // ever used anywhere in the function. This implements the corresponding
614   // semantics of @llvm.amdgcn.set.inactive.
615   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
616   if (GlobalFlags & StateWQM) {
617     for (MachineInstr *MI : SetInactiveInstrs)
618       markInstruction(*MI, StateWQM, Worklist);
619     for (MachineInstr *MI : SoftWQMInstrs)
620       markInstruction(*MI, StateWQM, Worklist);
621   }
622 
623   return GlobalFlags;
624 }
625 
626 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
627                                            std::vector<WorkItem>& Worklist) {
628   MachineBasicBlock *MBB = MI.getParent();
629   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
630   BlockInfo &BI = Blocks[MBB];
631 
632   // Control flow-type instructions and stores to temporary memory that are
633   // followed by WQM computations must themselves be in WQM.
634   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
635       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
636     Instructions[&MI].Needs = StateWQM;
637     II.Needs = StateWQM;
638   }
639 
640   // Propagate to block level
641   if (II.Needs & StateWQM) {
642     BI.Needs |= StateWQM;
643     if (!(BI.InNeeds & StateWQM)) {
644       BI.InNeeds |= StateWQM;
645       Worklist.emplace_back(MBB);
646     }
647   }
648 
649   // Propagate backwards within block
650   if (MachineInstr *PrevMI = MI.getPrevNode()) {
651     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
652     if (!PrevMI->isPHI()) {
653       InstrInfo &PrevII = Instructions[PrevMI];
654       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
655         PrevII.OutNeeds |= InNeeds;
656         Worklist.emplace_back(PrevMI);
657       }
658     }
659   }
660 
661   // Propagate WQM flag to instruction inputs
662   assert(!(II.Needs & StateExact));
663 
664   if (II.Needs != 0)
665     markInstructionUses(MI, II.Needs, Worklist);
666 
667   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
668   // not require any WQM transitions.
669   if (II.Needs & StateStrictWWM)
670     BI.Needs |= StateStrictWWM;
671   if (II.Needs & StateStrictWQM)
672     BI.Needs |= StateStrictWQM;
673 }
674 
675 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
676                                      std::vector<WorkItem>& Worklist) {
677   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
678 
679   // Propagate through instructions
680   if (!MBB.empty()) {
681     MachineInstr *LastMI = &*MBB.rbegin();
682     InstrInfo &LastII = Instructions[LastMI];
683     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
684       LastII.OutNeeds |= BI.OutNeeds;
685       Worklist.emplace_back(LastMI);
686     }
687   }
688 
689   // Predecessor blocks must provide for our WQM/Exact needs.
690   for (MachineBasicBlock *Pred : MBB.predecessors()) {
691     BlockInfo &PredBI = Blocks[Pred];
692     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
693       continue;
694 
695     PredBI.OutNeeds |= BI.InNeeds;
696     PredBI.InNeeds |= BI.InNeeds;
697     Worklist.emplace_back(Pred);
698   }
699 
700   // All successors must be prepared to accept the same set of WQM/Exact data.
701   for (MachineBasicBlock *Succ : MBB.successors()) {
702     BlockInfo &SuccBI = Blocks[Succ];
703     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
704       continue;
705 
706     SuccBI.InNeeds |= BI.OutNeeds;
707     Worklist.emplace_back(Succ);
708   }
709 }
710 
711 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
712   std::vector<WorkItem> Worklist;
713   char GlobalFlags = scanInstructions(MF, Worklist);
714 
715   while (!Worklist.empty()) {
716     WorkItem WI = Worklist.back();
717     Worklist.pop_back();
718 
719     if (WI.MI)
720       propagateInstruction(*WI.MI, Worklist);
721     else
722       propagateBlock(*WI.MBB, Worklist);
723   }
724 
725   return GlobalFlags;
726 }
727 
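// Preserve SCC across an insertion point: copy SCC to a temporary SGPR before
// \p Before and copy it back afterwards, returning the restoring copy as the
// new insertion point.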
728 MachineBasicBlock::iterator
729 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
730                          MachineBasicBlock::iterator Before) {
731   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
732 
733   MachineInstr *Save =
734       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
735           .addReg(AMDGPU::SCC);
736   MachineInstr *Restore =
737       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
738           .addReg(SaveReg);
739 
740   LIS->InsertMachineInstrInMaps(*Save);
741   LIS->InsertMachineInstrInMaps(*Restore);
742   LIS->createAndComputeVirtRegInterval(SaveReg);
743 
744   return Restore;
745 }
746 
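// Split the block after \p TermMI, which was produced by kill lowering.
// Convert it to the terminator form of its opcode where one exists, and add a
// branch to the new tail block.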
747 void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
748   MachineBasicBlock *BB = TermMI->getParent();
749   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
750                     << *TermMI << "\n");
751 
752   MachineBasicBlock *SplitBB =
753       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
754 
755   // Convert last instruction in block to a terminator.
756   // Note: this only covers the expected patterns
757   unsigned NewOpcode = 0;
758   switch (TermMI->getOpcode()) {
759   case AMDGPU::S_AND_B32:
760     NewOpcode = AMDGPU::S_AND_B32_term;
761     break;
762   case AMDGPU::S_AND_B64:
763     NewOpcode = AMDGPU::S_AND_B64_term;
764     break;
765   case AMDGPU::S_MOV_B32:
766     NewOpcode = AMDGPU::S_MOV_B32_term;
767     break;
768   case AMDGPU::S_MOV_B64:
769     NewOpcode = AMDGPU::S_MOV_B64_term;
770     break;
771   default:
772     break;
773   }
774   if (NewOpcode)
775     TermMI->setDesc(TII->get(NewOpcode));
776 
777   if (SplitBB != BB) {
778     // Update dominator trees
779     using DomTreeT = DomTreeBase<MachineBasicBlock>;
780     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
781     for (MachineBasicBlock *Succ : SplitBB->successors()) {
782       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
783       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
784     }
785     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
786     if (MDT)
787       MDT->applyUpdates(DTUpdates);
788     if (PDT)
789       PDT->applyUpdates(DTUpdates);
790 
791     // Link blocks
792     MachineInstr *MI =
793         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
794             .addMBB(SplitBB);
795     LIS->InsertMachineInstrInMaps(*MI);
796   }
797 }
798 
799 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
800   assert(LiveMaskReg.isVirtual());
801 
802   const DebugLoc &DL = MI.getDebugLoc();
803   unsigned Opcode = 0;
804 
805   assert(MI.getOperand(0).isReg());
806 
807   // Comparison is for live lanes; however, here we compute the inverse
808   // (killed lanes). This is because VCMP will always generate 0 bits
809   // for inactive lanes so a mask of live lanes would not be correct
810   // inside control flow.
811   // Invert the comparison by swapping the operands and adjusting
812   // the comparison codes.
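  // For example, a live-lane condition of SETOLT (src0 < src1) is lowered to
  // V_CMP_NGT_F32 with swapped operands, i.e. !(src1 > src0), so the result
  // has a bit set exactly for the lanes that must be killed.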
813 
814   switch (MI.getOperand(2).getImm()) {
815   case ISD::SETUEQ:
816     Opcode = AMDGPU::V_CMP_LG_F32_e64;
817     break;
818   case ISD::SETUGT:
819     Opcode = AMDGPU::V_CMP_GE_F32_e64;
820     break;
821   case ISD::SETUGE:
822     Opcode = AMDGPU::V_CMP_GT_F32_e64;
823     break;
824   case ISD::SETULT:
825     Opcode = AMDGPU::V_CMP_LE_F32_e64;
826     break;
827   case ISD::SETULE:
828     Opcode = AMDGPU::V_CMP_LT_F32_e64;
829     break;
830   case ISD::SETUNE:
831     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
832     break;
833   case ISD::SETO:
834     Opcode = AMDGPU::V_CMP_O_F32_e64;
835     break;
836   case ISD::SETUO:
837     Opcode = AMDGPU::V_CMP_U_F32_e64;
838     break;
839   case ISD::SETOEQ:
840   case ISD::SETEQ:
841     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
842     break;
843   case ISD::SETOGT:
844   case ISD::SETGT:
845     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
846     break;
847   case ISD::SETOGE:
848   case ISD::SETGE:
849     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
850     break;
851   case ISD::SETOLT:
852   case ISD::SETLT:
853     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
854     break;
855   case ISD::SETOLE:
856   case ISD::SETLE:
857     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
858     break;
859   case ISD::SETONE:
860   case ISD::SETNE:
861     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
862     break;
863   default:
864     llvm_unreachable("invalid ISD:SET cond code");
865   }
866 
867   MachineBasicBlock &MBB = *MI.getParent();
868 
869   // Pick opcode based on comparison type.
870   MachineInstr *VcmpMI;
871   const MachineOperand &Op0 = MI.getOperand(0);
872   const MachineOperand &Op1 = MI.getOperand(1);
873 
874   // VCC represents lanes killed.
875   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
876 
877   if (TRI->isVGPR(*MRI, Op0.getReg())) {
878     Opcode = AMDGPU::getVOPe32(Opcode);
879     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
880   } else {
881     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
882                  .addReg(VCC, RegState::Define)
883                  .addImm(0) // src0 modifiers
884                  .add(Op1)
885                  .addImm(0) // src1 modifiers
886                  .add(Op0)
887                  .addImm(0); // omod
888   }
889 
890   MachineInstr *MaskUpdateMI =
891       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
892           .addReg(LiveMaskReg)
893           .addReg(VCC);
894 
895   // State of SCC represents whether any lanes are live in the mask;
896   // if SCC is 0 then no lanes will be alive anymore.
897   MachineInstr *EarlyTermMI =
898       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
899 
900   MachineInstr *ExecMaskMI =
901       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
902 
903   assert(MBB.succ_size() == 1);
904   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
905                               .addMBB(*MBB.succ_begin());
906 
907   // Update live intervals
908   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
909   MBB.remove(&MI);
910 
911   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
912   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
913   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
914   LIS->InsertMachineInstrInMaps(*NewTerm);
915 
916   return NewTerm;
917 }
918 
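// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1: clear the killed lanes from the
// live mask, insert an early-terminate check for the case where no lanes
// remain, and update EXEC (for demotes, EXEC keeps whole quads that still
// contain live lanes).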
919 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
920   assert(LiveMaskReg.isVirtual());
921 
922   MachineBasicBlock &MBB = *MI.getParent();
923 
924   const DebugLoc &DL = MI.getDebugLoc();
925   MachineInstr *MaskUpdateMI = nullptr;
926 
927   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
928   const MachineOperand &Op = MI.getOperand(0);
929   int64_t KillVal = MI.getOperand(1).getImm();
930   MachineInstr *ComputeKilledMaskMI = nullptr;
931   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
932   Register TmpReg;
933 
934   // Is this a static or dynamic kill?
935   if (Op.isImm()) {
936     if (Op.getImm() == KillVal) {
937       // Static: all active lanes are killed
938       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
939                          .addReg(LiveMaskReg)
940                          .addReg(Exec);
941     } else {
942       // Static: kill does nothing
943       MachineInstr *NewTerm = nullptr;
944       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
945         LIS->RemoveMachineInstrFromMaps(MI);
946       } else {
947         assert(MBB.succ_size() == 1);
948         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
949                       .addMBB(*MBB.succ_begin());
950         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
951       }
952       MBB.remove(&MI);
953       return NewTerm;
954     }
955   } else {
956     if (!KillVal) {
957       // Op represents live lanes after kill,
958       // so exec mask needs to be factored in.
959       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
960       ComputeKilledMaskMI =
961           BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
962       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
963                          .addReg(LiveMaskReg)
964                          .addReg(TmpReg);
965     } else {
966       // Op represents lanes to kill
967       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
968                          .addReg(LiveMaskReg)
969                          .add(Op);
970     }
971   }
972 
973   // State of SCC represents whether any lanes are live in the mask;
974   // if SCC is 0 then no lanes will be alive anymore.
975   MachineInstr *EarlyTermMI =
976       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
977 
978   // If we got this far, some lanes are still live;
979   // update EXEC to deactivate lanes as appropriate.
980   MachineInstr *NewTerm;
981   MachineInstr *WQMMaskMI = nullptr;
982   Register LiveMaskWQM;
983   if (IsDemote) {
984     // Demote - deactivate quads with only helper lanes
985     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
986     WQMMaskMI =
987         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
988     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
989                   .addReg(Exec)
990                   .addReg(LiveMaskWQM);
991   } else {
992     // Kill - deactivate lanes no longer in live mask
993     if (Op.isImm()) {
994       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
995       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
996     } else if (!IsWQM) {
997       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
998                     .addReg(Exec)
999                     .addReg(LiveMaskReg);
1000     } else {
1001       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1002       NewTerm =
1003           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1004     }
1005   }
1006 
1007   // Update live intervals
1008   LIS->RemoveMachineInstrFromMaps(MI);
1009   MBB.remove(&MI);
1010   assert(EarlyTermMI);
1011   assert(MaskUpdateMI);
1012   assert(NewTerm);
1013   if (ComputeKilledMaskMI)
1014     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1015   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1016   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1017   if (WQMMaskMI)
1018     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1019   LIS->InsertMachineInstrInMaps(*NewTerm);
1020 
1021   if (CndReg) {
1022     LIS->removeInterval(CndReg);
1023     LIS->createAndComputeVirtRegInterval(CndReg);
1024   }
1025   if (TmpReg)
1026     LIS->createAndComputeVirtRegInterval(TmpReg);
1027   if (LiveMaskWQM)
1028     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1029 
1030   return NewTerm;
1031 }
1032 
1033 // Replace (or supplement) instructions accessing the live mask.
1034 // This can only happen once all the live mask registers have been created
1035 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1036 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
1037   if (!BI.NeedsLowering)
1038     return;
1039 
1040   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1041 
1042   SmallVector<MachineInstr *, 4> SplitPoints;
1043   Register ActiveLanesReg = 0;
1044   char State = BI.InitialState;
1045 
1046   for (MachineInstr &MI : llvm::make_early_inc_range(
1047            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1048     auto MIState = StateTransition.find(&MI);
1049     if (MIState != StateTransition.end())
1050       State = MIState->second;
1051 
1052     MachineInstr *SplitPoint = nullptr;
1053     switch (MI.getOpcode()) {
1054     case AMDGPU::SI_DEMOTE_I1:
1055     case AMDGPU::SI_KILL_I1_TERMINATOR:
1056       SplitPoint = lowerKillI1(MI, State == StateWQM);
1057       break;
1058     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1059       SplitPoint = lowerKillF32(MI);
1060       break;
1061     case AMDGPU::ENTER_STRICT_WWM:
1062       ActiveLanesReg = MI.getOperand(0).getReg();
1063       break;
1064     case AMDGPU::EXIT_STRICT_WWM:
1065       ActiveLanesReg = 0;
1066       break;
1067     case AMDGPU::V_SET_INACTIVE_B32:
1068       if (ActiveLanesReg) {
1069         LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
1070         MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
1071         MI.getOperand(5).setReg(ActiveLanesReg);
1072         LIS->shrinkToUses(&LI);
1073       } else {
1074         assert(State == StateExact || State == StateWQM);
1075       }
1076       break;
1077     default:
1078       break;
1079     }
1080     if (SplitPoint)
1081       SplitPoints.push_back(SplitPoint);
1082   }
1083 
1084   // Perform splitting after instruction scan to simplify iteration.
1085   for (MachineInstr *MI : SplitPoints)
1086     splitBlock(MI);
1087 }
1088 
1089 // Return an iterator in the (inclusive) range [First, Last] at which
1090 // instructions can be safely inserted, keeping in mind that some of the
1091 // instructions we want to add necessarily clobber SCC.
1092 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1093     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1094     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1095   if (!SaveSCC)
1096     return PreferLast ? Last : First;
1097 
1098   LiveRange &LR =
1099       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1100   auto MBBE = MBB.end();
1101   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1102                                      : LIS->getMBBEndIdx(&MBB);
1103   SlotIndex LastIdx =
1104       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1105   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1106   const LiveRange::Segment *S;
1107 
1108   for (;;) {
1109     S = LR.getSegmentContaining(Idx);
1110     if (!S)
1111       break;
1112 
1113     if (PreferLast) {
1114       SlotIndex Next = S->start.getBaseIndex();
1115       if (Next < FirstIdx)
1116         break;
1117       Idx = Next;
1118     } else {
1119       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1120       assert(EndMI && "Segment does not end on valid instruction");
1121       auto NextI = std::next(EndMI->getIterator());
1122       if (NextI == MBB.end())
1123         break;
1124       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1125       if (Next > LastIdx)
1126         break;
1127       Idx = Next;
1128     }
1129   }
1130 
1131   MachineBasicBlock::iterator MBBI;
1132 
1133   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1134     MBBI = MI;
1135   else {
1136     assert(Idx == LIS->getMBBEndIdx(&MBB));
1137     MBBI = MBB.end();
1138   }
1139 
1140   // Move insertion point past any operations modifying EXEC.
1141   // This assumes that the value of SCC defined by any of these operations
1142   // does not need to be preserved.
1143   while (MBBI != Last) {
1144     bool IsExecDef = false;
1145     for (const MachineOperand &MO : MBBI->all_defs()) {
1146       IsExecDef |=
1147           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1148     }
1149     if (!IsExecDef)
1150       break;
1151     MBBI++;
1152     S = nullptr;
1153   }
1154 
1155   if (S)
1156     MBBI = saveSCC(MBB, MBBI);
1157 
1158   return MBBI;
1159 }
1160 
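// Enter Exact mode at \p Before by ANDing the live mask into EXEC, optionally
// saving the current (WQM) EXEC in \p SaveWQM, and using the *_term opcodes
// when inserting into the terminator region.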
1161 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1162                               MachineBasicBlock::iterator Before,
1163                               Register SaveWQM) {
1164   assert(LiveMaskReg.isVirtual());
1165 
1166   bool IsTerminator = Before == MBB.end();
1167   if (!IsTerminator) {
1168     auto FirstTerm = MBB.getFirstTerminator();
1169     if (FirstTerm != MBB.end()) {
1170       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1171       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1172       IsTerminator = BeforeIdx > FirstTermIdx;
1173     }
1174   }
1175 
1176   MachineInstr *MI;
1177 
1178   if (SaveWQM) {
1179     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1180     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1181              .addReg(LiveMaskReg);
1182   } else {
1183     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1184     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1185              .addReg(Exec)
1186              .addReg(LiveMaskReg);
1187   }
1188 
1189   LIS->InsertMachineInstrInMaps(*MI);
1190   StateTransition[MI] = StateExact;
1191 }
1192 
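// Re-enter WQM at \p Before: either restore a previously saved EXEC from
// \p SavedWQM or recompute whole quad mode from the current EXEC.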
1193 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1194                             MachineBasicBlock::iterator Before,
1195                             Register SavedWQM) {
1196   MachineInstr *MI;
1197 
1198   if (SavedWQM) {
1199     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1200              .addReg(SavedWQM);
1201   } else {
1202     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1203   }
1204 
1205   LIS->InsertMachineInstrInMaps(*MI);
1206   StateTransition[MI] = StateWQM;
1207 }
1208 
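// Enter StrictWWM or StrictWQM at \p Before, saving the current EXEC in
// \p SaveOrig via the ENTER_STRICT_* pseudo instructions.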
1209 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1210                                    MachineBasicBlock::iterator Before,
1211                                    Register SaveOrig, char StrictStateNeeded) {
1212   MachineInstr *MI;
1213   assert(SaveOrig);
1214   assert(StrictStateNeeded == StateStrictWWM ||
1215          StrictStateNeeded == StateStrictWQM);
1216 
1217   if (StrictStateNeeded == StateStrictWWM) {
1218     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1219                  SaveOrig)
1220              .addImm(-1);
1221   } else {
1222     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1223                  SaveOrig)
1224              .addImm(-1);
1225   }
1226   LIS->InsertMachineInstrInMaps(*MI);
1227   StateTransition[MI] = StrictStateNeeded;
1228 }
1229 
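// Leave the current strict mode at \p Before, restoring EXEC from
// \p SavedOrig via the EXIT_STRICT_* pseudos and recording the resulting
// non-strict state.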
1230 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1231                                      MachineBasicBlock::iterator Before,
1232                                      Register SavedOrig, char NonStrictState,
1233                                      char CurrentStrictState) {
1234   MachineInstr *MI;
1235 
1236   assert(SavedOrig);
1237   assert(CurrentStrictState == StateStrictWWM ||
1238          CurrentStrictState == StateStrictWQM);
1239 
1240   if (CurrentStrictState == StateStrictWWM) {
1241     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1242                  Exec)
1243              .addReg(SavedOrig);
1244   } else {
1245     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1246                  Exec)
1247              .addReg(SavedOrig);
1248   }
1249   LIS->InsertMachineInstrInMaps(*MI);
1250   StateTransition[MI] = NonStrictState;
1251 }
1252 
1253 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
1254                                    bool IsEntry) {
1255   // A non-entry block that is WQM throughout needs no state transitions;
1256   // just record the initial state.
1257   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1258     BI.InitialState = StateWQM;
1259     return;
1260   }
1261 
1262   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1263                     << ":\n");
1264 
1265   Register SavedWQMReg;
1266   Register SavedNonStrictReg;
1267   bool WQMFromExec = IsEntry;
1268   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1269   char NonStrictState = 0;
1270   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1271 
1272   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1273   if (IsEntry) {
1274     // Skip the instruction that saves LiveMask
1275     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1276         II->getOperand(1).getReg() == TRI->getExec())
1277       ++II;
1278   }
1279 
1280   // This stores the first instruction where it's safe to switch from WQM to
1281   // Exact or vice versa.
1282   MachineBasicBlock::iterator FirstWQM = IE;
1283 
1284   // This stores the first instruction where it's safe to switch from Strict
1285   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1286   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1287   // be safe to switch to/from WQM as well.
1288   MachineBasicBlock::iterator FirstStrict = IE;
1289 
1290   // Record the initial state in the block information.
1291   BI.InitialState = State;
1292 
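  // Walk the instructions, computing the set of allowed states for each one
  // and inserting exec-mask transitions wherever the current state is not
  // allowed.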
1293   for (unsigned Idx = 0;; ++Idx) {
1294     MachineBasicBlock::iterator Next = II;
1295     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1296     char OutNeeds = 0;
1297 
1298     if (FirstWQM == IE)
1299       FirstWQM = II;
1300 
1301     if (FirstStrict == IE)
1302       FirstStrict = II;
1303 
1304     // Adjust needs if this is the first instruction of a WQM-requiring shader.
1305     if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1306       Needs = StateWQM;
1307 
1308     // First, figure out the allowed states (Needs) based on the propagated
1309     // flags.
1310     if (II != IE) {
1311       MachineInstr &MI = *II;
1312 
1313       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1314         auto III = Instructions.find(&MI);
1315         if (III != Instructions.end()) {
1316           if (III->second.Needs & StateStrictWWM)
1317             Needs = StateStrictWWM;
1318           else if (III->second.Needs & StateStrictWQM)
1319             Needs = StateStrictWQM;
1320           else if (III->second.Needs & StateWQM)
1321             Needs = StateWQM;
1322           else
1323             Needs &= ~III->second.Disabled;
1324           OutNeeds = III->second.OutNeeds;
1325         }
1326       } else {
1327         // If the instruction doesn't actually need a correct EXEC, then we can
1328         // safely leave Strict mode enabled.
1329         Needs = StateExact | StateWQM | StateStrict;
1330       }
1331 
1332       // Exact mode exit can occur in terminators, but must be before branches.
1333       if (MI.isBranch() && OutNeeds == StateExact)
1334         Needs = StateExact;
1335 
1336       ++Next;
1337     } else {
1338       // End of basic block
1339       if (BI.OutNeeds & StateWQM)
1340         Needs = StateWQM;
1341       else if (BI.OutNeeds == StateExact)
1342         Needs = StateExact;
1343       else
1344         Needs = StateWQM | StateExact;
1345     }
1346 
1347     // Now, transition if necessary.
1348     if (!(Needs & State)) {
1349       MachineBasicBlock::iterator First;
1350       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1351           State == StateStrictWQM || Needs == StateStrictWQM) {
1352         // We must switch to or from Strict mode.
1353         First = FirstStrict;
1354       } else {
1355         // We only need to switch to/from WQM, so we can use FirstWQM.
1356         First = FirstWQM;
1357       }
1358 
1359       // Whether we need to save SCC depends on start and end states.
1360       bool SaveSCC = false;
1361       switch (State) {
1362       case StateExact:
1363       case StateStrictWWM:
1364       case StateStrictWQM:
1365         // Exact/Strict -> Strict: save SCC
1366         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1367         // Exact/Strict -> Exact: no save
1368         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1369         break;
1370       case StateWQM:
1371         // WQM -> Exact/Strict: save SCC
1372         SaveSCC = !(Needs & StateWQM);
1373         break;
1374       default:
1375         llvm_unreachable("Unknown state");
1376         break;
1377       }
1378       char StartState = State & StateStrict ? NonStrictState : State;
1379       bool WQMToExact =
1380           StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1381       bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1382                         !(Needs & StateExact);
1383       bool PreferLast = Needs == StateWQM;
1384       // Exact regions in divergent control flow may run at EXEC=0, so try to
1385       // exclude instructions with unexpected effects from them.
1386       // FIXME: ideally we would branch over these when EXEC=0,
1387       // but this requires updating implicit values, live intervals and CFG.
1388       if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1389         for (MachineBasicBlock::iterator I = First; I != II; ++I) {
1390           if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
1391             PreferLast = WQMToExact;
1392             break;
1393           }
1394         }
1395       }
1396       MachineBasicBlock::iterator Before =
1397           prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
1398 
1399       if (State & StateStrict) {
1400         assert(State == StateStrictWWM || State == StateStrictWQM);
1401         assert(SavedNonStrictReg);
1402         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1403 
1404         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1405         SavedNonStrictReg = 0;
1406         State = NonStrictState;
1407       }
1408 
1409       if (Needs & StateStrict) {
1410         NonStrictState = State;
1411         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1412         assert(!SavedNonStrictReg);
1413         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1414 
1415         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1416         State = Needs;
1417       } else {
1418         if (WQMToExact) {
1419           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1420             assert(!SavedWQMReg);
1421             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1422           }
1423 
1424           toExact(MBB, Before, SavedWQMReg);
1425           State = StateExact;
1426         } else if (ExactToWQM) {
1427           assert(WQMFromExec == (SavedWQMReg == 0));
1428 
1429           toWQM(MBB, Before, SavedWQMReg);
1430 
1431           if (SavedWQMReg) {
1432             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1433             SavedWQMReg = 0;
1434           }
1435           State = StateWQM;
1436         } else {
1437           // We can get here if we transitioned from StrictWWM to a
1438           // non-StrictWWM state that already matches our needs; in that
1439           // case there is nothing more to do.
1440           assert(Needs & State);
1441         }
1442       }
1443     }
1444 
1445     if (Needs != (StateExact | StateWQM | StateStrict)) {
1446       if (Needs != (StateExact | StateWQM))
1447         FirstWQM = IE;
1448       FirstStrict = IE;
1449     }
1450 
1451     if (II == IE)
1452       break;
1453 
1454     II = Next;
1455   }
1456   assert(!SavedWQMReg);
1457   assert(!SavedNonStrictReg);
1458 }
1459 
1460 bool SIWholeQuadMode::lowerLiveMaskQueries() {
1461   for (MachineInstr *MI : LiveMaskQueries) {
1462     const DebugLoc &DL = MI->getDebugLoc();
1463     Register Dest = MI->getOperand(0).getReg();
1464 
1465     MachineInstr *Copy =
1466         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1467             .addReg(LiveMaskReg);
1468 
1469     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1470     MI->eraseFromParent();
1471   }
1472   return !LiveMaskQueries.empty();
1473 }
1474 
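     // Lower the copy-like pseudos collected during analysis to plain moves
     // or copies.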
1475 bool SIWholeQuadMode::lowerCopyInstrs() {
1476   for (MachineInstr *MI : LowerToMovInstrs) {
1477     assert(MI->getNumExplicitOperands() == 2);
1478 
1479     const Register Reg = MI->getOperand(0).getReg();
1480 
1481     const TargetRegisterClass *regClass =
1482         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1483     if (TRI->isVGPRClass(regClass)) {
1484       const unsigned MovOp = TII->getMovOpcode(regClass);
1485       MI->setDesc(TII->get(MovOp));
1486 
1487       // Check that it already implicitly depends on exec (like all VALU movs
1488       // should do).
1489       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1490         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1491       }));
1492     } else {
1493       // Remove early-clobber and exec dependency from simple SGPR copies.
1494       // This allows some to be eliminated during/post RA.
1495       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1496       if (MI->getOperand(0).isEarlyClobber()) {
1497         LIS->removeInterval(Reg);
1498         MI->getOperand(0).setIsEarlyClobber(false);
1499         LIS->createAndComputeVirtRegInterval(Reg);
1500       }
1501       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1502       while (Index >= 0) {
1503         MI->removeOperand(Index);
1504         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1505       }
1506       MI->setDesc(TII->get(AMDGPU::COPY));
1507       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1508     }
1509   }
1510   for (MachineInstr *MI : LowerToCopyInstrs) {
1511     LLVM_DEBUG(dbgs() << "simplify: " << *MI);
1512 
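         // For V_SET_INACTIVE, drop all operands except the destination and
         // the active-lane source; the instruction is then rewritten as a
         // copy or mov below.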
1513     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1514       assert(MI->getNumExplicitOperands() == 6);
1515 
1516       LiveInterval *RecomputeLI = nullptr;
1517       if (MI->getOperand(4).isReg())
1518         RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
1519 
1520       MI->removeOperand(5);
1521       MI->removeOperand(4);
1522       MI->removeOperand(3);
1523       MI->removeOperand(1);
1524 
1525       if (RecomputeLI)
1526         LIS->shrinkToUses(RecomputeLI);
1527     } else {
1528       assert(MI->getNumExplicitOperands() == 2);
1529     }
1530 
1531     unsigned CopyOp = MI->getOperand(1).isReg()
1532                           ? (unsigned)AMDGPU::COPY
1533                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1534                                 *MRI, MI->getOperand(0)));
1535     MI->setDesc(TII->get(CopyOp));
1536     LLVM_DEBUG(dbgs() << " -> " << *MI);
1537   }
1538   return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1539 }
1540 
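     // Lower kill and demote pseudos; lowering may require splitting the
     // containing block at the returned point.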
1541 bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1542   for (MachineInstr *MI : KillInstrs) {
1543     MachineInstr *SplitPoint = nullptr;
1544     switch (MI->getOpcode()) {
1545     case AMDGPU::SI_DEMOTE_I1:
1546     case AMDGPU::SI_KILL_I1_TERMINATOR:
1547       SplitPoint = lowerKillI1(*MI, IsWQM);
1548       break;
1549     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1550       SplitPoint = lowerKillF32(*MI);
1551       break;
1552     }
1553     if (SplitPoint)
1554       splitBlock(SplitPoint);
1555   }
1556   return !KillInstrs.empty();
1557 }
1558 
1559 void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1560   MachineBasicBlock *MBB = MI.getParent();
1561   bool IsWave32 = ST->isWave32();
1562 
1563   if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1564     assert(MBB == &MBB->getParent()->front() &&
1565            "init whole wave not in entry block");
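         // Save the EXEC mask that was live on entry, then enable all lanes:
         //   EntryExec = S_OR_SAVEEXEC -1   ; EntryExec <- EXEC, EXEC <- ~0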
1566     Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
1567     MachineInstr *SaveExec =
1568         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1569                 TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1570                                   : AMDGPU::S_OR_SAVEEXEC_B64),
1571                 EntryExec)
1572             .addImm(-1);
1573 
1574     // Replace all uses of MI's destination reg with EntryExec.
1575     MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
1576 
1577     if (LIS) {
1578       LIS->RemoveMachineInstrFromMaps(MI);
1579     }
1580 
1581     MI.eraseFromParent();
1582 
1583     if (LIS) {
1584       LIS->InsertMachineInstrInMaps(*SaveExec);
1585       LIS->createAndComputeVirtRegInterval(EntryExec);
1586     }
1587     return;
1588   }
1589 
1590   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1591     // The EXEC initialization must come before all vector instructions.
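         // Lower to a plain immediate move at the start of the block:
         //   S_MOV_B32/_B64 EXEC, <imm>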
1592     MachineInstr *InitMI =
1593         BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1594                 TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1595                 Exec)
1596             .addImm(MI.getOperand(0).getImm());
1597     if (LIS) {
1598       LIS->RemoveMachineInstrFromMaps(MI);
1599       LIS->InsertMachineInstrInMaps(*InitMI);
1600     }
1601     MI.eraseFromParent();
1602     return;
1603   }
1604 
1605   // Extract the thread count from an SGPR input and set EXEC accordingly.
1606   // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1607   //
1608   // S_BFE_U32 count, input, {shift, 7}
1609   // S_BFM_B64 exec, count, 0
1610   // S_CMP_EQ_U32 count, 64
1611   // S_CMOV_B64 exec, -1
1612   Register InputReg = MI.getOperand(0).getReg();
1613   MachineInstr *FirstMI = &*MBB->begin();
1614   if (InputReg.isVirtual()) {
1615     MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1616     assert(DefInstr && DefInstr->isCopy());
1617     if (DefInstr->getParent() == MBB) {
1618       if (DefInstr != FirstMI) {
1619         // If `InputReg` is defined in the current block, its defining
1620         // instruction must also be moved to the beginning of the block.
1621         DefInstr->removeFromParent();
1622         MBB->insert(FirstMI, DefInstr);
1623         if (LIS)
1624           LIS->handleMove(*DefInstr);
1625       } else {
1626         // If first instruction is definition then move pointer after it.
1627         // If the first instruction is the definition, advance past it.
1628       }
1629     }
1630   }
1631 
1632   // Insert instruction sequence at block beginning (before vector operations).
1633   const DebugLoc DL = MI.getDebugLoc();
1634   const unsigned WavefrontSize = ST->getWavefrontSize();
1635   const unsigned Mask = (WavefrontSize << 1) - 1;
1636   Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1637   auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1638                    .addReg(InputReg)
1639                    .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1640   auto BfmMI =
1641       BuildMI(*MBB, FirstMI, DL,
1642               TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1643           .addReg(CountReg)
1644           .addImm(0);
1645   auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1646                    .addReg(CountReg, RegState::Kill)
1647                    .addImm(WavefrontSize);
1648   auto CmovMI =
1649       BuildMI(*MBB, FirstMI, DL,
1650               TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1651               Exec)
1652           .addImm(-1);
1653 
1654   if (!LIS) {
1655     MI.eraseFromParent();
1656     return;
1657   }
1658 
1659   LIS->RemoveMachineInstrFromMaps(MI);
1660   MI.eraseFromParent();
1661 
1662   LIS->InsertMachineInstrInMaps(*BfeMI);
1663   LIS->InsertMachineInstrInMaps(*BfmMI);
1664   LIS->InsertMachineInstrInMaps(*CmpMI);
1665   LIS->InsertMachineInstrInMaps(*CmovMI);
1666 
1667   LIS->removeInterval(InputReg);
1668   LIS->createAndComputeVirtRegInterval(InputReg);
1669   LIS->createAndComputeVirtRegInterval(CountReg);
1670 }
1671 
1672 /// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
1673 /// for instructions that depend on EXEC.
1674 MachineBasicBlock::iterator
1675 SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
1676   MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
1677 
1678   for (MachineInstr *MI : InitExecInstrs) {
1679     // Try to handle undefined cases gracefully:
1680     // - multiple INIT_EXEC instructions
1681     // - INIT_EXEC instructions not in the entry block
1682     if (MI->getParent() == &Entry)
1683       InsertPt = std::next(MI->getIterator());
1684 
1685     lowerInitExec(*MI);
1686     Changed = true;
1687   }
1688 
1689   return InsertPt;
1690 }
1691 
1692 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1693   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1694                     << " ------------- \n");
1695   LLVM_DEBUG(MF.dump());
1696 
1697   Instructions.clear();
1698   Blocks.clear();
1699   LiveMaskQueries.clear();
1700   LowerToCopyInstrs.clear();
1701   LowerToMovInstrs.clear();
1702   KillInstrs.clear();
1703   InitExecInstrs.clear();
1704   SetInactiveInstrs.clear();
1705   StateTransition.clear();
1706 
1707   ST = &MF.getSubtarget<GCNSubtarget>();
1708 
1709   TII = ST->getInstrInfo();
1710   TRI = &TII->getRegisterInfo();
1711   MRI = &MF.getRegInfo();
1712   LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1713   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1714   MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
1715   auto *PDTWrapper =
1716       getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1717   PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
1718 
1719   if (ST->isWave32()) {
1720     AndOpc = AMDGPU::S_AND_B32;
1721     AndTermOpc = AMDGPU::S_AND_B32_term;
1722     AndN2Opc = AMDGPU::S_ANDN2_B32;
1723     XorOpc = AMDGPU::S_XOR_B32;
1724     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1725     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1726     WQMOpc = AMDGPU::S_WQM_B32;
1727     Exec = AMDGPU::EXEC_LO;
1728   } else {
1729     AndOpc = AMDGPU::S_AND_B64;
1730     AndTermOpc = AMDGPU::S_AND_B64_term;
1731     AndN2Opc = AMDGPU::S_ANDN2_B64;
1732     XorOpc = AMDGPU::S_XOR_B64;
1733     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1734     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1735     WQMOpc = AMDGPU::S_WQM_B64;
1736     Exec = AMDGPU::EXEC;
1737   }
1738 
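       // Analyze the whole function first: classify which instructions need
       // WQM, StrictWWM or StrictWQM, and collect the pseudos lowered below.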
1739   const char GlobalFlags = analyzeFunction(MF);
1740   bool Changed = false;
1741 
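       // By default the live mask is simply EXEC; a dedicated copy is only
       // created below when it is actually needed.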
1742   LiveMaskReg = Exec;
1743 
1744   MachineBasicBlock &Entry = MF.front();
1745   MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
1746 
1747   // Store a copy of the original live mask when required
1748   const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1749   const bool HasWaveModes = GlobalFlags & ~StateExact;
1750   const bool HasKills = !KillInstrs.empty();
1751   const bool UsesWQM = GlobalFlags & StateWQM;
1752   if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1753     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1754     MachineInstr *MI =
1755         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1756             .addReg(Exec);
1757     LIS->InsertMachineInstrInMaps(*MI);
1758     Changed = true;
1759   }
1760 
1761   // Check whether each V_SET_INACTIVE was marked with a strict state during
1762   // analysis. If so, promote it to StrictWWM; otherwise lower it to a COPY.
1763   for (MachineInstr *MI : SetInactiveInstrs) {
1764     if (LowerToCopyInstrs.contains(MI))
1765       continue;
1766     if (Instructions[MI].MarkedStates & StateStrict) {
1767       Instructions[MI].Needs |= StateStrictWWM;
1768       Instructions[MI].Disabled &= ~StateStrictWWM;
1769       Blocks[MI->getParent()].Needs |= StateStrictWWM;
1770     } else {
1771       LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
1772       LowerToCopyInstrs.insert(MI);
1773     }
1774   }
1775 
1776   LLVM_DEBUG(printInfo());
1777 
1778   Changed |= lowerLiveMaskQueries();
1779   Changed |= lowerCopyInstrs();
1780 
1781   if (!HasWaveModes) {
1782     // No wave mode execution
1783     Changed |= lowerKillInstrs(false);
1784   } else if (GlobalFlags == StateWQM) {
1785     // Shader only needs WQM
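         // Enter WQM once in the prolog:
         //   S_WQM_B32/_B64 EXEC, EXEC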
1786     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1787                   .addReg(Exec);
1788     LIS->InsertMachineInstrInMaps(*MI);
1789     lowerKillInstrs(true);
1790     Changed = true;
1791   } else {
1792     // Mark entry for WQM if required.
1793     if (GlobalFlags & StateWQM)
1794       Blocks[&Entry].InNeeds |= StateWQM;
1795     // Wave mode switching requires the full lowering pass.
1796     for (auto &BII : Blocks)
1797       processBlock(*BII.first, BII.second, BII.first == &Entry);
1798     // Lowering blocks may split them, so perform lowering as a second pass.
1799     for (auto &BII : Blocks)
1800       lowerBlock(*BII.first, BII.second);
1801     Changed = true;
1802   }
1803 
1804   // Compute live range for live mask
1805   if (LiveMaskReg != Exec)
1806     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1807 
1808   // Physical registers like SCC aren't tracked by default anyway, so just
1809   // removing the ranges we computed is the simplest option for maintaining
1810   // the analysis results.
1811   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1812 
1813   // Drop stale EXEC live ranges if we lowered any kills or INIT_EXEC pseudos.
1814   if (!KillInstrs.empty() || !InitExecInstrs.empty())
1815     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1816 
1817   return Changed;
1818 }
1819