xref: /llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision 6afc4b0629c8dc26236af72688b8c036cf090c32)
1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15 /// always be enabled irrespective of control flow decisions. Conversely in
16 /// non-strict WQM inactive lanes may take part in control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that
20 /// WQM is enabled when necessary, but disabled around stores and atomics.
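///
/// For example, the coordinate inputs of an IMAGE_SAMPLE must be computed in
/// WQM so that helper lanes hold valid values for the implicit derivatives,
/// while a BUFFER_STORE of the sampled result must execute in Exact mode so
/// that helper lanes do not write to memory.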
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
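// Execution states tracked by the pass. These are bitmask flags so that a
// single value can describe several acceptable states at once (e.g. an
// instruction that may run in either Exact or WQM).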
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
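// Per-instruction analysis results. Needs holds the states the instruction
// itself requires, Disabled holds states that may not be enabled for it, and
// OutNeeds holds the states required by instructions executed after it.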
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131 };
132 
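// Per-block analysis results. Needs, InNeeds and OutNeeds track the states
// required within the block, on entry to it, and on exit from it.
// InitialState records the state chosen at the top of the block by
// processBlock, and NeedsLowering marks blocks that lowerBlock must revisit
// (kills, demotes and strict mode transitions).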
133 struct BlockInfo {
134   char Needs = 0;
135   char InNeeds = 0;
136   char OutNeeds = 0;
137   char InitialState = 0;
138   bool NeedsLowering = false;
139 };
140 
141 struct WorkItem {
142   MachineBasicBlock *MBB = nullptr;
143   MachineInstr *MI = nullptr;
144 
145   WorkItem() = default;
146   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147   WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149 
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152   const SIInstrInfo *TII;
153   const SIRegisterInfo *TRI;
154   const GCNSubtarget *ST;
155   MachineRegisterInfo *MRI;
156   LiveIntervals *LIS;
157   MachineDominatorTree *MDT;
158   MachinePostDominatorTree *PDT;
159 
160   unsigned AndOpc;
161   unsigned AndTermOpc;
162   unsigned AndN2Opc;
163   unsigned XorOpc;
164   unsigned AndSaveExecOpc;
165   unsigned AndSaveExecTermOpc;
166   unsigned WQMOpc;
167   Register Exec;
168   Register LiveMaskReg;
169 
170   DenseMap<const MachineInstr *, InstrInfo> Instructions;
171   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172 
173   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174   DenseMap<const MachineInstr *, char> StateTransition;
175 
176   SmallVector<MachineInstr *, 2> LiveMaskQueries;
177   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179   SmallVector<MachineInstr *, 4> KillInstrs;
180 
181   void printInfo();
182 
183   void markInstruction(MachineInstr &MI, char Flag,
184                        std::vector<WorkItem> &Worklist);
185   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
186                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
187   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
188                    std::vector<WorkItem> &Worklist);
189   void markInstructionUses(const MachineInstr &MI, char Flag,
190                            std::vector<WorkItem> &Worklist);
191   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
192   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
193   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
194   char analyzeFunction(MachineFunction &MF);
195 
196   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
197                                       MachineBasicBlock::iterator Before);
198   MachineBasicBlock::iterator
199   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
200                    MachineBasicBlock::iterator Last, bool PreferLast,
201                    bool SaveSCC);
202   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
203                Register SaveWQM);
204   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
205              Register SavedWQM);
206   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
207                     Register SaveOrig, char StrictStateNeeded);
208   void fromStrictMode(MachineBasicBlock &MBB,
209                       MachineBasicBlock::iterator Before, Register SavedOrig,
210                       char NonStrictState, char CurrentStrictState);
211 
212   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
213 
214   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
215                             bool IsWQM);
216   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
217   void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
218                              MachineInstr *Exit);
219 
220   void lowerBlock(MachineBasicBlock &MBB);
221   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
222 
223   void lowerLiveMaskQueries();
224   void lowerCopyInstrs();
225   void lowerKillInstrs(bool IsWQM);
226 
227 public:
228   static char ID;
229 
230   SIWholeQuadMode() :
231     MachineFunctionPass(ID) { }
232 
233   bool runOnMachineFunction(MachineFunction &MF) override;
234 
235   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236 
237   void getAnalysisUsage(AnalysisUsage &AU) const override {
238     AU.addRequired<LiveIntervals>();
239     AU.addPreserved<SlotIndexes>();
240     AU.addPreserved<LiveIntervals>();
241     AU.addRequired<MachineDominatorTree>();
242     AU.addPreserved<MachineDominatorTree>();
243     AU.addRequired<MachinePostDominatorTree>();
244     AU.addPreserved<MachinePostDominatorTree>();
245     MachineFunctionPass::getAnalysisUsage(AU);
246   }
247 
248   MachineFunctionProperties getClearedProperties() const override {
249     return MachineFunctionProperties().set(
250         MachineFunctionProperties::Property::IsSSA);
251   }
252 };
253 
254 } // end anonymous namespace
255 
256 char SIWholeQuadMode::ID = 0;
257 
258 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
259                       false)
260 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
261 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
262 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
263 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
264                     false)
265 
266 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267 
268 FunctionPass *llvm::createSIWholeQuadModePass() {
269   return new SIWholeQuadMode;
270 }
271 
272 #ifndef NDEBUG
273 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
274   for (const auto &BII : Blocks) {
275     dbgs() << "\n"
276            << printMBBReference(*BII.first) << ":\n"
277            << "  InNeeds = " << PrintState(BII.second.InNeeds)
278            << ", Needs = " << PrintState(BII.second.Needs)
279            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280 
281     for (const MachineInstr &MI : *BII.first) {
282       auto III = Instructions.find(&MI);
283       if (III == Instructions.end())
284         continue;
285 
286       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
287              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
288     }
289   }
290 }
291 #endif
292 
293 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
294                                       std::vector<WorkItem> &Worklist) {
295   InstrInfo &II = Instructions[&MI];
296 
297   assert(!(Flag & StateExact) && Flag != 0);
298 
299   // Remove any disabled states from the flag. The user that required it gets
300   // an undefined value in the helper lanes. For example, this can happen if
301   // the result of an atomic is used by an instruction that requires WQM,
302   // where ignoring the request for WQM is correct as per the relevant specs.
303   Flag &= ~II.Disabled;
304 
305   // Ignore if the flag is already encompassed by the existing needs, or we
306   // just disabled everything.
307   if ((II.Needs & Flag) == Flag)
308     return;
309 
310   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
311   II.Needs |= Flag;
312   Worklist.push_back(&MI);
313 }
314 
315 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
316 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
317                                Register Reg, unsigned SubReg, char Flag,
318                                std::vector<WorkItem> &Worklist) {
319   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
320 
321   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
322   const VNInfo *Value = UseLRQ.valueIn();
323   if (!Value)
324     return;
325 
326   // Note: this code assumes that lane masks on AMDGPU completely
327   // cover registers.
328   const LaneBitmask UseLanes =
329       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
330              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
331                                 : LaneBitmask::getNone());
332 
333   // Perform a depth-first iteration of the LiveRange graph marking defs.
334   // Stop processing of a given branch when all use lanes have been defined.
335   // The first definition stops processing for a physical register.
336   struct PhiEntry {
337     const VNInfo *Phi;
338     unsigned PredIdx;
339     LaneBitmask DefinedLanes;
340 
341     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
342         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
343   };
344   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
345   SmallVector<PhiEntry, 2> PhiStack;
346   SmallSet<VisitKey, 4> Visited;
347   LaneBitmask DefinedLanes;
348   unsigned NextPredIdx = 0; // Only used for processing phi nodes
349   do {
350     const VNInfo *NextValue = nullptr;
351     const VisitKey Key(Value, DefinedLanes);
352 
353     if (Visited.insert(Key).second) {
354       // On first visit to a phi, start processing the first predecessor
355       NextPredIdx = 0;
356     }
357 
358     if (Value->isPHIDef()) {
359       // Each predecessor node in the phi must be processed as a subgraph
360       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
361       assert(MBB && "Phi-def has no defining MBB");
362 
363       // Find next predecessor to process
364       unsigned Idx = NextPredIdx;
365       auto PI = MBB->pred_begin() + Idx;
366       auto PE = MBB->pred_end();
367       for (; PI != PE && !NextValue; ++PI, ++Idx) {
368         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
369           if (!Visited.count(VisitKey(VN, DefinedLanes)))
370             NextValue = VN;
371         }
372       }
373 
374       // If there are more predecessors to process, add the phi to the stack
375       if (PI != PE)
376         PhiStack.emplace_back(Value, Idx, DefinedLanes);
377     } else {
378       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
379       assert(MI && "Def has no defining instruction");
380 
381       if (Reg.isVirtual()) {
382         // Iterate over all operands to find relevant definitions
383         bool HasDef = false;
384         for (const MachineOperand &Op : MI->all_defs()) {
385           if (Op.getReg() != Reg)
386             continue;
387 
388           // Compute lanes defined and overlap with use
389           LaneBitmask OpLanes =
390               Op.isUndef() ? LaneBitmask::getAll()
391                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
392           LaneBitmask Overlap = (UseLanes & OpLanes);
393 
394           // Record if this instruction defined any lanes of the use
395           HasDef |= Overlap.any();
396 
397           // Mark any lanes defined
398           DefinedLanes |= OpLanes;
399         }
400 
401         // Check if all lanes of use have been defined
402         if ((DefinedLanes & UseLanes) != UseLanes) {
403           // Definition not complete; need to process input value
404           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
405           if (const VNInfo *VN = LRQ.valueIn()) {
406             if (!Visited.count(VisitKey(VN, DefinedLanes)))
407               NextValue = VN;
408           }
409         }
410 
411         // Only mark the instruction if it defines some part of the use
412         if (HasDef)
413           markInstruction(*MI, Flag, Worklist);
414       } else {
415         // For physical registers simply mark the defining instruction
416         markInstruction(*MI, Flag, Worklist);
417       }
418     }
419 
420     if (!NextValue && !PhiStack.empty()) {
421       // Reached end of chain; revert to processing the last phi
422       PhiEntry &Entry = PhiStack.back();
423       NextValue = Entry.Phi;
424       NextPredIdx = Entry.PredIdx;
425       DefinedLanes = Entry.DefinedLanes;
426       PhiStack.pop_back();
427     }
428 
429     Value = NextValue;
430   } while (Value);
431 }
432 
433 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
434                                   const MachineOperand &Op, char Flag,
435                                   std::vector<WorkItem> &Worklist) {
436   assert(Op.isReg());
437   Register Reg = Op.getReg();
438 
439   // Ignore some hardware registers
440   switch (Reg) {
441   case AMDGPU::EXEC:
442   case AMDGPU::EXEC_LO:
443     return;
444   default:
445     break;
446   }
447 
448   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
449                     << " for " << MI);
450   if (Reg.isVirtual()) {
451     LiveRange &LR = LIS->getInterval(Reg);
452     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
453   } else {
454     // Handle physical registers that we need to track; this is mostly relevant
455     // for VCC, which can appear as the (implicit) input of a uniform branch,
456     // e.g. when a loop counter is stored in a VGPR.
457     for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
458          ++RegUnit) {
459       LiveRange &LR = LIS->getRegUnit(*RegUnit);
460       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
461       if (!Value)
462         continue;
463 
464       markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
465     }
466   }
467 }
468 
469 /// Mark all instructions defining the uses in \p MI with \p Flag.
470 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
471                                           std::vector<WorkItem> &Worklist) {
472   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
473                     << MI);
474 
475   for (const MachineOperand &Use : MI.all_uses())
476     markOperand(MI, Use, Flag, Worklist);
477 }
478 
479 // Scan instructions to determine which ones require an Exact execmask and
480 // which ones seed WQM requirements.
481 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
482                                        std::vector<WorkItem> &Worklist) {
483   char GlobalFlags = 0;
484   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
485   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
486   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
487   bool HasImplicitDerivatives =
488       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
489 
490   // We need to visit the basic blocks in reverse post-order so that we visit
491   // defs before uses, in particular so that we don't accidentally mark an
492   // instruction as needing e.g. WQM before visiting it and realizing it needs
493   // WQM disabled.
494   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
495   for (MachineBasicBlock *MBB : RPOT) {
496     BlockInfo &BBI = Blocks[MBB];
497 
498     for (MachineInstr &MI : *MBB) {
499       InstrInfo &III = Instructions[&MI];
500       unsigned Opcode = MI.getOpcode();
501       char Flags = 0;
502 
503       if (TII->isWQM(Opcode)) {
504         // If LOD is not supported, WQM is not needed.
505         if (!ST->hasExtendedImageInsts())
506           continue;
507         // Only generate implicit WQM if implicit derivatives are required.
508         // This avoids inserting unintended WQM if a shader type without
509         // implicit derivatives uses an image sampling instruction.
510         if (!HasImplicitDerivatives)
511           continue;
512         // Sampling instructions don't need to produce results for all pixels
513         // in a quad; they just require all inputs of a quad to have been
514         // computed for derivatives.
515         markInstructionUses(MI, StateWQM, Worklist);
516         GlobalFlags |= StateWQM;
517         continue;
518       } else if (Opcode == AMDGPU::WQM) {
519         // The WQM intrinsic requires its output to have all the helper lanes
520         // correct, so we need it to be in WQM.
521         Flags = StateWQM;
522         LowerToCopyInstrs.push_back(&MI);
523       } else if (Opcode == AMDGPU::SOFT_WQM) {
524         LowerToCopyInstrs.push_back(&MI);
525         SoftWQMInstrs.push_back(&MI);
526         continue;
527       } else if (Opcode == AMDGPU::STRICT_WWM) {
528         // The STRICT_WWM intrinsic doesn't make the same guarantee; moreover,
529         // it needs to be executed in WQM or Exact so that its copy doesn't
530         // clobber inactive lanes.
531         markInstructionUses(MI, StateStrictWWM, Worklist);
532         GlobalFlags |= StateStrictWWM;
533         LowerToMovInstrs.push_back(&MI);
534         continue;
535       } else if (Opcode == AMDGPU::STRICT_WQM ||
536                  TII->isDualSourceBlendEXP(MI)) {
537         // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
538         // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
539         // quads that have at least one active thread.
540         markInstructionUses(MI, StateStrictWQM, Worklist);
541         GlobalFlags |= StateStrictWQM;
542 
543         if (Opcode == AMDGPU::STRICT_WQM) {
544           LowerToMovInstrs.push_back(&MI);
545         } else {
546           // Dual source blend export acts as implicit strict-wqm: its sources
547           // need to be shuffled in strict wqm, but the export itself needs to
548           // run in exact mode.
549           BBI.Needs |= StateExact;
550           if (!(BBI.InNeeds & StateExact)) {
551             BBI.InNeeds |= StateExact;
552             Worklist.push_back(MBB);
553           }
554           GlobalFlags |= StateExact;
555           III.Disabled = StateWQM | StateStrict;
556         }
557         continue;
558       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
559                  Opcode == AMDGPU::LDS_DIRECT_LOAD) {
560         // Mark these STRICT_WQM, but only for the instruction, not its operands.
561         // This avoids unnecessarily marking M0 as requiring WQM.
562         InstrInfo &II = Instructions[&MI];
563         II.Needs |= StateStrictWQM;
564         GlobalFlags |= StateStrictWQM;
565         continue;
566       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
567                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
568         III.Disabled = StateStrict;
569         MachineOperand &Inactive = MI.getOperand(2);
570         if (Inactive.isReg()) {
571           if (Inactive.isUndef()) {
572             LowerToCopyInstrs.push_back(&MI);
573           } else {
574             markOperand(MI, Inactive, StateStrictWWM, Worklist);
575           }
576         }
577         SetInactiveInstrs.push_back(&MI);
578         continue;
579       } else if (TII->isDisableWQM(MI)) {
580         BBI.Needs |= StateExact;
581         if (!(BBI.InNeeds & StateExact)) {
582           BBI.InNeeds |= StateExact;
583           Worklist.push_back(MBB);
584         }
585         GlobalFlags |= StateExact;
586         III.Disabled = StateWQM | StateStrict;
587         continue;
588       } else {
589         if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
590           LiveMaskQueries.push_back(&MI);
591         } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
592                    Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
593                    Opcode == AMDGPU::SI_DEMOTE_I1) {
594           KillInstrs.push_back(&MI);
595           BBI.NeedsLowering = true;
596         } else if (WQMOutputs) {
597           // The function is in machine SSA form, which means that physical
598           // VGPRs correspond to shader inputs and outputs. Inputs are
599           // only used, outputs are only defined.
600           // FIXME: is this still valid?
601           for (const MachineOperand &MO : MI.defs()) {
602             if (!MO.isReg())
603               continue;
604 
605             Register Reg = MO.getReg();
606 
607             if (!Reg.isVirtual() &&
608                 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
609               Flags = StateWQM;
610               break;
611             }
612           }
613         }
614 
615         if (!Flags)
616           continue;
617       }
618 
619       markInstruction(MI, Flags, Worklist);
620       GlobalFlags |= Flags;
621     }
622   }
623 
624   // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
625   // ever used anywhere in the function. This implements the corresponding
626   // semantics of @llvm.amdgcn.set.inactive.
627   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
628   if (GlobalFlags & StateWQM) {
629     for (MachineInstr *MI : SetInactiveInstrs)
630       markInstruction(*MI, StateWQM, Worklist);
631     for (MachineInstr *MI : SoftWQMInstrs)
632       markInstruction(*MI, StateWQM, Worklist);
633   }
634 
635   return GlobalFlags;
636 }
637 
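// Propagate the analysis results of a single instruction: to its containing
// block, backwards to the preceding instruction's OutNeeds, and to the
// instructions defining its inputs.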
638 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
639                                            std::vector<WorkItem>& Worklist) {
640   MachineBasicBlock *MBB = MI.getParent();
641   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
642   BlockInfo &BI = Blocks[MBB];
643 
644   // Control flow-type instructions and stores to temporary memory that are
645   // followed by WQM computations must themselves be in WQM.
646   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
647       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
648     Instructions[&MI].Needs = StateWQM;
649     II.Needs = StateWQM;
650   }
651 
652   // Propagate to block level
653   if (II.Needs & StateWQM) {
654     BI.Needs |= StateWQM;
655     if (!(BI.InNeeds & StateWQM)) {
656       BI.InNeeds |= StateWQM;
657       Worklist.push_back(MBB);
658     }
659   }
660 
661   // Propagate backwards within block
662   if (MachineInstr *PrevMI = MI.getPrevNode()) {
663     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
664     if (!PrevMI->isPHI()) {
665       InstrInfo &PrevII = Instructions[PrevMI];
666       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
667         PrevII.OutNeeds |= InNeeds;
668         Worklist.push_back(PrevMI);
669       }
670     }
671   }
672 
673   // Propagate WQM flag to instruction inputs
674   assert(!(II.Needs & StateExact));
675 
676   if (II.Needs != 0)
677     markInstructionUses(MI, II.Needs, Worklist);
678 
679   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
680   // not require any WQM transitions.
681   if (II.Needs & StateStrictWWM)
682     BI.Needs |= StateStrictWWM;
683   if (II.Needs & StateStrictWQM)
684     BI.Needs |= StateStrictWQM;
685 }
686 
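// Propagate block-level needs to the block's last instruction and to the
// predecessor and successor blocks.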
687 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
688                                      std::vector<WorkItem>& Worklist) {
689   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
690 
691   // Propagate through instructions
692   if (!MBB.empty()) {
693     MachineInstr *LastMI = &*MBB.rbegin();
694     InstrInfo &LastII = Instructions[LastMI];
695     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
696       LastII.OutNeeds |= BI.OutNeeds;
697       Worklist.push_back(LastMI);
698     }
699   }
700 
701   // Predecessor blocks must provide for our WQM/Exact needs.
702   for (MachineBasicBlock *Pred : MBB.predecessors()) {
703     BlockInfo &PredBI = Blocks[Pred];
704     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
705       continue;
706 
707     PredBI.OutNeeds |= BI.InNeeds;
708     PredBI.InNeeds |= BI.InNeeds;
709     Worklist.push_back(Pred);
710   }
711 
712   // All successors must be prepared to accept the same set of WQM/Exact data.
713   for (MachineBasicBlock *Succ : MBB.successors()) {
714     BlockInfo &SuccBI = Blocks[Succ];
715     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
716       continue;
717 
718     SuccBI.InNeeds |= BI.OutNeeds;
719     Worklist.push_back(Succ);
720   }
721 }
722 
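// Run the backwards data-flow analysis: scanInstructions seeds the worklist,
// then instruction- and block-level propagation is iterated until a fixed
// point is reached.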
723 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
724   std::vector<WorkItem> Worklist;
725   char GlobalFlags = scanInstructions(MF, Worklist);
726 
727   while (!Worklist.empty()) {
728     WorkItem WI = Worklist.back();
729     Worklist.pop_back();
730 
731     if (WI.MI)
732       propagateInstruction(*WI.MI, Worklist);
733     else
734       propagateBlock(*WI.MBB, Worklist);
735   }
736 
737   return GlobalFlags;
738 }
739 
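// Preserve SCC across an insertion point by copying it through a scratch
// SGPR; returns an iterator at the restoring copy, so that SCC-clobbering
// code can be inserted between the save and the restore.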
740 MachineBasicBlock::iterator
741 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
742                          MachineBasicBlock::iterator Before) {
743   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
744 
745   MachineInstr *Save =
746       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
747           .addReg(AMDGPU::SCC);
748   MachineInstr *Restore =
749       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
750           .addReg(SaveReg);
751 
752   LIS->InsertMachineInstrInMaps(*Save);
753   LIS->InsertMachineInstrInMaps(*Restore);
754   LIS->createAndComputeVirtRegInterval(SaveReg);
755 
756   return Restore;
757 }
758 
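// Split the block after TermMI, converting supported S_AND/S_MOV opcodes into
// their *_term variants and updating the (post)dominator trees and live
// intervals.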
759 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
760                                                MachineInstr *TermMI) {
761   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
762                     << *TermMI << "\n");
763 
764   MachineBasicBlock *SplitBB =
765       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
766 
767   // Convert last instruction in block to a terminator.
768   // Note: this only covers the expected patterns
769   unsigned NewOpcode = 0;
770   switch (TermMI->getOpcode()) {
771   case AMDGPU::S_AND_B32:
772     NewOpcode = AMDGPU::S_AND_B32_term;
773     break;
774   case AMDGPU::S_AND_B64:
775     NewOpcode = AMDGPU::S_AND_B64_term;
776     break;
777   case AMDGPU::S_MOV_B32:
778     NewOpcode = AMDGPU::S_MOV_B32_term;
779     break;
780   case AMDGPU::S_MOV_B64:
781     NewOpcode = AMDGPU::S_MOV_B64_term;
782     break;
783   default:
784     break;
785   }
786   if (NewOpcode)
787     TermMI->setDesc(TII->get(NewOpcode));
788 
789   if (SplitBB != BB) {
790     // Update dominator trees
791     using DomTreeT = DomTreeBase<MachineBasicBlock>;
792     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
793     for (MachineBasicBlock *Succ : SplitBB->successors()) {
794       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
795       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
796     }
797     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
798     if (MDT)
799       MDT->getBase().applyUpdates(DTUpdates);
800     if (PDT)
801       PDT->getBase().applyUpdates(DTUpdates);
802 
803     // Link blocks
804     MachineInstr *MI =
805         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
806             .addMBB(SplitBB);
807     LIS->InsertMachineInstrInMaps(*MI);
808   }
809 
810   return SplitBB;
811 }
812 
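// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit an inverted V_CMP into VCC so
// that VCC holds the killed lanes, clear those lanes from the live mask and
// from EXEC, and terminate the wave early if no lanes remain live. Roughly
// (wave64 opcodes shown):
//
//   V_CMP_<inverted cond>_F32 vcc, src1, src0
//   S_ANDN2_B64 LiveMask, LiveMask, vcc
//   SI_EARLY_TERMINATE_SCC0
//   S_ANDN2_B64 exec, exec, vcc
//   S_BRANCH succ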
813 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
814                                             MachineInstr &MI) {
815   const DebugLoc &DL = MI.getDebugLoc();
816   unsigned Opcode = 0;
817 
818   assert(MI.getOperand(0).isReg());
819 
820   // Comparison is for live lanes; however here we compute the inverse
821   // (killed lanes).  This is because VCMP will always generate 0 bits
822   // for inactive lanes so a mask of live lanes would not be correct
823   // inside control flow.
824   // Invert the comparison by swapping the operands and adjusting
825   // the comparison codes.
826 
827   switch (MI.getOperand(2).getImm()) {
828   case ISD::SETUEQ:
829     Opcode = AMDGPU::V_CMP_LG_F32_e64;
830     break;
831   case ISD::SETUGT:
832     Opcode = AMDGPU::V_CMP_GE_F32_e64;
833     break;
834   case ISD::SETUGE:
835     Opcode = AMDGPU::V_CMP_GT_F32_e64;
836     break;
837   case ISD::SETULT:
838     Opcode = AMDGPU::V_CMP_LE_F32_e64;
839     break;
840   case ISD::SETULE:
841     Opcode = AMDGPU::V_CMP_LT_F32_e64;
842     break;
843   case ISD::SETUNE:
844     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
845     break;
846   case ISD::SETO:
847     Opcode = AMDGPU::V_CMP_O_F32_e64;
848     break;
849   case ISD::SETUO:
850     Opcode = AMDGPU::V_CMP_U_F32_e64;
851     break;
852   case ISD::SETOEQ:
853   case ISD::SETEQ:
854     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
855     break;
856   case ISD::SETOGT:
857   case ISD::SETGT:
858     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
859     break;
860   case ISD::SETOGE:
861   case ISD::SETGE:
862     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
863     break;
864   case ISD::SETOLT:
865   case ISD::SETLT:
866     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
867     break;
868   case ISD::SETOLE:
869   case ISD::SETLE:
870     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
871     break;
872   case ISD::SETONE:
873   case ISD::SETNE:
874     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
875     break;
876   default:
877     llvm_unreachable("invalid ISD:SET cond code");
878   }
879 
880   // Pick opcode based on comparison type.
881   MachineInstr *VcmpMI;
882   const MachineOperand &Op0 = MI.getOperand(0);
883   const MachineOperand &Op1 = MI.getOperand(1);
884 
885   // VCC represents lanes killed.
886   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
887 
888   if (TRI->isVGPR(*MRI, Op0.getReg())) {
889     Opcode = AMDGPU::getVOPe32(Opcode);
890     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
891   } else {
892     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
893                  .addReg(VCC, RegState::Define)
894                  .addImm(0) // src0 modifiers
895                  .add(Op1)
896                  .addImm(0) // src1 modifiers
897                  .add(Op0)
898                  .addImm(0); // omod
899   }
900 
901   MachineInstr *MaskUpdateMI =
902       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
903           .addReg(LiveMaskReg)
904           .addReg(VCC);
905 
906   // State of SCC represents whether any lanes are live in the mask;
907   // if SCC is 0 then no lanes will be alive anymore.
908   MachineInstr *EarlyTermMI =
909       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
910 
911   MachineInstr *ExecMaskMI =
912       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
913 
914   assert(MBB.succ_size() == 1);
915   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
916                               .addMBB(*MBB.succ_begin());
917 
918   // Update live intervals
919   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
920   MBB.remove(&MI);
921 
922   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
923   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
924   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
925   LIS->InsertMachineInstrInMaps(*NewTerm);
926 
927   return NewTerm;
928 }
929 
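// Lower SI_KILL_I1_TERMINATOR and SI_DEMOTE_I1. Static kills either clear all
// active lanes from the live mask or fold away entirely; dynamic kills AND
// the condition (or its inverse) out of the live mask and then update EXEC.
// Demotes in WQM keep whole quads alive by applying S_WQM to the live mask
// before updating EXEC.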
930 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
931                                            MachineInstr &MI, bool IsWQM) {
932   const DebugLoc &DL = MI.getDebugLoc();
933   MachineInstr *MaskUpdateMI = nullptr;
934 
935   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
936   const MachineOperand &Op = MI.getOperand(0);
937   int64_t KillVal = MI.getOperand(1).getImm();
938   MachineInstr *ComputeKilledMaskMI = nullptr;
939   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
940   Register TmpReg;
941 
942   // Is this a static or dynamic kill?
943   if (Op.isImm()) {
944     if (Op.getImm() == KillVal) {
945       // Static: all active lanes are killed
946       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
947                          .addReg(LiveMaskReg)
948                          .addReg(Exec);
949     } else {
950       // Static: kill does nothing
951       MachineInstr *NewTerm = nullptr;
952       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
953         LIS->RemoveMachineInstrFromMaps(MI);
954       } else {
955         assert(MBB.succ_size() == 1);
956         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
957                       .addMBB(*MBB.succ_begin());
958         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
959       }
960       MBB.remove(&MI);
961       return NewTerm;
962     }
963   } else {
964     if (!KillVal) {
965       // Op represents live lanes after kill,
966       // so exec mask needs to be factored in.
967       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
968       ComputeKilledMaskMI =
969           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
970       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
971                          .addReg(LiveMaskReg)
972                          .addReg(TmpReg);
973     } else {
974       // Op represents lanes to kill
975       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
976                          .addReg(LiveMaskReg)
977                          .add(Op);
978     }
979   }
980 
981   // State of SCC represents whether any lanes are live in the mask;
982   // if SCC is 0 then no lanes will be alive anymore.
983   MachineInstr *EarlyTermMI =
984       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
985 
986   // If we got this far, some lanes are still live,
987   // update EXEC to deactivate lanes as appropriate.
988   MachineInstr *NewTerm;
989   MachineInstr *WQMMaskMI = nullptr;
990   Register LiveMaskWQM;
991   if (IsDemote) {
992     // Demote - deactivate quads with only helper lanes
993     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
994     WQMMaskMI =
995         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
996     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
997                   .addReg(Exec)
998                   .addReg(LiveMaskWQM);
999   } else {
1000     // Kill - deactivate lanes no longer in live mask
1001     if (Op.isImm()) {
1002       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1003       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1004     } else if (!IsWQM) {
1005       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1006                     .addReg(Exec)
1007                     .addReg(LiveMaskReg);
1008     } else {
1009       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1010       NewTerm =
1011           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1012     }
1013   }
1014 
1015   // Update live intervals
1016   LIS->RemoveMachineInstrFromMaps(MI);
1017   MBB.remove(&MI);
1018   assert(EarlyTermMI);
1019   assert(MaskUpdateMI);
1020   assert(NewTerm);
1021   if (ComputeKilledMaskMI)
1022     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1023   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1024   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1025   if (WQMMaskMI)
1026     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1027   LIS->InsertMachineInstrInMaps(*NewTerm);
1028 
1029   if (CndReg) {
1030     LIS->removeInterval(CndReg);
1031     LIS->createAndComputeVirtRegInterval(CndReg);
1032   }
1033   if (TmpReg)
1034     LIS->createAndComputeVirtRegInterval(TmpReg);
1035   if (LiveMaskWQM)
1036     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1037 
1038   return NewTerm;
1039 }
1040 
1041 // Convert a strict mode transition to a pseudo transition.
1042 // This still pre-allocates registers to prevent clobbering,
1043 // but avoids any EXEC mask changes.
1044 void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1045                                             MachineInstr *Entry,
1046                                             MachineInstr *Exit) {
1047   assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1048   assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1049 
1050   Register SaveOrig = Entry->getOperand(0).getReg();
1051 
1052   MachineInstr *NewEntry =
1053     BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1054   MachineInstr *NewExit =
1055     BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1056 
1057   LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1058   Exit->eraseFromParent();
1059 
1060   LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1061   Entry->eraseFromParent();
1062 
1063   LIS->removeInterval(SaveOrig);
1064 }
1065 
1066 // Replace (or supplement) instructions accessing the live mask.
1067 // This can only happen once all the live mask registers have been created
1068 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1069 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1070   auto BII = Blocks.find(&MBB);
1071   if (BII == Blocks.end())
1072     return;
1073 
1074   const BlockInfo &BI = BII->second;
1075   if (!BI.NeedsLowering)
1076     return;
1077 
1078   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1079 
1080   SmallVector<MachineInstr *, 4> SplitPoints;
1081   char State = BI.InitialState;
1082   MachineInstr *StrictEntry = nullptr;
1083 
1084   for (MachineInstr &MI : llvm::make_early_inc_range(
1085            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1086     char PreviousState = State;
1087 
1088     if (StateTransition.count(&MI))
1089       State = StateTransition[&MI];
1090 
1091     MachineInstr *SplitPoint = nullptr;
1092     switch (MI.getOpcode()) {
1093     case AMDGPU::SI_DEMOTE_I1:
1094     case AMDGPU::SI_KILL_I1_TERMINATOR:
1095       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1096       break;
1097     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1098       SplitPoint = lowerKillF32(MBB, MI);
1099       break;
1100     case AMDGPU::ENTER_STRICT_WQM:
1101       StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1102       break;
1103     case AMDGPU::EXIT_STRICT_WQM:
1104       if (State == StateWQM && StrictEntry) {
1105         // Transition WQM -> StrictWQM -> WQM detected.
1106         lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1107       }
1108       StrictEntry = nullptr;
1109       break;
1110     case AMDGPU::ENTER_STRICT_WWM:
1111     case AMDGPU::EXIT_STRICT_WWM:
1112       StrictEntry = nullptr;
1113       break;
1114     default:
1115       break;
1116     }
1117     if (SplitPoint)
1118       SplitPoints.push_back(SplitPoint);
1119   }
1120 
1121   // Perform splitting after instruction scan to simplify iteration.
1122   if (!SplitPoints.empty()) {
1123     MachineBasicBlock *BB = &MBB;
1124     for (MachineInstr *MI : SplitPoints) {
1125       BB = splitBlock(BB, MI);
1126     }
1127   }
1128 }
1129 
1130 // Return an iterator in the (inclusive) range [First, Last] at which
1131 // instructions can be safely inserted, keeping in mind that some of the
1132 // instructions we want to add necessarily clobber SCC.
1133 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1134     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1135     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1136   if (!SaveSCC)
1137     return PreferLast ? Last : First;
1138 
1139   LiveRange &LR =
1140       LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1141   auto MBBE = MBB.end();
1142   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1143                                      : LIS->getMBBEndIdx(&MBB);
1144   SlotIndex LastIdx =
1145       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1146   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1147   const LiveRange::Segment *S;
1148 
1149   for (;;) {
1150     S = LR.getSegmentContaining(Idx);
1151     if (!S)
1152       break;
1153 
1154     if (PreferLast) {
1155       SlotIndex Next = S->start.getBaseIndex();
1156       if (Next < FirstIdx)
1157         break;
1158       Idx = Next;
1159     } else {
1160       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1161       assert(EndMI && "Segment does not end on valid instruction");
1162       auto NextI = std::next(EndMI->getIterator());
1163       if (NextI == MBB.end())
1164         break;
1165       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1166       if (Next > LastIdx)
1167         break;
1168       Idx = Next;
1169     }
1170   }
1171 
1172   MachineBasicBlock::iterator MBBI;
1173 
1174   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1175     MBBI = MI;
1176   else {
1177     assert(Idx == LIS->getMBBEndIdx(&MBB));
1178     MBBI = MBB.end();
1179   }
1180 
1181   // Move insertion point past any operations modifying EXEC.
1182   // This assumes that the value of SCC defined by any of these operations
1183   // does not need to be preserved.
1184   while (MBBI != Last) {
1185     bool IsExecDef = false;
1186     for (const MachineOperand &MO : MBBI->all_defs()) {
1187       IsExecDef |=
1188           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1189     }
1190     if (!IsExecDef)
1191       break;
1192     MBBI++;
1193     S = nullptr;
1194   }
1195 
1196   if (S)
1197     MBBI = saveSCC(MBB, MBBI);
1198 
1199   return MBBI;
1200 }
1201 
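// Switch to Exact mode by ANDing EXEC with the live mask, optionally saving
// the current EXEC into SaveWQM; the *_term opcode variants are used when
// inserting among terminators.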
1202 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1203                               MachineBasicBlock::iterator Before,
1204                               Register SaveWQM) {
1205   bool IsTerminator = Before == MBB.end();
1206   if (!IsTerminator) {
1207     auto FirstTerm = MBB.getFirstTerminator();
1208     if (FirstTerm != MBB.end()) {
1209       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1210       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1211       IsTerminator = BeforeIdx > FirstTermIdx;
1212     }
1213   }
1214 
1215   MachineInstr *MI;
1216 
1217   if (SaveWQM) {
1218     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1219     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1220              .addReg(LiveMaskReg);
1221   } else {
1222     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1223     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1224              .addReg(Exec)
1225              .addReg(LiveMaskReg);
1226   }
1227 
1228   LIS->InsertMachineInstrInMaps(*MI);
1229   StateTransition[MI] = StateExact;
1230 }
1231 
1232 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1233                             MachineBasicBlock::iterator Before,
1234                             Register SavedWQM) {
1235   MachineInstr *MI;
1236 
1237   if (SavedWQM) {
1238     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1239              .addReg(SavedWQM);
1240   } else {
1241     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1242   }
1243 
1244   LIS->InsertMachineInstrInMaps(*MI);
1245   StateTransition[MI] = StateWQM;
1246 }
1247 
1248 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1249                                    MachineBasicBlock::iterator Before,
1250                                    Register SaveOrig, char StrictStateNeeded) {
1251   MachineInstr *MI;
1252   assert(SaveOrig);
1253   assert(StrictStateNeeded == StateStrictWWM ||
1254          StrictStateNeeded == StateStrictWQM);
1255 
1256   if (StrictStateNeeded == StateStrictWWM) {
1257     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1258                  SaveOrig)
1259              .addImm(-1);
1260   } else {
1261     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1262                  SaveOrig)
1263              .addImm(-1);
1264   }
1265   LIS->InsertMachineInstrInMaps(*MI);
1266   StateTransition[MI] = StrictStateNeeded;
1267 
1268   // Mark block as needing lower so it will be checked for unnecessary transitions.
1269   auto BII = Blocks.find(&MBB);
1270   if (BII != Blocks.end())
1271     BII->second.NeedsLowering = true;
1272 }
1273 
1274 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1275                                      MachineBasicBlock::iterator Before,
1276                                      Register SavedOrig, char NonStrictState,
1277                                      char CurrentStrictState) {
1278   MachineInstr *MI;
1279 
1280   assert(SavedOrig);
1281   assert(CurrentStrictState == StateStrictWWM ||
1282          CurrentStrictState == StateStrictWQM);
1283 
1284   if (CurrentStrictState == StateStrictWWM) {
1285     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1286                  Exec)
1287              .addReg(SavedOrig);
1288   } else {
1289     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1290                  Exec)
1291              .addReg(SavedOrig);
1292   }
1293   LIS->InsertMachineInstrInMaps(*MI);
1294   StateTransition[MI] = NonStrictState;
1295 }
1296 
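// Walk the block and insert the EXEC manipulation needed to satisfy each
// instruction's computed needs, switching between Exact, WQM and the strict
// modes at the latest point that is still legal.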
1297 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1298   auto BII = Blocks.find(&MBB);
1299   if (BII == Blocks.end())
1300     return;
1301 
1302   BlockInfo &BI = BII->second;
1303 
1304   // This is a non-entry block that is WQM throughout, so no need to do
1305   // anything.
1306   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1307     BI.InitialState = StateWQM;
1308     return;
1309   }
1310 
1311   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1312                     << ":\n");
1313 
1314   Register SavedWQMReg;
1315   Register SavedNonStrictReg;
1316   bool WQMFromExec = IsEntry;
1317   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1318   char NonStrictState = 0;
1319   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1320 
1321   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1322   if (IsEntry) {
1323     // Skip the instruction that saves LiveMask
1324     if (II != IE && II->getOpcode() == AMDGPU::COPY)
1325       ++II;
1326   }
1327 
1328   // This stores the first instruction where it's safe to switch from WQM to
1329   // Exact or vice versa.
1330   MachineBasicBlock::iterator FirstWQM = IE;
1331 
1332   // This stores the first instruction where it's safe to switch from Strict
1333   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1334   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1335   // be safe to switch to/from WQM as well.
1336   MachineBasicBlock::iterator FirstStrict = IE;
1337 
1338   // Record initial state in block information.
1339   BI.InitialState = State;
1340 
1341   for (;;) {
1342     MachineBasicBlock::iterator Next = II;
1343     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1344     char OutNeeds = 0;
1345 
1346     if (FirstWQM == IE)
1347       FirstWQM = II;
1348 
1349     if (FirstStrict == IE)
1350       FirstStrict = II;
1351 
1352     // First, figure out the allowed states (Needs) based on the propagated
1353     // flags.
1354     if (II != IE) {
1355       MachineInstr &MI = *II;
1356 
1357       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1358         auto III = Instructions.find(&MI);
1359         if (III != Instructions.end()) {
1360           if (III->second.Needs & StateStrictWWM)
1361             Needs = StateStrictWWM;
1362           else if (III->second.Needs & StateStrictWQM)
1363             Needs = StateStrictWQM;
1364           else if (III->second.Needs & StateWQM)
1365             Needs = StateWQM;
1366           else
1367             Needs &= ~III->second.Disabled;
1368           OutNeeds = III->second.OutNeeds;
1369         }
1370       } else {
1371         // If the instruction doesn't actually need a correct EXEC, then we can
1372         // safely leave Strict mode enabled.
1373         Needs = StateExact | StateWQM | StateStrict;
1374       }
1375 
1376       // Exact mode exit can occur in terminators, but must be before branches.
1377       if (MI.isBranch() && OutNeeds == StateExact)
1378         Needs = StateExact;
1379 
1380       ++Next;
1381     } else {
1382       // End of basic block
1383       if (BI.OutNeeds & StateWQM)
1384         Needs = StateWQM;
1385       else if (BI.OutNeeds == StateExact)
1386         Needs = StateExact;
1387       else
1388         Needs = StateWQM | StateExact;
1389     }
1390 
1391     // Now, transition if necessary.
1392     if (!(Needs & State)) {
1393       MachineBasicBlock::iterator First;
1394       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1395           State == StateStrictWQM || Needs == StateStrictWQM) {
1396         // We must switch to or from Strict mode.
1397         First = FirstStrict;
1398       } else {
1399         // We only need to switch to/from WQM, so we can use FirstWQM.
1400         First = FirstWQM;
1401       }
1402 
1403       // Whether we need to save SCC depends on start and end states.
1404       bool SaveSCC = false;
1405       switch (State) {
1406       case StateExact:
1407       case StateStrictWWM:
1408       case StateStrictWQM:
1409         // Exact/Strict -> Strict: save SCC
1410         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1411         // Exact/Strict -> Exact: no save
1412         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1413         break;
1414       case StateWQM:
1415         // WQM -> Exact/Strict: save SCC
1416         SaveSCC = !(Needs & StateWQM);
1417         break;
1418       default:
1419         llvm_unreachable("Unknown state");
1420         break;
1421       }
1422       MachineBasicBlock::iterator Before =
1423           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1424 
1425       if (State & StateStrict) {
1426         assert(State == StateStrictWWM || State == StateStrictWQM);
1427         assert(SavedNonStrictReg);
1428         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1429 
1430         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1431         SavedNonStrictReg = 0;
1432         State = NonStrictState;
1433       }
1434 
1435       if (Needs & StateStrict) {
1436         NonStrictState = State;
1437         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1438         assert(!SavedNonStrictReg);
1439         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1440 
1441         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1442         State = Needs;
1443 
1444       } else {
1445         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1446           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1447             assert(!SavedWQMReg);
1448             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1449           }
1450 
1451           toExact(MBB, Before, SavedWQMReg);
1452           State = StateExact;
1453         } else if (State == StateExact && (Needs & StateWQM) &&
1454                    !(Needs & StateExact)) {
1455           assert(WQMFromExec == (SavedWQMReg == 0));
1456 
1457           toWQM(MBB, Before, SavedWQMReg);
1458 
1459           if (SavedWQMReg) {
1460             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1461             SavedWQMReg = 0;
1462           }
1463           State = StateWQM;
1464         } else {
1465           // We can get here if we transitioned from StrictWWM to a
1466           // non-StrictWWM state that already matches our needs; in that
1467           // case there is nothing to do.
1468           assert(Needs & State);
1469         }
1470       }
1471     }
1472 
1473     if (Needs != (StateExact | StateWQM | StateStrict)) {
1474       if (Needs != (StateExact | StateWQM))
1475         FirstWQM = IE;
1476       FirstStrict = IE;
1477     }
1478 
1479     if (II == IE)
1480       break;
1481 
1482     II = Next;
1483   }
1484   assert(!SavedWQMReg);
1485   assert(!SavedNonStrictReg);
1486 }
1487 
1488 void SIWholeQuadMode::lowerLiveMaskQueries() {
1489   for (MachineInstr *MI : LiveMaskQueries) {
1490     const DebugLoc &DL = MI->getDebugLoc();
1491     Register Dest = MI->getOperand(0).getReg();
1492 
1493     MachineInstr *Copy =
1494         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1495             .addReg(LiveMaskReg);
1496 
1497     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1498     MI->eraseFromParent();
1499   }
1500 }
1501 
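     // Lower the copy-like pseudos recorded during analysis into plain moves
     // or COPYs, stripping exec dependencies where they are no longer needed.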
1502 void SIWholeQuadMode::lowerCopyInstrs() {
1503   for (MachineInstr *MI : LowerToMovInstrs) {
1504     assert(MI->getNumExplicitOperands() == 2);
1505 
1506     const Register Reg = MI->getOperand(0).getReg();
1507 
1508     const TargetRegisterClass *regClass =
1509         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1510     if (TRI->isVGPRClass(regClass)) {
1511       const unsigned MovOp = TII->getMovOpcode(regClass);
1512       MI->setDesc(TII->get(MovOp));
1513 
1514       // Check that it already implicitly depends on exec (like all VALU movs
1515       // should do).
1516       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1517         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1518       }));
1519     } else {
1520       // Remove early-clobber and exec dependency from simple SGPR copies.
1521       // This allows some to be eliminated during/post RA.
1522       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1523       if (MI->getOperand(0).isEarlyClobber()) {
1524         LIS->removeInterval(Reg);
1525         MI->getOperand(0).setIsEarlyClobber(false);
1526         LIS->createAndComputeVirtRegInterval(Reg);
1527       }
1528       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1529       while (Index >= 0) {
1530         MI->removeOperand(Index);
1531         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1532       }
1533       MI->setDesc(TII->get(AMDGPU::COPY));
1534       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1535     }
1536   }
1537   for (MachineInstr *MI : LowerToCopyInstrs) {
1538     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1539         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1540       assert(MI->getNumExplicitOperands() == 3);
1541       // The only reason we should be here is that V_SET_INACTIVE has an
1542       // undef input, so it is being replaced by a simple copy. The second
1543       // source operand should also be undef and is removed here.
1544       assert(MI->getOperand(2).isUndef());
1545       MI->removeOperand(2);
1546       MI->untieRegOperand(1);
1547     } else {
1548       assert(MI->getNumExplicitOperands() == 2);
1549     }
1550 
1551     unsigned CopyOp = MI->getOperand(1).isReg()
1552                           ? (unsigned)AMDGPU::COPY
1553                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1554                                 *MRI, MI->getOperand(0)));
1555     MI->setDesc(TII->get(CopyOp));
1556   }
1557 }
1558 
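     // Lower the deferred kill and demote pseudos. Lowering can introduce new
     // control flow, in which case the containing block is split afterwards.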
1559 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1560   for (MachineInstr *MI : KillInstrs) {
1561     MachineBasicBlock *MBB = MI->getParent();
1562     MachineInstr *SplitPoint = nullptr;
1563     switch (MI->getOpcode()) {
1564     case AMDGPU::SI_DEMOTE_I1:
1565     case AMDGPU::SI_KILL_I1_TERMINATOR:
1566       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1567       break;
1568     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1569       SplitPoint = lowerKillF32(*MBB, *MI);
1570       break;
1571     default:
1572       continue;
1573     }
1574     if (SplitPoint)
1575       splitBlock(MBB, SplitPoint);
1576   }
1577 }
1578 
1579 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1580   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1581                     << " ------------- \n");
1582   LLVM_DEBUG(MF.dump());
1583 
1584   Instructions.clear();
1585   Blocks.clear();
1586   LiveMaskQueries.clear();
1587   LowerToCopyInstrs.clear();
1588   LowerToMovInstrs.clear();
1589   KillInstrs.clear();
1590   StateTransition.clear();
1591 
1592   ST = &MF.getSubtarget<GCNSubtarget>();
1593 
1594   TII = ST->getInstrInfo();
1595   TRI = &TII->getRegisterInfo();
1596   MRI = &MF.getRegInfo();
1597   LIS = &getAnalysis<LiveIntervals>();
1598   MDT = &getAnalysis<MachineDominatorTree>();
1599   PDT = &getAnalysis<MachinePostDominatorTree>();
1600 
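       // Select the wave-size specific scalar opcodes and exec register.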
1601   if (ST->isWave32()) {
1602     AndOpc = AMDGPU::S_AND_B32;
1603     AndTermOpc = AMDGPU::S_AND_B32_term;
1604     AndN2Opc = AMDGPU::S_ANDN2_B32;
1605     XorOpc = AMDGPU::S_XOR_B32;
1606     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1607     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1608     WQMOpc = AMDGPU::S_WQM_B32;
1609     Exec = AMDGPU::EXEC_LO;
1610   } else {
1611     AndOpc = AMDGPU::S_AND_B64;
1612     AndTermOpc = AMDGPU::S_AND_B64_term;
1613     AndN2Opc = AMDGPU::S_ANDN2_B64;
1614     XorOpc = AMDGPU::S_XOR_B64;
1615     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1616     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1617     WQMOpc = AMDGPU::S_WQM_B64;
1618     Exec = AMDGPU::EXEC;
1619   }
1620 
1621   const char GlobalFlags = analyzeFunction(MF);
1622   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1623 
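       // By default the live mask is exec itself; a separate copy is only
       // created below when kills, live-mask queries, or WQM require it.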
1624   LiveMaskReg = Exec;
1625 
1626   // Shader is simple and does not need any state changes or complex lowering
1627   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1628       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1629     lowerLiveMaskQueries();
1630     return !LiveMaskQueries.empty();
1631   }
1632 
1633   MachineBasicBlock &Entry = MF.front();
1634   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1635 
1636   // Store a copy of the original live mask when required
1637   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1638     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1639     MachineInstr *MI =
1640         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1641             .addReg(Exec);
1642     LIS->InsertMachineInstrInMaps(*MI);
1643   }
1644 
1645   LLVM_DEBUG(printInfo());
1646 
1647   lowerLiveMaskQueries();
1648   lowerCopyInstrs();
1649 
1650   // Shader only needs WQM
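       // The entire shader can run in WQM, so a single S_WQM of exec in the
       // entry block suffices and no per-block state tracking is needed.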
1651   if (GlobalFlags == StateWQM) {
1652     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1653                   .addReg(Exec);
1654     LIS->InsertMachineInstrInMaps(*MI);
1655     lowerKillInstrs(true);
1656   } else {
1657     for (auto BII : Blocks)
1658       processBlock(*BII.first, BII.first == &Entry);
1659     // Lowering blocks causes block splitting, so perform it as a second pass.
1660     for (auto BII : Blocks)
1661       lowerBlock(*BII.first);
1662   }
1663 
1664   // Compute the live range for the live mask
1665   if (LiveMaskReg != Exec)
1666     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1667 
1668   // Physical registers like SCC aren't tracked by default anyway, so just
1669   // removing the ranges we computed is the simplest option for maintaining
1670   // the analysis results.
1671   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1672 
1673   // If we performed any kills then EXEC changed, so drop its cached live ranges
1674   if (!KillInstrs.empty())
1675     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1676 
1677   return true;
1678 }
1679