1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUInstPrinter.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/LiveRegUnits.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 
27 using namespace llvm;
28 
29 #define GET_REGINFO_TARGET_DESC
30 #include "AMDGPUGenRegisterInfo.inc"
31 
32 static cl::opt<bool> EnableSpillSGPRToVGPR(
33   "amdgpu-spill-sgpr-to-vgpr",
34   cl::desc("Enable spilling SGPRs to VGPRs"),
35   cl::ReallyHidden,
36   cl::init(true));
37 
38 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
39 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40 
41 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42 // Valid indexes are shifted by 1, such that a 0 mapping means unsupported.
43 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44 //      meaning index 7 in SubRegFromChannelTable.
45 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
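// For example, a 2-DWORD (64-bit) access has SubRegFromChannelTableWidthMap[2]
// == 2, i.e. it uses row 1 of SubRegFromChannelTable, and the channel number
// then selects the column within that row (see getSubRegFromChannel below).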
47 
48 static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49                                  const Twine &ErrMsg) {
50   Fn.getContext().diagnose(
51       DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52 }
53 
54 namespace llvm {
55 
56 // A temporary struct used to spill SGPRs.
57 // It is mostly needed when spilling SGPRs to memory; spilling SGPRs into VGPR
58 // lanes emits just v_writelane and v_readlane.
59 //
60 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61 // is saved to scratch (or the other way around for loads).
62 // For this, a VGPR is required where the needed lanes can be clobbered. The
63 // RegScavenger can provide a VGPR where currently active lanes can be
64 // clobbered, but we still need to save inactive lanes.
65 // The high-level steps are:
66 // - Try to scavenge SGPR(s) to save exec
67 // - Try to scavenge VGPR
68 // - Save the needed, all, or only the inactive lanes of a TmpVGPR
69 // - Spill/Restore SGPRs using TmpVGPR
70 // - Restore TmpVGPR
71 //
72 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73 // cannot scavenge temporary SGPRs to save exec, we use the following code:
74 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75 // s_not exec, exec
76 // buffer_store_dword TmpVGPR ; save inactive lanes
77 // s_not exec, exec
78 struct SGPRSpillBuilder {
79   struct PerVGPRData {
80     unsigned PerVGPR;
81     unsigned NumVGPRs;
82     int64_t VGPRLanes;
83   };
84 
85   // The SGPR to save
86   Register SuperReg;
87   MachineBasicBlock::iterator MI;
88   ArrayRef<int16_t> SplitParts;
89   unsigned NumSubRegs;
90   bool IsKill;
91   const DebugLoc &DL;
92 
93   /* When spilling to stack */
94   // The SGPRs are written into this VGPR, which is then written to scratch
95   // (or vice versa for loads).
96   Register TmpVGPR = AMDGPU::NoRegister;
97   // Temporary spill slot to save TmpVGPR to.
98   int TmpVGPRIndex = 0;
99   // Whether TmpVGPR is live before the spill (i.e. it could not be scavenged).
100   bool TmpVGPRLive = false;
101   // Scavenged SGPR to save EXEC.
102   Register SavedExecReg = AMDGPU::NoRegister;
103   // Stack index to write the SGPRs to.
104   int Index;
105   unsigned EltSize = 4;
106 
107   RegScavenger *RS;
108   MachineBasicBlock *MBB;
109   MachineFunction &MF;
110   SIMachineFunctionInfo &MFI;
111   const SIInstrInfo &TII;
112   const SIRegisterInfo &TRI;
113   bool IsWave32;
114   Register ExecReg;
115   unsigned MovOpc;
116   unsigned NotOpc;
117 
118   SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119                    bool IsWave32, MachineBasicBlock::iterator MI, int Index,
120                    RegScavenger *RS)
121       : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122                          MI->getOperand(0).isKill(), Index, RS) {}
123 
124   SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
125                    bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
126                    bool IsKill, int Index, RegScavenger *RS)
127       : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128         Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129         MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
130         IsWave32(IsWave32) {
131     const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132     SplitParts = TRI.getRegSplitParts(RC, EltSize);
133     NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134 
135     if (IsWave32) {
136       ExecReg = AMDGPU::EXEC_LO;
137       MovOpc = AMDGPU::S_MOV_B32;
138       NotOpc = AMDGPU::S_NOT_B32;
139     } else {
140       ExecReg = AMDGPU::EXEC;
141       MovOpc = AMDGPU::S_MOV_B64;
142       NotOpc = AMDGPU::S_NOT_B64;
143     }
144 
145     assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146     assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147            SuperReg != AMDGPU::EXEC && "exec should never spill");
148   }
149 
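  // A small worked example: for a wave64 spill of a 512-bit SGPR tuple
  // (NumSubRegs == 16), this yields PerVGPR == 64, NumVGPRs == 1 and
  // VGPRLanes == 0xffff, i.e. one temporary VGPR using its low 16 lanes.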
150   PerVGPRData getPerVGPRData() {
151     PerVGPRData Data;
152     Data.PerVGPR = IsWave32 ? 32 : 64;
153     Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154     Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155     return Data;
156   }
157 
158   // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159   // free.
160   // Writes these instructions if an SGPR can be scavenged:
161   // s_mov_b64 s[6:7], exec   ; Save exec
162   // s_mov_b64 exec, 3        ; Wanted lanemask
163   // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
164   //
165   // Writes these instructions if no SGPR can be scavenged:
166   // buffer_store_dword v0    ; Only if no free VGPR was found
167   // s_not_b64 exec, exec
168   // buffer_store_dword v0    ; Save inactive lanes
169   //                          ; exec stays inverted, it is flipped back in
170   //                          ; restore.
171   void prepare() {
172     // Scavenged temporary VGPR to use. It must be scavenged once for any number
173     // of spilled subregs.
174     // FIXME: The liveness analysis is limited and does not tell if a register
175     // is in use in lanes that are currently inactive. We can never be sure if
176     // a register is actually in use in another lane, so we need to save all
177     // used lanes of the chosen VGPR.
178     assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179     TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180                                             0, false);
181 
182     // Reserve temporary stack slot
183     TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184     if (TmpVGPR) {
185       // Found a register that is dead in the currently active lanes; we only
186       // need to spill the inactive lanes.
187       TmpVGPRLive = false;
188     } else {
189       // Pick v0 because it doesn't make a difference.
190       TmpVGPR = AMDGPU::VGPR0;
191       TmpVGPRLive = true;
192     }
193 
194     if (TmpVGPRLive) {
195       // We need to inform the scavenger that this index is already in use until
196       // we're done with the custom emergency spill.
197       RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198     }
199 
200     // We may end up recursively calling the scavenger, and don't want to re-use
201     // the same register.
202     RS->setRegUsed(TmpVGPR);
203 
204     // Try to scavenge SGPRs to save exec
205     assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206     const TargetRegisterClass &RC =
207         IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208     RS->setRegUsed(SuperReg);
209     SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210 
211     int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212 
213     if (SavedExecReg) {
214       RS->setRegUsed(SavedExecReg);
215       // Set exec to needed lanes
216       BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
217       auto I =
218           BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219       if (!TmpVGPRLive)
220         I.addReg(TmpVGPR, RegState::ImplicitDefine);
221       // Spill needed lanes
222       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223     } else {
224       // Modifying and restoring exec clobbers SCC, which we would have to save
225       // and restore. FIXME: We would probably need to reserve a register for
226       // this.
227       if (RS->isRegUsed(AMDGPU::SCC))
228         emitUnsupportedError(MF.getFunction(), *MI,
229                              "unhandled SGPR spill to memory");
230 
231       // Spill active lanes
232       if (TmpVGPRLive)
233         TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234                                     /*IsKill*/ false);
235       // Spill inactive lanes
236       auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237       if (!TmpVGPRLive)
238         I.addReg(TmpVGPR, RegState::ImplicitDefine);
239       I->getOperand(2).setIsDead(); // Mark SCC as dead.
240       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241     }
242   }
243 
244   // Writes these instructions if an SGPR can be scavenged:
245   // buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
246   // s_waitcnt vmcnt(0)       ; If a free VGPR was found
247   // s_mov_b64 exec, s[6:7]   ; Restore exec
248   //
249   // Writes these instructions if no SGPR can be scavenged:
250   // buffer_load_dword v0     ; Restore inactive lanes
251   // s_waitcnt vmcnt(0)       ; If a free VGPR was found
252   // s_not_b64 exec, exec
253   // buffer_load_dword v0     ; Only if no free VGPR was found
254   void restore() {
255     if (SavedExecReg) {
256       // Restore used lanes
257       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258                                   /*IsKill*/ false);
259       // Restore exec
260       auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
261                    .addReg(SavedExecReg, RegState::Kill);
262       // Add an implicit use of the load so it is not dead.
263       // FIXME This inserts an unnecessary waitcnt
264       if (!TmpVGPRLive) {
265         I.addReg(TmpVGPR, RegState::ImplicitKill);
266       }
267     } else {
268       // Restore inactive lanes
269       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270                                   /*IsKill*/ false);
271       auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272       if (!TmpVGPRLive)
273         I.addReg(TmpVGPR, RegState::ImplicitKill);
274       I->getOperand(2).setIsDead(); // Mark SCC as dead.
275 
276       // Restore active lanes
277       if (TmpVGPRLive)
278         TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279     }
280 
281     // Inform the scavenger where we're releasing our custom scavenged register.
282     if (TmpVGPRLive) {
283       MachineBasicBlock::iterator RestorePt = std::prev(MI);
284       RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285     }
286   }
287 
288   // Write TmpVGPR to memory or read TmpVGPR from memory.
289   // Either using a single buffer_load/store if exec is set to the needed mask
290   // or using
291   // buffer_load
292   // s_not exec, exec
293   // buffer_load
294   // s_not exec, exec
295   void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296     if (SavedExecReg) {
297       // Spill needed lanes
298       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299     } else {
300       // Modifying and restoring exec clobbers SCC, which we would have to save
301       // and restore. FIXME: We would probably need to reserve a register for
302       // this.
303       if (RS->isRegUsed(AMDGPU::SCC))
304         emitUnsupportedError(MF.getFunction(), *MI,
305                              "unhandled SGPR spill to memory");
306 
307       // Spill active lanes
308       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309                                   /*IsKill*/ false);
310       // Spill inactive lanes
311       auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312       Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314       auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315       Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316     }
317   }
318 
319   void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
320     assert(MBB->getParent() == &MF);
321     MI = NewMI;
322     MBB = NewMBB;
323   }
324 };
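
// Rough usage sketch (the actual driver is in the SGPR spill/restore paths
// further down in this file): construct an SGPRSpillBuilder for the spill
// pseudo, call prepare() to scavenge a TmpVGPR and, if possible, an SGPR to
// save exec, move SGPR values in and out of TmpVGPR lanes with
// v_writelane/v_readlane, flush TmpVGPR to the stack slot with
// readWriteTmpVGPR(), and finally call restore() to undo the exec and TmpVGPR
// changes.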
325 
326 } // namespace llvm
327 
328 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
329     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330                             ST.getAMDGPUDwarfFlavour(),
331                             /*PC=*/0, ST.getHwMode()),
332       ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
333 
334   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
335          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
336          (getSubRegIndexLaneMask(AMDGPU::lo16) |
337           getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
338            getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
339          "getNumCoveredRegs() will not work with generated subreg masks!");
340 
341   RegPressureIgnoredUnits.resize(getNumRegUnits());
342   RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
343   for (auto Reg : AMDGPU::VGPR_16RegClass) {
344     if (AMDGPU::isHi16Reg(Reg, *this))
345       RegPressureIgnoredUnits.set(*regunits(Reg).begin());
346   }
347 
348   // HACK: Until this is fully tablegen'd.
349   static llvm::once_flag InitializeRegSplitPartsFlag;
350 
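  // After initialization, RegSplitParts[N-1] holds the N-DWORD sub-register
  // indices in channel order, e.g. RegSplitParts[0] is {sub0, sub1, ...} and
  // RegSplitParts[1] is {sub0_sub1, sub2_sub3, ...}; only positions whose
  // offset is a multiple of the sub-register size are filled.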
351   static auto InitializeRegSplitPartsOnce = [this]() {
352     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
353       unsigned Size = getSubRegIdxSize(Idx);
354       if (Size & 31)
355         continue;
356       std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
357       unsigned Pos = getSubRegIdxOffset(Idx);
358       if (Pos % Size)
359         continue;
360       Pos /= Size;
361       if (Vec.empty()) {
362         unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
363         Vec.resize(MaxNumParts);
364       }
365       Vec[Pos] = Idx;
366     }
367   };
368 
369   static llvm::once_flag InitializeSubRegFromChannelTableFlag;
370 
371   static auto InitializeSubRegFromChannelTableOnce = [this]() {
372     for (auto &Row : SubRegFromChannelTable)
373       Row.fill(AMDGPU::NoSubRegister);
374     for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
375       unsigned Width = getSubRegIdxSize(Idx) / 32;
376       unsigned Offset = getSubRegIdxOffset(Idx) / 32;
377       assert(Width < SubRegFromChannelTableWidthMap.size());
378       Width = SubRegFromChannelTableWidthMap[Width];
379       if (Width == 0)
380         continue;
381       unsigned TableIdx = Width - 1;
382       assert(TableIdx < SubRegFromChannelTable.size());
383       assert(Offset < SubRegFromChannelTable[TableIdx].size());
384       SubRegFromChannelTable[TableIdx][Offset] = Idx;
385     }
386   };
387 
388   llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
389   llvm::call_once(InitializeSubRegFromChannelTableFlag,
390                   InitializeSubRegFromChannelTableOnce);
391 }
392 
393 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
394                                            MCRegister Reg) const {
395   for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
396     Reserved.set(*R);
397 }
398 
399 // Forced to be here by one .inc
400 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
401   const MachineFunction *MF) const {
402   CallingConv::ID CC = MF->getFunction().getCallingConv();
403   switch (CC) {
404   case CallingConv::C:
405   case CallingConv::Fast:
406   case CallingConv::Cold:
407     return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
408                                : CSR_AMDGPU_SaveList;
409   case CallingConv::AMDGPU_Gfx:
410     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
411                                : CSR_AMDGPU_SI_Gfx_SaveList;
412   case CallingConv::AMDGPU_CS_ChainPreserve:
413     return CSR_AMDGPU_CS_ChainPreserve_SaveList;
414   default: {
415     // Dummy to not crash RegisterClassInfo.
416     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
417     return &NoCalleeSavedReg;
418   }
419   }
420 }
421 
422 const MCPhysReg *
423 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
424   return nullptr;
425 }
426 
427 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
428                                                      CallingConv::ID CC) const {
429   switch (CC) {
430   case CallingConv::C:
431   case CallingConv::Fast:
432   case CallingConv::Cold:
433     return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
434                                : CSR_AMDGPU_RegMask;
435   case CallingConv::AMDGPU_Gfx:
436     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
437                                : CSR_AMDGPU_SI_Gfx_RegMask;
438   case CallingConv::AMDGPU_CS_Chain:
439   case CallingConv::AMDGPU_CS_ChainPreserve:
440     // Calls to these functions never return, so we can pretend everything is
441     // preserved.
442     return AMDGPU_AllVGPRs_RegMask;
443   default:
444     return nullptr;
445   }
446 }
447 
448 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
449   return CSR_AMDGPU_NoRegs_RegMask;
450 }
451 
452 bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
453   return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
454 }
455 
456 const TargetRegisterClass *
457 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
458                                           const MachineFunction &MF) const {
459   // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
460   // equivalent AV class. If we used one here, the verifier would crash after
461   // RegBankSelect in the GISel flow, because the aligned regclasses are not
462   // fully given until instruction selection.
463   if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
464     if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
465       return &AMDGPU::AV_32RegClass;
466     if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
467       return &AMDGPU::AV_64RegClass;
468     if (RC == &AMDGPU::VReg_64_Align2RegClass ||
469         RC == &AMDGPU::AReg_64_Align2RegClass)
470       return &AMDGPU::AV_64_Align2RegClass;
471     if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
472       return &AMDGPU::AV_96RegClass;
473     if (RC == &AMDGPU::VReg_96_Align2RegClass ||
474         RC == &AMDGPU::AReg_96_Align2RegClass)
475       return &AMDGPU::AV_96_Align2RegClass;
476     if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
477       return &AMDGPU::AV_128RegClass;
478     if (RC == &AMDGPU::VReg_128_Align2RegClass ||
479         RC == &AMDGPU::AReg_128_Align2RegClass)
480       return &AMDGPU::AV_128_Align2RegClass;
481     if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
482       return &AMDGPU::AV_160RegClass;
483     if (RC == &AMDGPU::VReg_160_Align2RegClass ||
484         RC == &AMDGPU::AReg_160_Align2RegClass)
485       return &AMDGPU::AV_160_Align2RegClass;
486     if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
487       return &AMDGPU::AV_192RegClass;
488     if (RC == &AMDGPU::VReg_192_Align2RegClass ||
489         RC == &AMDGPU::AReg_192_Align2RegClass)
490       return &AMDGPU::AV_192_Align2RegClass;
491     if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
492       return &AMDGPU::AV_256RegClass;
493     if (RC == &AMDGPU::VReg_256_Align2RegClass ||
494         RC == &AMDGPU::AReg_256_Align2RegClass)
495       return &AMDGPU::AV_256_Align2RegClass;
496     if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
497       return &AMDGPU::AV_512RegClass;
498     if (RC == &AMDGPU::VReg_512_Align2RegClass ||
499         RC == &AMDGPU::AReg_512_Align2RegClass)
500       return &AMDGPU::AV_512_Align2RegClass;
501     if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
502       return &AMDGPU::AV_1024RegClass;
503     if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
504         RC == &AMDGPU::AReg_1024_Align2RegClass)
505       return &AMDGPU::AV_1024_Align2RegClass;
506   }
507 
508   return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
509 }
510 
511 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
512   const SIFrameLowering *TFI = ST.getFrameLowering();
513   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
514   // During ISel lowering we always reserve the stack pointer in entry and chain
515   // functions, but never actually want to reference it when accessing our own
516   // frame. If we need a frame pointer we use it, but otherwise we can just use
517   // an immediate "0" which we represent by returning NoRegister.
518   if (FuncInfo->isBottomOfStack()) {
519     return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
520   }
521   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
522                         : FuncInfo->getStackPtrOffsetReg();
523 }
524 
525 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
526   // When we need stack realignment, we can't reference off of the
527   // stack pointer, so we reserve a base pointer.
528   return shouldRealignStack(MF);
529 }
530 
531 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
532 
533 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
534   return AMDGPU_AllVGPRs_RegMask;
535 }
536 
537 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
538   return AMDGPU_AllAGPRs_RegMask;
539 }
540 
541 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
542   return AMDGPU_AllVectorRegs_RegMask;
543 }
544 
545 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
546   return AMDGPU_AllAllocatableSRegs_RegMask;
547 }
548 
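// For example (using the tables built in the constructor),
// getSubRegFromChannel(0, 1) returns AMDGPU::sub0 and getSubRegFromChannel(2, 2)
// returns the 64-bit index starting at channel 2, i.e. AMDGPU::sub2_sub3.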
549 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
550                                               unsigned NumRegs) {
551   assert(NumRegs < SubRegFromChannelTableWidthMap.size());
552   unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
553   assert(NumRegIndex && "Not implemented");
554   assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
555   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
556 }
557 
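// For example, assuming ST.getMaxNumSGPRs(MF) == 102 and Align == 4, BaseIdx is
// alignDown(102, 4) - 4 == 96, so for a 128-bit RC this returns s[96:99], the
// highest naturally aligned 4-SGPR tuple that fits below the SGPR limit.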
558 MCRegister
559 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
560                                         const unsigned Align,
561                                         const TargetRegisterClass *RC) const {
562   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
563   MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
564   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
565 }
566 
567 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
568   const MachineFunction &MF) const {
569   return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
570 }
571 
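// A worked example for the GFX90A split below: with a combined budget of 256
// vector registers, a function that uses AGPRs gets 128 VGPRs + 128 AGPRs,
// while one that does not gets all 256 as VGPRs and 0 AGPRs; with the maximum
// combined budget of 512, both cases end up as 256 + 256.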
572 std::pair<unsigned, unsigned>
573 SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
574   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
575   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
576   unsigned MaxNumAGPRs = MaxNumVGPRs;
577   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
578 
579   // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
580   // a wave may have up to 512 vector registers in total, combining both
581   // VGPRs and AGPRs. Hence, in an entry function without calls and without
582   // any AGPR use, it is possible to spend the whole vector register budget
583   // on VGPRs.
584   //
585   // TODO: it should be possible to estimate the maximum AGPR/VGPR pressure and
586   //       split the register file accordingly.
587   if (ST.hasGFX90AInsts()) {
588     if (MFI->usesAGPRs(MF)) {
589       MaxNumVGPRs /= 2;
590       MaxNumAGPRs = MaxNumVGPRs;
591     } else {
592       if (MaxNumVGPRs > TotalNumVGPRs) {
593         MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
594         MaxNumVGPRs = TotalNumVGPRs;
595       } else
596         MaxNumAGPRs = 0;
597     }
598   }
599 
600   return std::pair(MaxNumVGPRs, MaxNumAGPRs);
601 }
602 
603 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
604   BitVector Reserved(getNumRegs());
605   Reserved.set(AMDGPU::MODE);
606 
607   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
608 
609   // Reserve special purpose registers.
610   //
611   // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
612   // this seems likely to result in bugs, so I'm marking them as reserved.
613   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
614   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
615 
616   // M0 has to be reserved so that llvm accepts it as a live-in to a block.
617   reserveRegisterTuples(Reserved, AMDGPU::M0);
618 
619   // Reserve src_vccz, src_execz, src_scc.
620   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
621   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
622   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
623 
624   // Reserve the memory aperture registers
625   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
626   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
627   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
628   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
629 
630   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
631   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
632 
633   // Reserve xnack_mask registers - support is not implemented in Codegen.
634   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
635 
636   // Reserve lds_direct register - support is not implemented in Codegen.
637   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
638 
639   // Reserve Trap Handler registers - support is not implemented in Codegen.
640   reserveRegisterTuples(Reserved, AMDGPU::TBA);
641   reserveRegisterTuples(Reserved, AMDGPU::TMA);
642   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
643   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
644   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
645   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
646   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
647   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
648   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
649   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
650 
651   // Reserve null register - it shall never be allocated
652   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
653 
654   // Reserve SGPRs.
655   //
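  // For example, if MaxNumSGPRs is 102, this reserves s102 and above as well
  // as any wider SGPR tuple that extends past s101.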
656   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
657   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
658   for (const TargetRegisterClass *RC : regclasses()) {
659     if (RC->isBaseClass() && isSGPRClass(RC)) {
660       unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
661       for (MCPhysReg Reg : *RC) {
662         unsigned Index = getHWRegIndex(Reg);
663         if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
664           Reserved.set(Reg);
665       }
666     }
667   }
668 
669   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
670   if (ScratchRSrcReg != AMDGPU::NoRegister) {
671     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
672     // need to spill.
673     // TODO: May need to reserve a VGPR if doing LDS spilling.
674     reserveRegisterTuples(Reserved, ScratchRSrcReg);
675   }
676 
677   Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
678   if (LongBranchReservedReg)
679     reserveRegisterTuples(Reserved, LongBranchReservedReg);
680 
681   // We have to assume the SP is needed in case there are calls in the function,
682   // which is detected after the function is lowered. If we aren't really going
683   // to need SP, don't bother reserving it.
684   MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
685   if (StackPtrReg) {
686     reserveRegisterTuples(Reserved, StackPtrReg);
687     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
688   }
689 
690   MCRegister FrameReg = MFI->getFrameOffsetReg();
691   if (FrameReg) {
692     reserveRegisterTuples(Reserved, FrameReg);
693     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
694   }
695 
696   if (hasBasePointer(MF)) {
697     MCRegister BasePtrReg = getBaseRegister();
698     reserveRegisterTuples(Reserved, BasePtrReg);
699     assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
700   }
701 
702   // FIXME: Use same reserved register introduced in D149775
703   // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
704   Register ExecCopyReg = MFI->getSGPRForEXECCopy();
705   if (ExecCopyReg)
706     reserveRegisterTuples(Reserved, ExecCopyReg);
707 
708   // Reserve VGPRs/AGPRs.
709   //
710   auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
711 
712   for (const TargetRegisterClass *RC : regclasses()) {
713     if (RC->isBaseClass() && isVGPRClass(RC)) {
714       unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
715       for (MCPhysReg Reg : *RC) {
716         unsigned Index = getHWRegIndex(Reg);
717         if (Index + NumRegs > MaxNumVGPRs)
718           Reserved.set(Reg);
719       }
720     }
721   }
722 
723   // Reserve all the AGPRs if there are no instructions to use them.
724   if (!ST.hasMAIInsts())
725     MaxNumAGPRs = 0;
726   for (const TargetRegisterClass *RC : regclasses()) {
727     if (RC->isBaseClass() && isAGPRClass(RC)) {
728       unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
729       for (MCPhysReg Reg : *RC) {
730         unsigned Index = getHWRegIndex(Reg);
731         if (Index + NumRegs > MaxNumAGPRs)
732           Reserved.set(Reg);
733       }
734     }
735   }
736 
737   // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
738   // VGPR available at all times.
739   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
740     reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
741   }
742 
743   // During wwm-regalloc, reserve the registers for per-lane VGPR allocation. The
744   // MFI->getNonWWMRegMask() field will have a valid bitmask only during
745   // wwm-regalloc and is empty otherwise.
746   BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
747   if (!NonWWMRegMask.empty()) {
748     for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
749          RegI < RegE; ++RegI) {
750       if (NonWWMRegMask.test(RegI))
751         reserveRegisterTuples(Reserved, RegI);
752     }
753   }
754 
755   for (Register Reg : MFI->getWWMReservedRegs())
756     reserveRegisterTuples(Reserved, Reg);
757 
758   // FIXME: Stop using reserved registers for this.
759   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
760     reserveRegisterTuples(Reserved, Reg);
761 
762   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
763     reserveRegisterTuples(Reserved, Reg);
764 
765   return Reserved;
766 }
767 
768 bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
769                                       MCRegister PhysReg) const {
770   return !MF.getRegInfo().isReserved(PhysReg);
771 }
772 
773 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
774   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
775   // In entry or chain functions, the base address is 0, so it can't possibly
776   // need any more alignment.
777 
778   // FIXME: Should be able to specify the entry frame alignment per calling
779   // convention instead.
780   if (Info->isBottomOfStack())
781     return false;
782 
783   return TargetRegisterInfo::shouldRealignStack(MF);
784 }
785 
786 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
787   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
788   if (Info->isEntryFunction()) {
789     const MachineFrameInfo &MFI = Fn.getFrameInfo();
790     return MFI.hasStackObjects() || MFI.hasCalls();
791   }
792 
793   // May need scavenger for dealing with callee saved registers.
794   return true;
795 }
796 
797 bool SIRegisterInfo::requiresFrameIndexScavenging(
798   const MachineFunction &MF) const {
799   // Do not use frame virtual registers. They used to be used for SGPRs, but
800   // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
801   // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
802   // spill.
803   return false;
804 }
805 
806 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
807   const MachineFunction &MF) const {
808   const MachineFrameInfo &MFI = MF.getFrameInfo();
809   return MFI.hasStackObjects();
810 }
811 
812 bool SIRegisterInfo::requiresVirtualBaseRegisters(
813   const MachineFunction &) const {
814   // There are no special dedicated stack or frame pointers.
815   return true;
816 }
817 
818 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
819   assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
820 
821   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
822                                           AMDGPU::OpName::offset);
823   return MI->getOperand(OffIdx).getImm();
824 }
825 
826 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
827                                                  int Idx) const {
828   switch (MI->getOpcode()) {
829   case AMDGPU::V_ADD_U32_e32:
830   case AMDGPU::V_ADD_U32_e64:
831   case AMDGPU::V_ADD_CO_U32_e32: {
832     int OtherIdx = Idx == 1 ? 2 : 1;
833     const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
834     return OtherOp.isImm() ? OtherOp.getImm() : 0;
835   }
836   case AMDGPU::V_ADD_CO_U32_e64: {
837     int OtherIdx = Idx == 2 ? 3 : 2;
838     const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
839     return OtherOp.isImm() ? OtherOp.getImm() : 0;
840   }
841   default:
842     break;
843   }
844 
845   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
846     return 0;
847 
848   assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
849                                             AMDGPU::OpName::vaddr) ||
850          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
851                                             AMDGPU::OpName::saddr))) &&
852          "Should never see frame index on non-address operand");
853 
854   return getScratchInstrOffset(MI);
855 }
856 
857 static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
858                               const MachineInstr &MI) {
859   assert(MI.getDesc().isAdd());
860   const MachineOperand &Src0 = MI.getOperand(1);
861   const MachineOperand &Src1 = MI.getOperand(2);
862 
863   if (Src0.isFI()) {
864     return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
865                                                        Src1.getReg()));
866   }
867 
868   if (Src1.isFI()) {
869     return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
870                                                        Src0.getReg()));
871   }
872 
873   return false;
874 }
875 
876 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
877   // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
878   switch (MI->getOpcode()) {
879   case AMDGPU::V_ADD_U32_e32: {
880     // TODO: We could handle this but it requires work to avoid violating
881     // operand restrictions.
882     if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
883         !isFIPlusImmOrVGPR(*this, *MI))
884       return false;
885     [[fallthrough]];
886   }
887   case AMDGPU::V_ADD_U32_e64:
888     // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
889     //
890     // Much of the benefit with the MUBUF handling is we avoid duplicating the
891     // shift of the frame register, which isn't needed with scratch.
892     //
893     // materializeFrameBaseRegister doesn't know the register classes of the
894     // uses, and unconditionally uses an s_add_i32, which will end up using a
895     // copy for the vector uses.
896     return !ST.enableFlatScratch();
897   case AMDGPU::V_ADD_CO_U32_e32:
898     if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
899         !isFIPlusImmOrVGPR(*this, *MI))
900       return false;
901     // We can't deal with the case where the carry out has a use (though this
902     // should never happen)
903     return MI->getOperand(3).isDead();
904   case AMDGPU::V_ADD_CO_U32_e64:
905     // TODO: Should we check use_empty instead?
906     return MI->getOperand(1).isDead();
907   default:
908     break;
909   }
910 
911   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
912     return false;
913 
914   int64_t FullOffset = Offset + getScratchInstrOffset(MI);
915 
916   const SIInstrInfo *TII = ST.getInstrInfo();
917   if (SIInstrInfo::isMUBUF(*MI))
918     return !TII->isLegalMUBUFImmOffset(FullOffset);
919 
920   return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
921                                  SIInstrFlags::FlatScratch);
922 }
923 
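// Sketch of the sequence this emits for a non-zero offset in the flat-scratch
// case (the MUBUF case uses V_MOV_B32/V_ADD instead of the scalar opcodes):
//   s_mov_b32  OffsetReg, Offset
//   s_mov_b32  FIReg, <frame index>   ; the frame index is resolved later
//   s_add_i32  BaseReg, OffsetReg, FIReg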
924 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
925                                                       int FrameIdx,
926                                                       int64_t Offset) const {
927   MachineBasicBlock::iterator Ins = MBB->begin();
928   DebugLoc DL; // Defaults to "unknown"
929 
930   if (Ins != MBB->end())
931     DL = Ins->getDebugLoc();
932 
933   MachineFunction *MF = MBB->getParent();
934   const SIInstrInfo *TII = ST.getInstrInfo();
935   MachineRegisterInfo &MRI = MF->getRegInfo();
936   unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
937                                            : AMDGPU::V_MOV_B32_e32;
938 
939   Register BaseReg = MRI.createVirtualRegister(
940       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
941                              : &AMDGPU::VGPR_32RegClass);
942 
943   if (Offset == 0) {
944     BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
945       .addFrameIndex(FrameIdx);
946     return BaseReg;
947   }
948 
949   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
950 
951   Register FIReg = MRI.createVirtualRegister(
952       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
953                              : &AMDGPU::VGPR_32RegClass);
954 
955   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
956     .addImm(Offset);
957   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
958     .addFrameIndex(FrameIdx);
959 
960   if (ST.enableFlatScratch() ) {
961     // FIXME: Make sure scc isn't live in.
962     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
963         .addReg(OffsetReg, RegState::Kill)
964         .addReg(FIReg)
965         .setOperandDead(3); // scc
966     return BaseReg;
967   }
968 
969   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
970     .addReg(OffsetReg, RegState::Kill)
971     .addReg(FIReg)
972     .addImm(0); // clamp bit
973 
974   return BaseReg;
975 }
976 
977 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
978                                        int64_t Offset) const {
979   const SIInstrInfo *TII = ST.getInstrInfo();
980 
981   switch (MI.getOpcode()) {
982   case AMDGPU::V_ADD_U32_e32:
983   case AMDGPU::V_ADD_CO_U32_e32: {
984     MachineOperand *FIOp = &MI.getOperand(2);
985     MachineOperand *ImmOp = &MI.getOperand(1);
986     if (!FIOp->isFI())
987       std::swap(FIOp, ImmOp);
988 
989     if (!ImmOp->isImm()) {
990       assert(Offset == 0);
991       FIOp->ChangeToRegister(BaseReg, false);
992       TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
993       return;
994     }
995 
996     int64_t TotalOffset = ImmOp->getImm() + Offset;
997     if (TotalOffset == 0) {
998       MI.setDesc(TII->get(AMDGPU::COPY));
999       for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1000         MI.removeOperand(I);
1001 
1002       MI.getOperand(1).ChangeToRegister(BaseReg, false);
1003       return;
1004     }
1005 
1006     ImmOp->setImm(TotalOffset);
1007 
1008     MachineBasicBlock *MBB = MI.getParent();
1009     MachineFunction *MF = MBB->getParent();
1010     MachineRegisterInfo &MRI = MF->getRegInfo();
1011 
1012     // FIXME: materializeFrameBaseRegister does not know the register class of
1013     // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
1014     // a copy so we have a legal operand and hope the register coalescer can
1015     // clean it up.
1016     if (isSGPRReg(MRI, BaseReg)) {
1017       Register BaseRegVGPR =
1018           MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1019       BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1020           .addReg(BaseReg);
1021       MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1022     } else {
1023       MI.getOperand(2).ChangeToRegister(BaseReg, false);
1024     }
1025     return;
1026   }
1027   case AMDGPU::V_ADD_U32_e64:
1028   case AMDGPU::V_ADD_CO_U32_e64: {
1029     int Src0Idx = MI.getNumExplicitDefs();
1030     MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1031     MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1032     if (!FIOp->isFI())
1033       std::swap(FIOp, ImmOp);
1034 
1035     if (!ImmOp->isImm()) {
1036       FIOp->ChangeToRegister(BaseReg, false);
1037       TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1038       return;
1039     }
1040 
1041     int64_t TotalOffset = ImmOp->getImm() + Offset;
1042     if (TotalOffset == 0) {
1043       MI.setDesc(TII->get(AMDGPU::COPY));
1044 
1045       for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1046         MI.removeOperand(I);
1047 
1048       MI.getOperand(1).ChangeToRegister(BaseReg, false);
1049     } else {
1050       FIOp->ChangeToRegister(BaseReg, false);
1051       ImmOp->setImm(TotalOffset);
1052     }
1053 
1054     return;
1055   }
1056   default:
1057     break;
1058   }
1059 
1060   bool IsFlat = TII->isFLATScratch(MI);
1061 
1062 #ifndef NDEBUG
1063   // FIXME: Is it possible to be storing a frame index to itself?
1064   bool SeenFI = false;
1065   for (const MachineOperand &MO: MI.operands()) {
1066     if (MO.isFI()) {
1067       if (SeenFI)
1068         llvm_unreachable("should not see multiple frame indices");
1069 
1070       SeenFI = true;
1071     }
1072   }
1073 #endif
1074 
1075   MachineOperand *FIOp =
1076       TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1077                                       : AMDGPU::OpName::vaddr);
1078 
1079   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1080   int64_t NewOffset = OffsetOp->getImm() + Offset;
1081 
1082   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1083   assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1084 
1085   if (IsFlat) {
1086     assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1087                                   SIInstrFlags::FlatScratch) &&
1088            "offset should be legal");
1089     FIOp->ChangeToRegister(BaseReg, false);
1090     OffsetOp->setImm(NewOffset);
1091     return;
1092   }
1093 
1094 #ifndef NDEBUG
1095   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1096   assert(SOffset->isImm() && SOffset->getImm() == 0);
1097 #endif
1098 
1099   assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1100 
1101   FIOp->ChangeToRegister(BaseReg, false);
1102   OffsetOp->setImm(NewOffset);
1103 }
1104 
1105 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1106                                         Register BaseReg,
1107                                         int64_t Offset) const {
1108 
1109   switch (MI->getOpcode()) {
1110   case AMDGPU::V_ADD_U32_e32:
1111   case AMDGPU::V_ADD_CO_U32_e32:
1112     return true;
1113   case AMDGPU::V_ADD_U32_e64:
1114   case AMDGPU::V_ADD_CO_U32_e64:
1115     return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1116   default:
1117     break;
1118   }
1119 
1120   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1121     return false;
1122 
1123   int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1124 
1125   const SIInstrInfo *TII = ST.getInstrInfo();
1126   if (SIInstrInfo::isMUBUF(*MI))
1127     return TII->isLegalMUBUFImmOffset(NewOffset);
1128 
1129   return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1130                                 SIInstrFlags::FlatScratch);
1131 }
1132 
1133 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
1134   const MachineFunction &MF, unsigned Kind) const {
1135   // This is inaccurate. It depends on the instruction and address space. The
1136   // only place where we should hit this is for dealing with frame indexes /
1137   // private accesses, so this is correct in that case.
1138   return &AMDGPU::VGPR_32RegClass;
1139 }
1140 
1141 const TargetRegisterClass *
1142 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
1143   if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
1144     return getEquivalentVGPRClass(RC);
1145   if (RC == &AMDGPU::SCC_CLASSRegClass)
1146     return getWaveMaskRegClass();
1147 
1148   return RC;
1149 }
1150 
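// The return value is the spill size in 32-bit sub-registers, e.g. 8 for the
// 256-bit SI_SPILL_S256_SAVE/RESTORE pseudos handled below.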
1151 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
1152 
1153   switch (Op) {
1154   case AMDGPU::SI_SPILL_S1024_SAVE:
1155   case AMDGPU::SI_SPILL_S1024_RESTORE:
1156   case AMDGPU::SI_SPILL_V1024_SAVE:
1157   case AMDGPU::SI_SPILL_V1024_RESTORE:
1158   case AMDGPU::SI_SPILL_A1024_SAVE:
1159   case AMDGPU::SI_SPILL_A1024_RESTORE:
1160   case AMDGPU::SI_SPILL_AV1024_SAVE:
1161   case AMDGPU::SI_SPILL_AV1024_RESTORE:
1162     return 32;
1163   case AMDGPU::SI_SPILL_S512_SAVE:
1164   case AMDGPU::SI_SPILL_S512_RESTORE:
1165   case AMDGPU::SI_SPILL_V512_SAVE:
1166   case AMDGPU::SI_SPILL_V512_RESTORE:
1167   case AMDGPU::SI_SPILL_A512_SAVE:
1168   case AMDGPU::SI_SPILL_A512_RESTORE:
1169   case AMDGPU::SI_SPILL_AV512_SAVE:
1170   case AMDGPU::SI_SPILL_AV512_RESTORE:
1171     return 16;
1172   case AMDGPU::SI_SPILL_S384_SAVE:
1173   case AMDGPU::SI_SPILL_S384_RESTORE:
1174   case AMDGPU::SI_SPILL_V384_SAVE:
1175   case AMDGPU::SI_SPILL_V384_RESTORE:
1176   case AMDGPU::SI_SPILL_A384_SAVE:
1177   case AMDGPU::SI_SPILL_A384_RESTORE:
1178   case AMDGPU::SI_SPILL_AV384_SAVE:
1179   case AMDGPU::SI_SPILL_AV384_RESTORE:
1180     return 12;
1181   case AMDGPU::SI_SPILL_S352_SAVE:
1182   case AMDGPU::SI_SPILL_S352_RESTORE:
1183   case AMDGPU::SI_SPILL_V352_SAVE:
1184   case AMDGPU::SI_SPILL_V352_RESTORE:
1185   case AMDGPU::SI_SPILL_A352_SAVE:
1186   case AMDGPU::SI_SPILL_A352_RESTORE:
1187   case AMDGPU::SI_SPILL_AV352_SAVE:
1188   case AMDGPU::SI_SPILL_AV352_RESTORE:
1189     return 11;
1190   case AMDGPU::SI_SPILL_S320_SAVE:
1191   case AMDGPU::SI_SPILL_S320_RESTORE:
1192   case AMDGPU::SI_SPILL_V320_SAVE:
1193   case AMDGPU::SI_SPILL_V320_RESTORE:
1194   case AMDGPU::SI_SPILL_A320_SAVE:
1195   case AMDGPU::SI_SPILL_A320_RESTORE:
1196   case AMDGPU::SI_SPILL_AV320_SAVE:
1197   case AMDGPU::SI_SPILL_AV320_RESTORE:
1198     return 10;
1199   case AMDGPU::SI_SPILL_S288_SAVE:
1200   case AMDGPU::SI_SPILL_S288_RESTORE:
1201   case AMDGPU::SI_SPILL_V288_SAVE:
1202   case AMDGPU::SI_SPILL_V288_RESTORE:
1203   case AMDGPU::SI_SPILL_A288_SAVE:
1204   case AMDGPU::SI_SPILL_A288_RESTORE:
1205   case AMDGPU::SI_SPILL_AV288_SAVE:
1206   case AMDGPU::SI_SPILL_AV288_RESTORE:
1207     return 9;
1208   case AMDGPU::SI_SPILL_S256_SAVE:
1209   case AMDGPU::SI_SPILL_S256_RESTORE:
1210   case AMDGPU::SI_SPILL_V256_SAVE:
1211   case AMDGPU::SI_SPILL_V256_RESTORE:
1212   case AMDGPU::SI_SPILL_A256_SAVE:
1213   case AMDGPU::SI_SPILL_A256_RESTORE:
1214   case AMDGPU::SI_SPILL_AV256_SAVE:
1215   case AMDGPU::SI_SPILL_AV256_RESTORE:
1216     return 8;
1217   case AMDGPU::SI_SPILL_S224_SAVE:
1218   case AMDGPU::SI_SPILL_S224_RESTORE:
1219   case AMDGPU::SI_SPILL_V224_SAVE:
1220   case AMDGPU::SI_SPILL_V224_RESTORE:
1221   case AMDGPU::SI_SPILL_A224_SAVE:
1222   case AMDGPU::SI_SPILL_A224_RESTORE:
1223   case AMDGPU::SI_SPILL_AV224_SAVE:
1224   case AMDGPU::SI_SPILL_AV224_RESTORE:
1225     return 7;
1226   case AMDGPU::SI_SPILL_S192_SAVE:
1227   case AMDGPU::SI_SPILL_S192_RESTORE:
1228   case AMDGPU::SI_SPILL_V192_SAVE:
1229   case AMDGPU::SI_SPILL_V192_RESTORE:
1230   case AMDGPU::SI_SPILL_A192_SAVE:
1231   case AMDGPU::SI_SPILL_A192_RESTORE:
1232   case AMDGPU::SI_SPILL_AV192_SAVE:
1233   case AMDGPU::SI_SPILL_AV192_RESTORE:
1234     return 6;
1235   case AMDGPU::SI_SPILL_S160_SAVE:
1236   case AMDGPU::SI_SPILL_S160_RESTORE:
1237   case AMDGPU::SI_SPILL_V160_SAVE:
1238   case AMDGPU::SI_SPILL_V160_RESTORE:
1239   case AMDGPU::SI_SPILL_A160_SAVE:
1240   case AMDGPU::SI_SPILL_A160_RESTORE:
1241   case AMDGPU::SI_SPILL_AV160_SAVE:
1242   case AMDGPU::SI_SPILL_AV160_RESTORE:
1243     return 5;
1244   case AMDGPU::SI_SPILL_S128_SAVE:
1245   case AMDGPU::SI_SPILL_S128_RESTORE:
1246   case AMDGPU::SI_SPILL_V128_SAVE:
1247   case AMDGPU::SI_SPILL_V128_RESTORE:
1248   case AMDGPU::SI_SPILL_A128_SAVE:
1249   case AMDGPU::SI_SPILL_A128_RESTORE:
1250   case AMDGPU::SI_SPILL_AV128_SAVE:
1251   case AMDGPU::SI_SPILL_AV128_RESTORE:
1252     return 4;
1253   case AMDGPU::SI_SPILL_S96_SAVE:
1254   case AMDGPU::SI_SPILL_S96_RESTORE:
1255   case AMDGPU::SI_SPILL_V96_SAVE:
1256   case AMDGPU::SI_SPILL_V96_RESTORE:
1257   case AMDGPU::SI_SPILL_A96_SAVE:
1258   case AMDGPU::SI_SPILL_A96_RESTORE:
1259   case AMDGPU::SI_SPILL_AV96_SAVE:
1260   case AMDGPU::SI_SPILL_AV96_RESTORE:
1261     return 3;
1262   case AMDGPU::SI_SPILL_S64_SAVE:
1263   case AMDGPU::SI_SPILL_S64_RESTORE:
1264   case AMDGPU::SI_SPILL_V64_SAVE:
1265   case AMDGPU::SI_SPILL_V64_RESTORE:
1266   case AMDGPU::SI_SPILL_A64_SAVE:
1267   case AMDGPU::SI_SPILL_A64_RESTORE:
1268   case AMDGPU::SI_SPILL_AV64_SAVE:
1269   case AMDGPU::SI_SPILL_AV64_RESTORE:
1270     return 2;
1271   case AMDGPU::SI_SPILL_S32_SAVE:
1272   case AMDGPU::SI_SPILL_S32_RESTORE:
1273   case AMDGPU::SI_SPILL_V32_SAVE:
1274   case AMDGPU::SI_SPILL_V32_RESTORE:
1275   case AMDGPU::SI_SPILL_A32_SAVE:
1276   case AMDGPU::SI_SPILL_A32_RESTORE:
1277   case AMDGPU::SI_SPILL_AV32_SAVE:
1278   case AMDGPU::SI_SPILL_AV32_RESTORE:
1279   case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1280   case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1281   case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1282   case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1283     return 1;
1284   default: llvm_unreachable("Invalid spill opcode");
1285   }
1286 }
1287 
1288 static int getOffsetMUBUFStore(unsigned Opc) {
1289   switch (Opc) {
1290   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1291     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1292   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1293     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1294   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1295     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1296   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1297     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1298   case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1299     return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1300   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1301     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1302   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1303     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1304   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1305     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1306   default:
1307     return -1;
1308   }
1309 }
1310 
1311 static int getOffsetMUBUFLoad(unsigned Opc) {
1312   switch (Opc) {
1313   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1314     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1315   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1316     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1317   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1318     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1319   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1320     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1321   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1322     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1323   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1324     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1325   case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1326     return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1327   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1328     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1329   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1330     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1331   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1332     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1333   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1334     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1335   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1336     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1337   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1338     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1339   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1340     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1341   default:
1342     return -1;
1343   }
1344 }
1345 
1346 static int getOffenMUBUFStore(unsigned Opc) {
1347   switch (Opc) {
1348   case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1349     return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1350   case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1351     return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1352   case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1353     return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1354   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1355     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1356   case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1357     return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1358   case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1359     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1360   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1361     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1362   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1363     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1364   default:
1365     return -1;
1366   }
1367 }
1368 
1369 static int getOffenMUBUFLoad(unsigned Opc) {
1370   switch (Opc) {
1371   case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1372     return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1373   case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1374     return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1375   case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1376     return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1377   case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1378     return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1379   case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1380     return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1381   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1382     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1383   case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1384     return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1385   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1386     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1387   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1388     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1389   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1390     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1391   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1392     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1393   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1394     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1395   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1396     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1397   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1398     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1399   default:
1400     return -1;
1401   }
1402 }
1403 
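// Try to spill or reload ValueReg using the AGPR (or VGPR) assigned to the
// given frame Index and Lane. Emits a single accvgpr read/write, or a plain
// copy when the source and destination register classes already match, and
// returns a null MachineInstrBuilder when no register was assigned.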
1404 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1405                                            MachineBasicBlock &MBB,
1406                                            MachineBasicBlock::iterator MI,
1407                                            int Index, unsigned Lane,
1408                                            unsigned ValueReg, bool IsKill) {
1409   MachineFunction *MF = MBB.getParent();
1410   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1411   const SIInstrInfo *TII = ST.getInstrInfo();
1412 
1413   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1414 
1415   if (Reg == AMDGPU::NoRegister)
1416     return MachineInstrBuilder();
1417 
1418   bool IsStore = MI->mayStore();
1419   MachineRegisterInfo &MRI = MF->getRegInfo();
1420   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1421 
1422   unsigned Dst = IsStore ? Reg : ValueReg;
1423   unsigned Src = IsStore ? ValueReg : Reg;
1424   bool IsVGPR = TRI->isVGPR(MRI, Reg);
1425   DebugLoc DL = MI->getDebugLoc();
1426   if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1427     // The spiller during regalloc may restore a spilled register to its
1428     // superclass. This can result in AGPR spills being restored to VGPRs or
1429     // the other way around, leaving the src and dst with identical regclasses
1430     // at this point; a plain copy is all that is needed in such cases.
1431     auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1432                        .addReg(Src, getKillRegState(IsKill));
1433     CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1434     return CopyMIB;
1435   }
1436   unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1437                                     : AMDGPU::V_ACCVGPR_READ_B32_e64;
1438 
1439   auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1440                  .addReg(Src, getKillRegState(IsKill));
1441   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1442   return MIB;
1443 }
1444 
1445 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1446 // need to handle the case where an SGPR may need to be spilled while spilling.
1447 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1448                                       MachineFrameInfo &MFI,
1449                                       MachineBasicBlock::iterator MI,
1450                                       int Index,
1451                                       int64_t Offset) {
1452   const SIInstrInfo *TII = ST.getInstrInfo();
1453   MachineBasicBlock *MBB = MI->getParent();
1454   const DebugLoc &DL = MI->getDebugLoc();
1455   bool IsStore = MI->mayStore();
1456 
1457   unsigned Opc = MI->getOpcode();
1458   int LoadStoreOp = IsStore ?
1459     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1460   if (LoadStoreOp == -1)
1461     return false;
1462 
1463   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1464   if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1465     return true;
1466 
1467   MachineInstrBuilder NewMI =
1468       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1469           .add(*Reg)
1470           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1471           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1472           .addImm(Offset)
1473           .addImm(0) // cpol
1474           .addImm(0) // swz
1475           .cloneMemRefs(*MI);
1476 
1477   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1478                                                        AMDGPU::OpName::vdata_in);
1479   if (VDataIn)
1480     NewMI.add(*VDataIn);
1481   return true;
1482 }
1483 
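// Map LoadStoreOp to the equivalent flat-scratch spill opcode for EltSize,
// keeping the load/store direction of the original opcode: the SADDR form is
// selected by default and converted to the SV form when a vaddr operand is
// present, or to the ST form when neither vaddr nor saddr is used.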
1484 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1485                                           unsigned LoadStoreOp,
1486                                           unsigned EltSize) {
1487   bool IsStore = TII->get(LoadStoreOp).mayStore();
1488   bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1489   bool UseST =
1490       !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1491 
1492   switch (EltSize) {
1493   case 4:
1494     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1495                           : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1496     break;
1497   case 8:
1498     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1499                           : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1500     break;
1501   case 12:
1502     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1503                           : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1504     break;
1505   case 16:
1506     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1507                           : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1508     break;
1509   default:
1510     llvm_unreachable("Unexpected spill load/store size!");
1511   }
1512 
1513   if (HasVAddr)
1514     LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1515   else if (UseST)
1516     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1517 
1518   return LoadStoreOp;
1519 }
1520 
1521 void SIRegisterInfo::buildSpillLoadStore(
1522     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1523     unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1524     MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1525     RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1526   assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1527 
1528   MachineFunction *MF = MBB.getParent();
1529   const SIInstrInfo *TII = ST.getInstrInfo();
1530   const MachineFrameInfo &MFI = MF->getFrameInfo();
1531   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1532 
1533   const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1534   bool IsStore = Desc->mayStore();
1535   bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1536 
1537   bool CanClobberSCC = false;
1538   bool Scavenged = false;
1539   MCRegister SOffset = ScratchOffsetReg;
1540 
1541   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1542   // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1543   const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1544   const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1545 
1546   // Always use 4 byte operations for AGPRs because we need to scavenge
1547   // a temporary VGPR.
1548   unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
1549   unsigned NumSubRegs = RegWidth / EltSize;
1550   unsigned Size = NumSubRegs * EltSize;
1551   unsigned RemSize = RegWidth - Size;
1552   unsigned NumRemSubRegs = RemSize ? 1 : 0;
1553   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1554   int64_t MaterializedOffset = Offset;
1555 
1556   int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1557   int64_t ScratchOffsetRegDelta = 0;
1558 
1559   if (IsFlat && EltSize > 4) {
1560     LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1561     Desc = &TII->get(LoadStoreOp);
1562   }
1563 
1564   Align Alignment = MFI.getObjectAlign(Index);
1565   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1566 
1567   assert((IsFlat || ((Offset % EltSize) == 0)) &&
1568          "unexpected VGPR spill offset");
1569 
1570   // Track a VGPR to use for a constant offset we need to materialize.
1571   Register TmpOffsetVGPR;
1572 
1573   // Track a VGPR to use as an intermediate value.
1574   Register TmpIntermediateVGPR;
1575   bool UseVGPROffset = false;
1576 
1577   // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1578   // combination.
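  // Depending on the constant bus limit this emits either a single
  // v_add_u32_e64 of SGPRBase and the immediate, or a v_mov_b32 of SGPRBase
  // followed by a v_add_u32_e32 of the immediate; with no SGPR base it is just
  // a v_mov_b32 of the immediate.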
1579   auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1580                                 int64_t VOffset) {
1581     // We are using a VGPR offset
1582     if (IsFlat && SGPRBase) {
1583       // We only have one VGPR offset or one SGPR offset. We don't have a free
1584       // SGPR, so perform the add as a vector operation.
1585       // We don't need a base SGPR in the kernel.
1586 
1587       if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1588         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1589           .addReg(SGPRBase)
1590           .addImm(VOffset)
1591           .addImm(0); // clamp
1592       } else {
1593         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1594           .addReg(SGPRBase);
1595         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1596           .addImm(VOffset)
1597           .addReg(TmpOffsetVGPR);
1598       }
1599     } else {
1600       assert(TmpOffsetVGPR);
1601       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1602         .addImm(VOffset);
1603     }
1604   };
1605 
1606   bool IsOffsetLegal =
1607       IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1608                                       SIInstrFlags::FlatScratch)
1609              : TII->isLegalMUBUFImmOffset(MaxOffset);
1610   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1611     SOffset = MCRegister();
1612 
1613     // We don't have access to the register scavenger if this function is called
1614     // during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this case.
1615     // TODO: Clobbering SCC is not necessary for scratch instructions in the
1616     // entry.
1617     if (RS) {
1618       SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1619 
1620       // Piggy-back on the liveness scan we just did to see if SCC is dead.
1621       CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1622     } else if (LiveUnits) {
1623       CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1624       for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1625         if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1626           SOffset = Reg;
1627           break;
1628         }
1629       }
1630     }
1631 
1632     if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1633       SOffset = Register();
1634 
1635     if (!SOffset) {
1636       UseVGPROffset = true;
1637 
1638       if (RS) {
1639         TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1640       } else {
1641         assert(LiveUnits);
1642         for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1643           if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1644             TmpOffsetVGPR = Reg;
1645             break;
1646           }
1647         }
1648       }
1649 
1650       assert(TmpOffsetVGPR);
1651     } else if (!SOffset && CanClobberSCC) {
1652       // There are no free SGPRs, and we are in the process of spilling VGPRs
1653       // too. Since we need a VGPR in order to spill SGPRs (this is true on
1654       // SI/CI, and on VI it remains true until we implement spilling using
1655       // scalar stores), we have no way to free up an SGPR. Our solution here
1656       // is to add the offset directly to the ScratchOffset or StackPtrOffset
1657       // register, and then subtract the offset after the spill to return the
1658       // register to its original value.
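      // Roughly, for a store this results in a sequence along the lines of
      //   s_add_i32 <soffset>, <soffset>, <offset>  ; fold offset into the base
      //   buffer_store_dword <vdata>, off, <srsrc>, <soffset>
      //   s_add_i32 <soffset>, <soffset>, -<offset> ; restore the original value
      // where the operand names are purely illustrative.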
1659 
1660       // TODO: If we don't have to do an emergency stack slot spill, converting
1661       // to use the VGPR offset takes fewer instructions.
1662       if (!ScratchOffsetReg)
1663         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1664       SOffset = ScratchOffsetReg;
1665       ScratchOffsetRegDelta = Offset;
1666     } else {
1667       Scavenged = true;
1668     }
1669 
1670     // We currently only support spilling VGPRs to EltSize boundaries, meaning
1671     // we can simplify the adjustment of Offset here to just scale with
1672     // WavefrontSize.
1673     if (!IsFlat && !UseVGPROffset)
1674       Offset *= ST.getWavefrontSize();
1675 
1676     if (!UseVGPROffset && !SOffset)
1677       report_fatal_error("could not scavenge SGPR to spill in entry function");
1678 
1679     if (UseVGPROffset) {
1680       // We are using a VGPR offset
1681       MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1682     } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1683       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1684     } else {
1685       assert(Offset != 0);
1686       auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1687           .addReg(ScratchOffsetReg)
1688           .addImm(Offset);
1689       Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1690     }
1691 
1692     Offset = 0;
1693   }
1694 
1695   if (IsFlat && SOffset == AMDGPU::NoRegister) {
1696     assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1697            && "Unexpected vaddr for flat scratch with a FI operand");
1698 
1699     if (UseVGPROffset) {
1700       LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1701     } else {
1702       assert(ST.hasFlatScratchSTMode());
1703       LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1704     }
1705 
1706     Desc = &TII->get(LoadStoreOp);
1707   }
1708 
1709   for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1710        ++i, RegOffset += EltSize) {
1711     if (i == NumSubRegs) {
1712       EltSize = RemSize;
1713       LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1714     }
1715     Desc = &TII->get(LoadStoreOp);
1716 
1717     if (!IsFlat && UseVGPROffset) {
1718       int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1719                                    : getOffenMUBUFLoad(LoadStoreOp);
1720       Desc = &TII->get(NewLoadStoreOp);
1721     }
1722 
1723     if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1724       // If we are spilling an AGPR beyond the range of the memory instruction
1725       // offset and need to use a VGPR offset, we ideally have at least 2
1726       // scratch VGPRs. If we don't have a second free VGPR without spilling,
1727       // recycle the VGPR used for the offset, which requires resetting it
1728       // after each subregister.
1729 
1730       MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1731     }
1732 
1733     unsigned NumRegs = EltSize / 4;
1734     Register SubReg = e == 1
1735             ? ValueReg
1736             : Register(getSubReg(ValueReg,
1737                                  getSubRegFromChannel(RegOffset / 4, NumRegs)));
1738 
1739     unsigned SOffsetRegState = 0;
1740     unsigned SrcDstRegState = getDefRegState(!IsStore);
1741     const bool IsLastSubReg = i + 1 == e;
1742     const bool IsFirstSubReg = i == 0;
1743     if (IsLastSubReg) {
1744       SOffsetRegState |= getKillRegState(Scavenged);
1745       // The last implicit use carries the "Kill" flag.
1746       SrcDstRegState |= getKillRegState(IsKill);
1747     }
1748 
1749     // If there are undef components, make sure the whole register is defined
1750     // by adding an implicit def of the super-reg on the first instruction.
1751     bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1752     bool NeedSuperRegImpOperand = e > 1;
1753 
1754     // Remaining element size to spill into memory after some parts of it
1755     // have been spilled into either AGPRs or VGPRs.
1756     unsigned RemEltSize = EltSize;
1757 
1758     // AGPRs used to spill VGPRs, and vice versa, are allocated in reverse
1759     // order, starting from the last lane. If a register cannot be completely
1760     // spilled into another register, this ensures its alignment does not
1761     // change. For targets with a VGPR alignment requirement this is important
1762     // when flat scratch is used, as we might otherwise get a scratch_load or
1763     // scratch_store of an unaligned register.
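    // For example, if only the two highest lanes of a 4-dword tuple have
    // AGPRs assigned, dwords 3 and 2 are moved to AGPRs while dwords 0-1 still
    // go to memory, so the memory access keeps its original sub-register
    // alignment.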
1764     for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1765              LaneE = RegOffset / 4;
1766          Lane >= LaneE; --Lane) {
1767       bool IsSubReg = e > 1 || EltSize > 4;
1768       Register Sub = IsSubReg
1769              ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1770              : ValueReg;
1771       auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1772       if (!MIB.getInstr())
1773         break;
1774       if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1775         MIB.addReg(ValueReg, RegState::ImplicitDefine);
1776         NeedSuperRegDef = false;
1777       }
1778       if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1779         NeedSuperRegImpOperand = true;
1780         unsigned State = SrcDstRegState;
1781         if (!IsLastSubReg || (Lane != LaneE))
1782           State &= ~RegState::Kill;
1783         if (!IsFirstSubReg || (Lane != LaneS))
1784           State &= ~RegState::Define;
1785         MIB.addReg(ValueReg, RegState::Implicit | State);
1786       }
1787       RemEltSize -= 4;
1788     }
1789 
1790     if (!RemEltSize) // Fully spilled into AGPRs.
1791       continue;
1792 
1793     if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1794       assert(IsFlat && EltSize > 4);
1795 
1796       unsigned NumRegs = RemEltSize / 4;
1797       SubReg = Register(getSubReg(ValueReg,
1798                         getSubRegFromChannel(RegOffset / 4, NumRegs)));
1799       unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1800       Desc = &TII->get(Opc);
1801     }
1802 
1803     unsigned FinalReg = SubReg;
1804 
1805     if (IsAGPR) {
1806       assert(EltSize == 4);
1807 
1808       if (!TmpIntermediateVGPR) {
1809         TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1810         assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1811       }
1812       if (IsStore) {
1813         auto AccRead = BuildMI(MBB, MI, DL,
1814                                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1815                                TmpIntermediateVGPR)
1816                            .addReg(SubReg, getKillRegState(IsKill));
1817         if (NeedSuperRegDef)
1818           AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1819         if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1820           AccRead.addReg(ValueReg, RegState::Implicit);
1821         AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1822       }
1823       SubReg = TmpIntermediateVGPR;
1824     } else if (UseVGPROffset) {
1825       if (!TmpOffsetVGPR) {
1826         TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1827                                                       MI, false, 0);
1828         RS->setRegUsed(TmpOffsetVGPR);
1829       }
1830     }
1831 
1832     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1833     MachineMemOperand *NewMMO =
1834         MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1835                                  commonAlignment(Alignment, RegOffset));
1836 
1837     auto MIB =
1838         BuildMI(MBB, MI, DL, *Desc)
1839             .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1840 
1841     if (UseVGPROffset) {
1842       // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1843       // intermediate accvgpr_write.
1844       MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1845     }
1846 
1847     if (!IsFlat)
1848       MIB.addReg(FuncInfo->getScratchRSrcReg());
1849 
1850     if (SOffset == AMDGPU::NoRegister) {
1851       if (!IsFlat) {
1852         if (UseVGPROffset && ScratchOffsetReg) {
1853           MIB.addReg(ScratchOffsetReg);
1854         } else {
1855           assert(FuncInfo->isBottomOfStack());
1856           MIB.addImm(0);
1857         }
1858       }
1859     } else {
1860       MIB.addReg(SOffset, SOffsetRegState);
1861     }
1862 
1863     MIB.addImm(Offset + RegOffset);
1864 
1865     bool LastUse = MMO->getFlags() & MOLastUse;
1866     MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1867 
1868     if (!IsFlat)
1869       MIB.addImm(0); // swz
1870     MIB.addMemOperand(NewMMO);
1871 
1872     if (!IsAGPR && NeedSuperRegDef)
1873       MIB.addReg(ValueReg, RegState::ImplicitDefine);
1874 
1875     if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1876       MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1877                     FinalReg)
1878                 .addReg(TmpIntermediateVGPR, RegState::Kill);
1879       MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1880     }
1881 
1882     if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1883       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1884 
1885     // The epilog restore of a wwm-scratch register can cause an undesired
1886     // optimization by machine-cp after PrologEpilogInserter if the same
1887     // register was assigned for return value ABI lowering with a COPY
1888     // instruction. As shown below, with the epilog reload the earlier COPY
1889     // appears to be dead during machine-cp.
1890     // ...
1891     // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1892     // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1893     // ...
1894     // Epilog block:
1895     // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1896     // ...
1897     // WWM spill restore to preserve the inactive lanes of v0.
1898     // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1899     // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1900     // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1901     // ...
1902     // SI_RETURN implicit $vgpr0
1903     // ...
1904     // To fix it, mark the same reg as a tied operand on such restore
1905     // instructions so that it records a use for the preceding COPY.
1906     if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1907         MI->readsRegister(SubReg, this)) {
1908       MIB.addReg(SubReg, RegState::Implicit);
1909       MIB->tieOperands(0, MIB->getNumOperands() - 1);
1910     }
1911   }
1912 
1913   if (ScratchOffsetRegDelta != 0) {
1914     // Subtract the offset we added to the ScratchOffset register.
1915     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1916         .addReg(SOffset)
1917         .addImm(-ScratchOffsetRegDelta);
1918   }
1919 }
1920 
1921 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1922                                              int Offset, bool IsLoad,
1923                                              bool IsKill) const {
1924   // Load/store VGPR
1925   MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1926   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1927 
1928   Register FrameReg =
1929       FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1930           ? getBaseRegister()
1931           : getFrameRegister(SB.MF);
1932 
1933   Align Alignment = FrameInfo.getObjectAlign(Index);
1934   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1935   MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1936       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1937       SB.EltSize, Alignment);
1938 
1939   if (IsLoad) {
1940     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1941                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1942     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1943                         FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1944   } else {
1945     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1946                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1947     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1948                         FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1949     // This only ever adds one VGPR spill
1950     SB.MFI.addToSpilledVGPRs(1);
1951   }
1952 }
1953 
1954 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1955                                RegScavenger *RS, SlotIndexes *Indexes,
1956                                LiveIntervals *LIS, bool OnlyToVGPR,
1957                                bool SpillToPhysVGPRLane) const {
1958   assert(!MI->getOperand(0).isUndef() &&
1959          "undef spill should have been deleted earlier");
1960 
1961   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1962 
1963   ArrayRef<SpilledReg> VGPRSpills =
1964       SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1965                           : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1966   bool SpillToVGPR = !VGPRSpills.empty();
1967   if (OnlyToVGPR && !SpillToVGPR)
1968     return false;
1969 
1970   assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1971                          SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1972 
1973   if (SpillToVGPR) {
1974 
1975     // Since the stack slot coloring pass tries to optimize SGPR spills, VGPR
1976     // lanes (mapped from the spill stack slot) may be shared by SGPR spills of
1977     // different sizes. The number of VGPR lanes allotted therefore matches the
1978     // largest SGPR being spilled into them.
1979     assert(SB.NumSubRegs <= VGPRSpills.size() &&
1980            "Num of SGPRs spilled should be less than or equal to num of "
1981            "the VGPR lanes.");
1982 
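    // Each 32-bit piece is written with a SI_SPILL_S32_TO_VGPR pseudo, which is
    // later expanded to a v_writelane_b32 into the assigned lane of Spill.VGPR.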
1983     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1984       Register SubReg =
1985           SB.NumSubRegs == 1
1986               ? SB.SuperReg
1987               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1988       SpilledReg Spill = VGPRSpills[i];
1989 
1990       bool IsFirstSubreg = i == 0;
1991       bool IsLastSubreg = i == SB.NumSubRegs - 1;
1992       bool UseKill = SB.IsKill && IsLastSubreg;
1993 
1994 
1995       // Mark the "old value of vgpr" input undef only if this is the first sgpr
1996       // spill to this specific vgpr in the first basic block.
1997       auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1998                          SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
1999                      .addReg(SubReg, getKillRegState(UseKill))
2000                      .addImm(Spill.Lane)
2001                      .addReg(Spill.VGPR);
2002       if (Indexes) {
2003         if (IsFirstSubreg)
2004           Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2005         else
2006           Indexes->insertMachineInstrInMaps(*MIB);
2007       }
2008 
2009       if (IsFirstSubreg && SB.NumSubRegs > 1) {
2010         // We may be spilling a super-register which is only partially defined,
2011         // and need to ensure later spills think the value is defined.
2012         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2013       }
2014 
2015       if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2016         MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2017 
2018       // FIXME: Since this spills to another register instead of an actual
2019       // frame index, we should delete the frame index when all references to
2020       // it are fixed.
2021     }
2022   } else {
2023     SB.prepare();
2024 
2025     // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2026     unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2027 
2028     // Per VGPR helper data
2029     auto PVD = SB.getPerVGPRData();
2030 
2031     for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2032       unsigned TmpVGPRFlags = RegState::Undef;
2033 
2034       // Write sub registers into the VGPR
2035       for (unsigned i = Offset * PVD.PerVGPR,
2036                     e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2037            i < e; ++i) {
2038         Register SubReg =
2039             SB.NumSubRegs == 1
2040                 ? SB.SuperReg
2041                 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2042 
2043         MachineInstrBuilder WriteLane =
2044             BuildMI(*SB.MBB, MI, SB.DL,
2045                     SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2046                 .addReg(SubReg, SubKillState)
2047                 .addImm(i % PVD.PerVGPR)
2048                 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2049         TmpVGPRFlags = 0;
2050 
2051         if (Indexes) {
2052           if (i == 0)
2053             Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2054           else
2055             Indexes->insertMachineInstrInMaps(*WriteLane);
2056         }
2057 
2058         // There could be undef components of a spilled super register.
2059         // TODO: Can we detect this and skip the spill?
2060         if (SB.NumSubRegs > 1) {
2061           // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2062           unsigned SuperKillState = 0;
2063           if (i + 1 == SB.NumSubRegs)
2064             SuperKillState |= getKillRegState(SB.IsKill);
2065           WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2066         }
2067       }
2068 
2069       // Write out VGPR
2070       SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2071     }
2072 
2073     SB.restore();
2074   }
2075 
2076   MI->eraseFromParent();
2077   SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2078 
2079   if (LIS)
2080     LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2081 
2082   return true;
2083 }
2084 
2085 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2086                                  RegScavenger *RS, SlotIndexes *Indexes,
2087                                  LiveIntervals *LIS, bool OnlyToVGPR,
2088                                  bool SpillToPhysVGPRLane) const {
2089   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2090 
2091   ArrayRef<SpilledReg> VGPRSpills =
2092       SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2093                           : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2094   bool SpillToVGPR = !VGPRSpills.empty();
2095   if (OnlyToVGPR && !SpillToVGPR)
2096     return false;
2097 
2098   if (SpillToVGPR) {
2099     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2100       Register SubReg =
2101           SB.NumSubRegs == 1
2102               ? SB.SuperReg
2103               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2104 
2105       SpilledReg Spill = VGPRSpills[i];
2106       auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2107                          SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2108                      .addReg(Spill.VGPR)
2109                      .addImm(Spill.Lane);
2110       if (SB.NumSubRegs > 1 && i == 0)
2111         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2112       if (Indexes) {
2113         if (i == e - 1)
2114           Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2115         else
2116           Indexes->insertMachineInstrInMaps(*MIB);
2117       }
2118     }
2119   } else {
2120     SB.prepare();
2121 
2122     // Per VGPR helper data
2123     auto PVD = SB.getPerVGPRData();
2124 
2125     for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2126       // Load in VGPR data
2127       SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2128 
2129       // Unpack lanes
2130       for (unsigned i = Offset * PVD.PerVGPR,
2131                     e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2132            i < e; ++i) {
2133         Register SubReg =
2134             SB.NumSubRegs == 1
2135                 ? SB.SuperReg
2136                 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2137 
2138         bool LastSubReg = (i + 1 == e);
2139         auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2140                            SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2141                        .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2142                        .addImm(i);
2143         if (SB.NumSubRegs > 1 && i == 0)
2144           MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2145         if (Indexes) {
2146           if (i == e - 1)
2147             Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2148           else
2149             Indexes->insertMachineInstrInMaps(*MIB);
2150         }
2151       }
2152     }
2153 
2154     SB.restore();
2155   }
2156 
2157   MI->eraseFromParent();
2158 
2159   if (LIS)
2160     LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2161 
2162   return true;
2163 }
2164 
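// Spill SGPR into lanes of SB.TmpVGPR at MI and read the lanes back at the end
// of RestoreMBB; the spilled value itself stays in VGPR lanes and is never
// written out to scratch memory.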
2165 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2166                                         MachineBasicBlock &RestoreMBB,
2167                                         Register SGPR, RegScavenger *RS) const {
2168   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2169                       RS);
2170   SB.prepare();
2171   // Generate the spill of SGPR to SB.TmpVGPR.
2172   unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2173   auto PVD = SB.getPerVGPRData();
2174   for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2175     unsigned TmpVGPRFlags = RegState::Undef;
2176     // Write sub registers into the VGPR
2177     for (unsigned i = Offset * PVD.PerVGPR,
2178                   e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2179          i < e; ++i) {
2180       Register SubReg =
2181           SB.NumSubRegs == 1
2182               ? SB.SuperReg
2183               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2184 
2185       MachineInstrBuilder WriteLane =
2186           BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2187                   SB.TmpVGPR)
2188               .addReg(SubReg, SubKillState)
2189               .addImm(i % PVD.PerVGPR)
2190               .addReg(SB.TmpVGPR, TmpVGPRFlags);
2191       TmpVGPRFlags = 0;
2192       // There could be undef components of a spilled super register.
2193       // TODO: Can we detect this and skip the spill?
2194       if (SB.NumSubRegs > 1) {
2195         // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2196         unsigned SuperKillState = 0;
2197         if (i + 1 == SB.NumSubRegs)
2198           SuperKillState |= getKillRegState(SB.IsKill);
2199         WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2200       }
2201     }
2202     // Don't need to write VGPR out.
2203   }
2204 
2205   // Restore clobbered registers in the specified restore block.
2206   MI = RestoreMBB.end();
2207   SB.setMI(&RestoreMBB, MI);
2208   // Generate the restore of SGPR from SB.TmpVGPR.
2209   for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2210     // Don't need to load VGPR in.
2211     // Unpack lanes
2212     for (unsigned i = Offset * PVD.PerVGPR,
2213                   e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2214          i < e; ++i) {
2215       Register SubReg =
2216           SB.NumSubRegs == 1
2217               ? SB.SuperReg
2218               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2219       bool LastSubReg = (i + 1 == e);
2220       auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2221                          SubReg)
2222                      .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2223                      .addImm(i);
2224       if (SB.NumSubRegs > 1 && i == 0)
2225         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2226     }
2227   }
2228   SB.restore();
2229 
2230   SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2231   return false;
2232 }
2233 
2234 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2235 /// a VGPR and the stack slot can be safely eliminated when all other users are
2236 /// handled.
2237 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2238     MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2239     SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2240   switch (MI->getOpcode()) {
2241   case AMDGPU::SI_SPILL_S1024_SAVE:
2242   case AMDGPU::SI_SPILL_S512_SAVE:
2243   case AMDGPU::SI_SPILL_S384_SAVE:
2244   case AMDGPU::SI_SPILL_S352_SAVE:
2245   case AMDGPU::SI_SPILL_S320_SAVE:
2246   case AMDGPU::SI_SPILL_S288_SAVE:
2247   case AMDGPU::SI_SPILL_S256_SAVE:
2248   case AMDGPU::SI_SPILL_S224_SAVE:
2249   case AMDGPU::SI_SPILL_S192_SAVE:
2250   case AMDGPU::SI_SPILL_S160_SAVE:
2251   case AMDGPU::SI_SPILL_S128_SAVE:
2252   case AMDGPU::SI_SPILL_S96_SAVE:
2253   case AMDGPU::SI_SPILL_S64_SAVE:
2254   case AMDGPU::SI_SPILL_S32_SAVE:
2255     return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2256   case AMDGPU::SI_SPILL_S1024_RESTORE:
2257   case AMDGPU::SI_SPILL_S512_RESTORE:
2258   case AMDGPU::SI_SPILL_S384_RESTORE:
2259   case AMDGPU::SI_SPILL_S352_RESTORE:
2260   case AMDGPU::SI_SPILL_S320_RESTORE:
2261   case AMDGPU::SI_SPILL_S288_RESTORE:
2262   case AMDGPU::SI_SPILL_S256_RESTORE:
2263   case AMDGPU::SI_SPILL_S224_RESTORE:
2264   case AMDGPU::SI_SPILL_S192_RESTORE:
2265   case AMDGPU::SI_SPILL_S160_RESTORE:
2266   case AMDGPU::SI_SPILL_S128_RESTORE:
2267   case AMDGPU::SI_SPILL_S96_RESTORE:
2268   case AMDGPU::SI_SPILL_S64_RESTORE:
2269   case AMDGPU::SI_SPILL_S32_RESTORE:
2270     return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2271   default:
2272     llvm_unreachable("not an SGPR spill instruction");
2273   }
2274 }
2275 
2276 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2277                                         int SPAdj, unsigned FIOperandNum,
2278                                         RegScavenger *RS) const {
2279   MachineFunction *MF = MI->getParent()->getParent();
2280   MachineBasicBlock *MBB = MI->getParent();
2281   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2282   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2283   const SIInstrInfo *TII = ST.getInstrInfo();
2284   const DebugLoc &DL = MI->getDebugLoc();
2285 
2286   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2287 
2288   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2289          "unreserved scratch RSRC register");
2290 
2291   MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2292   int Index = MI->getOperand(FIOperandNum).getIndex();
2293 
2294   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2295                           ? getBaseRegister()
2296                           : getFrameRegister(*MF);
2297 
2298   switch (MI->getOpcode()) {
2299     // SGPR register spill
2300     case AMDGPU::SI_SPILL_S1024_SAVE:
2301     case AMDGPU::SI_SPILL_S512_SAVE:
2302     case AMDGPU::SI_SPILL_S384_SAVE:
2303     case AMDGPU::SI_SPILL_S352_SAVE:
2304     case AMDGPU::SI_SPILL_S320_SAVE:
2305     case AMDGPU::SI_SPILL_S288_SAVE:
2306     case AMDGPU::SI_SPILL_S256_SAVE:
2307     case AMDGPU::SI_SPILL_S224_SAVE:
2308     case AMDGPU::SI_SPILL_S192_SAVE:
2309     case AMDGPU::SI_SPILL_S160_SAVE:
2310     case AMDGPU::SI_SPILL_S128_SAVE:
2311     case AMDGPU::SI_SPILL_S96_SAVE:
2312     case AMDGPU::SI_SPILL_S64_SAVE:
2313     case AMDGPU::SI_SPILL_S32_SAVE: {
2314       return spillSGPR(MI, Index, RS);
2315     }
2316 
2317     // SGPR register restore
2318     case AMDGPU::SI_SPILL_S1024_RESTORE:
2319     case AMDGPU::SI_SPILL_S512_RESTORE:
2320     case AMDGPU::SI_SPILL_S384_RESTORE:
2321     case AMDGPU::SI_SPILL_S352_RESTORE:
2322     case AMDGPU::SI_SPILL_S320_RESTORE:
2323     case AMDGPU::SI_SPILL_S288_RESTORE:
2324     case AMDGPU::SI_SPILL_S256_RESTORE:
2325     case AMDGPU::SI_SPILL_S224_RESTORE:
2326     case AMDGPU::SI_SPILL_S192_RESTORE:
2327     case AMDGPU::SI_SPILL_S160_RESTORE:
2328     case AMDGPU::SI_SPILL_S128_RESTORE:
2329     case AMDGPU::SI_SPILL_S96_RESTORE:
2330     case AMDGPU::SI_SPILL_S64_RESTORE:
2331     case AMDGPU::SI_SPILL_S32_RESTORE: {
2332       return restoreSGPR(MI, Index, RS);
2333     }
2334 
2335     // VGPR register spill
2336     case AMDGPU::SI_SPILL_V1024_SAVE:
2337     case AMDGPU::SI_SPILL_V512_SAVE:
2338     case AMDGPU::SI_SPILL_V384_SAVE:
2339     case AMDGPU::SI_SPILL_V352_SAVE:
2340     case AMDGPU::SI_SPILL_V320_SAVE:
2341     case AMDGPU::SI_SPILL_V288_SAVE:
2342     case AMDGPU::SI_SPILL_V256_SAVE:
2343     case AMDGPU::SI_SPILL_V224_SAVE:
2344     case AMDGPU::SI_SPILL_V192_SAVE:
2345     case AMDGPU::SI_SPILL_V160_SAVE:
2346     case AMDGPU::SI_SPILL_V128_SAVE:
2347     case AMDGPU::SI_SPILL_V96_SAVE:
2348     case AMDGPU::SI_SPILL_V64_SAVE:
2349     case AMDGPU::SI_SPILL_V32_SAVE:
2350     case AMDGPU::SI_SPILL_A1024_SAVE:
2351     case AMDGPU::SI_SPILL_A512_SAVE:
2352     case AMDGPU::SI_SPILL_A384_SAVE:
2353     case AMDGPU::SI_SPILL_A352_SAVE:
2354     case AMDGPU::SI_SPILL_A320_SAVE:
2355     case AMDGPU::SI_SPILL_A288_SAVE:
2356     case AMDGPU::SI_SPILL_A256_SAVE:
2357     case AMDGPU::SI_SPILL_A224_SAVE:
2358     case AMDGPU::SI_SPILL_A192_SAVE:
2359     case AMDGPU::SI_SPILL_A160_SAVE:
2360     case AMDGPU::SI_SPILL_A128_SAVE:
2361     case AMDGPU::SI_SPILL_A96_SAVE:
2362     case AMDGPU::SI_SPILL_A64_SAVE:
2363     case AMDGPU::SI_SPILL_A32_SAVE:
2364     case AMDGPU::SI_SPILL_AV1024_SAVE:
2365     case AMDGPU::SI_SPILL_AV512_SAVE:
2366     case AMDGPU::SI_SPILL_AV384_SAVE:
2367     case AMDGPU::SI_SPILL_AV352_SAVE:
2368     case AMDGPU::SI_SPILL_AV320_SAVE:
2369     case AMDGPU::SI_SPILL_AV288_SAVE:
2370     case AMDGPU::SI_SPILL_AV256_SAVE:
2371     case AMDGPU::SI_SPILL_AV224_SAVE:
2372     case AMDGPU::SI_SPILL_AV192_SAVE:
2373     case AMDGPU::SI_SPILL_AV160_SAVE:
2374     case AMDGPU::SI_SPILL_AV128_SAVE:
2375     case AMDGPU::SI_SPILL_AV96_SAVE:
2376     case AMDGPU::SI_SPILL_AV64_SAVE:
2377     case AMDGPU::SI_SPILL_AV32_SAVE:
2378     case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2379     case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2380       const MachineOperand *VData = TII->getNamedOperand(*MI,
2381                                                          AMDGPU::OpName::vdata);
2382       if (VData->isUndef()) {
2383         MI->eraseFromParent();
2384         return true;
2385       }
2386 
2387       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2388              MFI->getStackPtrOffsetReg());
2389 
2390       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2391                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2392       auto *MBB = MI->getParent();
2393       bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
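      // A WWM register spill must preserve all lanes, so temporarily switch
      // exec to all ones around the scratch access and restore it afterwards.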
2394       if (IsWWMRegSpill) {
2395         TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2396                                   RS->isRegUsed(AMDGPU::SCC));
2397       }
2398       buildSpillLoadStore(
2399           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2400           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2401           *MI->memoperands_begin(), RS);
2402       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
2403       if (IsWWMRegSpill)
2404         TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2405 
2406       MI->eraseFromParent();
2407       return true;
2408     }
2409     case AMDGPU::SI_SPILL_V32_RESTORE:
2410     case AMDGPU::SI_SPILL_V64_RESTORE:
2411     case AMDGPU::SI_SPILL_V96_RESTORE:
2412     case AMDGPU::SI_SPILL_V128_RESTORE:
2413     case AMDGPU::SI_SPILL_V160_RESTORE:
2414     case AMDGPU::SI_SPILL_V192_RESTORE:
2415     case AMDGPU::SI_SPILL_V224_RESTORE:
2416     case AMDGPU::SI_SPILL_V256_RESTORE:
2417     case AMDGPU::SI_SPILL_V288_RESTORE:
2418     case AMDGPU::SI_SPILL_V320_RESTORE:
2419     case AMDGPU::SI_SPILL_V352_RESTORE:
2420     case AMDGPU::SI_SPILL_V384_RESTORE:
2421     case AMDGPU::SI_SPILL_V512_RESTORE:
2422     case AMDGPU::SI_SPILL_V1024_RESTORE:
2423     case AMDGPU::SI_SPILL_A32_RESTORE:
2424     case AMDGPU::SI_SPILL_A64_RESTORE:
2425     case AMDGPU::SI_SPILL_A96_RESTORE:
2426     case AMDGPU::SI_SPILL_A128_RESTORE:
2427     case AMDGPU::SI_SPILL_A160_RESTORE:
2428     case AMDGPU::SI_SPILL_A192_RESTORE:
2429     case AMDGPU::SI_SPILL_A224_RESTORE:
2430     case AMDGPU::SI_SPILL_A256_RESTORE:
2431     case AMDGPU::SI_SPILL_A288_RESTORE:
2432     case AMDGPU::SI_SPILL_A320_RESTORE:
2433     case AMDGPU::SI_SPILL_A352_RESTORE:
2434     case AMDGPU::SI_SPILL_A384_RESTORE:
2435     case AMDGPU::SI_SPILL_A512_RESTORE:
2436     case AMDGPU::SI_SPILL_A1024_RESTORE:
2437     case AMDGPU::SI_SPILL_AV32_RESTORE:
2438     case AMDGPU::SI_SPILL_AV64_RESTORE:
2439     case AMDGPU::SI_SPILL_AV96_RESTORE:
2440     case AMDGPU::SI_SPILL_AV128_RESTORE:
2441     case AMDGPU::SI_SPILL_AV160_RESTORE:
2442     case AMDGPU::SI_SPILL_AV192_RESTORE:
2443     case AMDGPU::SI_SPILL_AV224_RESTORE:
2444     case AMDGPU::SI_SPILL_AV256_RESTORE:
2445     case AMDGPU::SI_SPILL_AV288_RESTORE:
2446     case AMDGPU::SI_SPILL_AV320_RESTORE:
2447     case AMDGPU::SI_SPILL_AV352_RESTORE:
2448     case AMDGPU::SI_SPILL_AV384_RESTORE:
2449     case AMDGPU::SI_SPILL_AV512_RESTORE:
2450     case AMDGPU::SI_SPILL_AV1024_RESTORE:
2451     case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2452     case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2453       const MachineOperand *VData = TII->getNamedOperand(*MI,
2454                                                          AMDGPU::OpName::vdata);
2455       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2456              MFI->getStackPtrOffsetReg());
2457 
2458       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2459                                             : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2460       auto *MBB = MI->getParent();
2461       bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2462       if (IsWWMRegSpill) {
2463         TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2464                                   RS->isRegUsed(AMDGPU::SCC));
2465       }
2466 
2467       buildSpillLoadStore(
2468           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2469           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2470           *MI->memoperands_begin(), RS);
2471 
2472       if (IsWWMRegSpill)
2473         TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2474 
2475       MI->eraseFromParent();
2476       return true;
2477     }
2478     case AMDGPU::V_ADD_U32_e32:
2479     case AMDGPU::V_ADD_U32_e64:
2480     case AMDGPU::V_ADD_CO_U32_e32:
2481     case AMDGPU::V_ADD_CO_U32_e64: {
2482       // TODO: Handle sub, and, or.
2483       unsigned NumDefs = MI->getNumExplicitDefs();
2484       unsigned Src0Idx = NumDefs;
2485 
2486       bool HasClamp = false;
2487       MachineOperand *VCCOp = nullptr;
2488 
2489       switch (MI->getOpcode()) {
2490       case AMDGPU::V_ADD_U32_e32:
2491         break;
2492       case AMDGPU::V_ADD_U32_e64:
2493         HasClamp = MI->getOperand(3).getImm();
2494         break;
2495       case AMDGPU::V_ADD_CO_U32_e32:
2496         VCCOp = &MI->getOperand(3);
2497         break;
2498       case AMDGPU::V_ADD_CO_U32_e64:
2499         VCCOp = &MI->getOperand(1);
2500         HasClamp = MI->getOperand(4).getImm();
2501         break;
2502       default:
2503         break;
2504       }
2505       bool DeadVCC = !VCCOp || VCCOp->isDead();
2506       MachineOperand &DstOp = MI->getOperand(0);
2507       Register DstReg = DstOp.getReg();
2508 
2509       unsigned OtherOpIdx =
2510           FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2511       MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2512 
2513       unsigned Src1Idx = Src0Idx + 1;
2514       Register MaterializedReg = FrameReg;
2515       Register ScavengedVGPR;
2516 
2517       int64_t Offset = FrameInfo.getObjectOffset(Index);
2518       // For the non-immediate case, we could fall through to the default
2519       // handling, but we do an in-place update of the result register here to
2520       // avoid scavenging another register.
2521       if (OtherOp->isImm()) {
2522         int64_t TotalOffset = OtherOp->getImm() + Offset;
2523 
2524         if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2525             !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2526           // If we can't support a VOP3 literal in the VALU instruction, we
2527           // can't specially fold into the add.
2528           // TODO: Handle VOP3->VOP2 shrink to support the fold.
2529           break;
2530         }
2531 
2532         OtherOp->setImm(TotalOffset);
2533         Offset = 0;
2534       }
2535 
2536       if (FrameReg && !ST.enableFlatScratch()) {
2537         // We should just do an in-place update of the result register. However,
2538         // the value there may also be used by the add, in which case we need a
2539         // temporary register.
2540         //
2541         // FIXME: The scavenger is not finding the result register in the
2542         // common case where the add does not read the register.
2543 
2544         ScavengedVGPR = RS->scavengeRegisterBackwards(
2545             AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2546 
2547         // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2548         // shift.
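        // The MUBUF frame register holds a wave-scaled scratch offset, so shift
        // it right by log2(wavefront size) to get the byte offset used for
        // frame-index arithmetic.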
2549         BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2550             .addDef(ScavengedVGPR, RegState::Renamable)
2551             .addImm(ST.getWavefrontSizeLog2())
2552             .addReg(FrameReg);
2553         MaterializedReg = ScavengedVGPR;
2554       }
2555 
2556       if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2557         if (ST.enableFlatScratch() &&
2558             !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2559           // We didn't need the shift above, so we have an SGPR for the frame
2560           // register, but may have a VGPR only operand.
2561           //
2562           // TODO: On gfx10+, we can easily change the opcode to the e64 version
2563           // and use the higher constant bus restriction to avoid this copy.
2564 
2565           if (!ScavengedVGPR) {
2566             ScavengedVGPR = RS->scavengeRegisterBackwards(
2567                 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2568                 /*SPAdj=*/0);
2569           }
2570 
2571           assert(ScavengedVGPR != DstReg);
2572 
2573           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2574               .addReg(MaterializedReg,
2575                       MaterializedReg != FrameReg ? RegState::Kill : 0);
2576           MaterializedReg = ScavengedVGPR;
2577         }
2578 
2579         // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2580         // is not live, we could use a scalar add + vector add instead of 2
2581         // vector adds.
2582         auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2583                           .addDef(DstReg, RegState::Renamable);
2584         if (NumDefs == 2)
2585           AddI32.add(MI->getOperand(1));
2586 
2587         unsigned MaterializedRegFlags =
2588             MaterializedReg != FrameReg ? RegState::Kill : 0;
2589 
2590         if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2591           // If we know we have a VGPR already, it's more likely the other
2592           // operand is a legal vsrc0.
2593           AddI32
2594             .add(*OtherOp)
2595             .addReg(MaterializedReg, MaterializedRegFlags);
2596         } else {
2597           // Commute operands to avoid violating VOP2 restrictions. This will
2598           // typically happen when using scratch.
2599           AddI32
2600             .addReg(MaterializedReg, MaterializedRegFlags)
2601             .add(*OtherOp);
2602         }
2603 
2604         if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2605             MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2606           AddI32.addImm(0); // clamp
2607 
2608         if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2609           AddI32.setOperandDead(3); // Dead vcc
2610 
2611         MaterializedReg = DstReg;
2612 
2613         OtherOp->ChangeToRegister(MaterializedReg, false);
2614         OtherOp->setIsKill(true);
2615         FIOp->ChangeToImmediate(Offset);
2616         Offset = 0;
2617       } else if (Offset != 0) {
2618         assert(!MaterializedReg);
2619         FIOp->ChangeToImmediate(Offset);
2620         Offset = 0;
2621       } else {
2622         if (DeadVCC && !HasClamp) {
2623           assert(Offset == 0);
2624 
2625           // TODO: Losing kills and implicit operands. Just mutate to copy and
2626           // let lowerCopy deal with it?
2627           if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2628             // Folded to an identity copy.
2629             MI->eraseFromParent();
2630             return true;
2631           }
2632 
2633           // The immediate value should be in OtherOp
2634           MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2635           MI->removeOperand(FIOperandNum);
2636 
2637           unsigned NumOps = MI->getNumOperands();
2638           for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2639             MI->removeOperand(I);
2640 
2641           if (NumDefs == 2)
2642             MI->removeOperand(1);
2643 
2644           // The code below can't deal with a mov.
2645           return true;
2646         }
2647 
2648         // This folded to a constant, but we have to keep the add around for
2649         // pointless implicit defs or clamp modifier.
2650         FIOp->ChangeToImmediate(0);
2651       }
2652 
2653       // Try to improve legality by commuting.
2654       if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2655         std::swap(FIOp, OtherOp);
2656         std::swap(FIOperandNum, OtherOpIdx);
2657       }
2658 
2659       // We need at most one mov to satisfy the operand constraints. Prefer to
2660       // move the FI operand first, as it may be a literal in a VOP3
2661       // instruction.
2662       for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2663         if (!TII->isOperandLegal(*MI, SrcIdx)) {
2664           // If commuting didn't make the operands legal, we need to materialize
2665           // in a register.
2666           // TODO: Can use SGPR on gfx10+ in some cases.
2667           if (!ScavengedVGPR) {
2668             ScavengedVGPR = RS->scavengeRegisterBackwards(
2669                 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2670                 /*SPAdj=*/0);
2671           }
2672 
2673           assert(ScavengedVGPR != DstReg);
2674 
2675           MachineOperand &Src = MI->getOperand(SrcIdx);
2676           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2677               .add(Src);
2678 
2679           Src.ChangeToRegister(ScavengedVGPR, false);
2680           Src.setIsKill(true);
2681           break;
2682         }
2683       }
2684 
2685       // Fold out add of 0 case that can appear in kernels.
2686       if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2687         if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2688           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2689         }
2690 
2691         MI->eraseFromParent();
2692       }
2693 
2694       return true;
2695     }
2696     case AMDGPU::S_ADD_I32: {
2697       // TODO: Handle s_or_b32, s_and_b32.
2698       unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2699       MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2700 
2701       assert(FrameReg || MFI->isBottomOfStack());
2702 
2703       MachineOperand &DstOp = MI->getOperand(0);
2704       const DebugLoc &DL = MI->getDebugLoc();
2705       Register MaterializedReg = FrameReg;
2706 
2707       // Defend against live scc, which should never happen in practice.
2708       bool DeadSCC = MI->getOperand(3).isDead();
2709 
2710       Register TmpReg;
2711 
2712       // FIXME: Scavenger should figure out that the result register is
2713       // available. Also should do this for the v_add case.
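           // If the other source is not the destination, the add does not
           // read its destination, so the destination can double as the
           // temporary.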
2714       if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2715         TmpReg = DstOp.getReg();
2716 
2717       if (FrameReg && !ST.enableFlatScratch()) {
2718         // FIXME: In the common case where the add does not also read its result
2719         // (i.e. this isn't a reg += fi), the scavenger does not recognize the
2720         // destination register as available.
2721         if (!TmpReg)
2722           TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2723                                                  MI, false, 0);
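             // Scale the frame register down by the wavefront size so it is
             // in the same swizzled (per-lane) units as the frame index
             // offset applied below.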
2724         BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2725             .addDef(TmpReg, RegState::Renamable)
2726             .addReg(FrameReg)
2727             .addImm(ST.getWavefrontSizeLog2())
2728             .setOperandDead(3); // Set SCC dead
2729         MaterializedReg = TmpReg;
2730       }
2731 
2732       int64_t Offset = FrameInfo.getObjectOffset(Index);
2733 
2734       // For the non-immediate case, we could fall through to the default
2735       // handling, but we do an in-place update of the result register here to
2736       // avoid scavenging another register.
2737       if (OtherOp.isImm()) {
2738         OtherOp.setImm(OtherOp.getImm() + Offset);
2739         Offset = 0;
2740 
2741         if (MaterializedReg)
2742           FIOp->ChangeToRegister(MaterializedReg, false);
2743         else
2744           FIOp->ChangeToImmediate(0);
2745       } else if (MaterializedReg) {
2746         // If we can't fold the other operand, do another increment.
2747         Register DstReg = DstOp.getReg();
2748 
2749         if (!TmpReg && MaterializedReg == FrameReg) {
2750           TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2751                                                  MI, /*RestoreAfter=*/false, 0,
2752                                                  /*AllowSpill=*/false);
2753           DstReg = TmpReg;
2754         }
2755 
2756         auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2757                           .addDef(DstReg, RegState::Renamable)
2758                           .addReg(MaterializedReg, RegState::Kill)
2759                           .add(OtherOp);
2760         if (DeadSCC)
2761           AddI32.setOperandDead(3);
2762 
2763         MaterializedReg = DstReg;
2764 
2765         OtherOp.ChangeToRegister(MaterializedReg, false);
2766         OtherOp.setIsKill(true);
2767         OtherOp.setIsRenamable(true);
2768         FIOp->ChangeToImmediate(Offset);
2769       } else {
2770         // If we don't have any other offset to apply, we can just directly
2771         // interpret the frame index as the offset.
2772         FIOp->ChangeToImmediate(Offset);
2773       }
2774 
2775       if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2776         assert(Offset == 0);
2777         MI->removeOperand(3);
2778         MI->removeOperand(OtherOpIdx);
2779         MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2780       } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2781         assert(Offset == 0);
2782         MI->removeOperand(3);
2783         MI->removeOperand(FIOperandNum);
2784         MI->setDesc(
2785             TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2786       }
2787 
2788       assert(!FIOp->isFI());
2789       return true;
2790     }
2791     default: {
2792       break;
2793     }
2794     }
2795 
2796     int64_t Offset = FrameInfo.getObjectOffset(Index);
2797     if (ST.enableFlatScratch()) {
2798       if (TII->isFLATScratch(*MI)) {
2799         assert(
2800             (int16_t)FIOperandNum ==
2801             AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2802 
2803         // The offset is always swizzled, so just replace it.
2804         if (FrameReg)
2805           FIOp->ChangeToRegister(FrameReg, false);
2806 
2807         MachineOperand *OffsetOp =
2808             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2809         int64_t NewOffset = Offset + OffsetOp->getImm();
2810         if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2811                                    SIInstrFlags::FlatScratch)) {
2812           OffsetOp->setImm(NewOffset);
2813           if (FrameReg)
2814             return false;
2815           Offset = 0;
2816         }
2817 
2818         if (!Offset) {
2819           unsigned Opc = MI->getOpcode();
2820           int NewOpc = -1;
2821           if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2822             NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2823           } else if (ST.hasFlatScratchSTMode()) {
2824             // On GFX10 we have ST mode to use no registers for an address.
2825             // Otherwise we need to materialize 0 into an SGPR.
2826             NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2827           }
2828 
2829           if (NewOpc != -1) {
2830             // removeOperand doesn't fix up tied operand indexes as it goes, so
2831             // it asserts. Untie vdst_in for now and retie it afterwards.
2832             int VDstIn =
2833                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2834             bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2835                             MI->getOperand(VDstIn).isTied();
2836             if (TiedVDst)
2837               MI->untieRegOperand(VDstIn);
2838 
2839             MI->removeOperand(
2840                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2841 
2842             if (TiedVDst) {
2843               int NewVDst =
2844                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2845               int NewVDstIn =
2846                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2847               assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2848               MI->tieOperands(NewVDst, NewVDstIn);
2849             }
2850             MI->setDesc(TII->get(NewOpc));
2851             return false;
2852           }
2853         }
2854       }
2855 
2856       if (!FrameReg) {
2857         FIOp->ChangeToImmediate(Offset);
2858         if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2859           return false;
2860       }
2861 
2862       // We need to use a register here. Check if we can use an SGPR or
2863       // need a VGPR.
2864       FIOp->ChangeToRegister(AMDGPU::M0, false);
2865       bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2866 
2867       if (!Offset && FrameReg && UseSGPR) {
2868         FIOp->setReg(FrameReg);
2869         return false;
2870       }
2871 
2872       const TargetRegisterClass *RC =
2873           UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2874 
2875       Register TmpReg =
2876           RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2877       FIOp->setReg(TmpReg);
2878       FIOp->setIsKill();
2879 
2880       if ((!FrameReg || !Offset) && TmpReg) {
2881         unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2882         auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2883         if (FrameReg)
2884           MIB.addReg(FrameReg);
2885         else
2886           MIB.addImm(Offset);
2887 
2888         return false;
2889       }
2890 
2891       bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2892                          !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2893 
2894       Register TmpSReg =
2895           UseSGPR ? TmpReg
2896                   : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2897                                                   MI, false, 0, !UseSGPR);
2898 
2899       // TODO: for flat scratch another attempt can be made with a VGPR index
2900       //       if no SGPRs can be scavenged.
2901       if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2902         report_fatal_error("Cannot scavenge register in FI elimination!");
2903 
2904       if (!TmpSReg) {
2905         // Use frame register and restore it after.
2906         TmpSReg = FrameReg;
2907         FIOp->setReg(FrameReg);
2908         FIOp->setIsKill(false);
2909       }
2910 
2911       if (NeedSaveSCC) {
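             // Preserve the live SCC across the add: s_addc_u32 folds the
             // incoming SCC bit into bit 0 of the (even) sum, s_bitcmp1_b32
             // reloads that bit into SCC, and s_bitset0_b32 clears it again,
             // leaving TmpSReg = FrameReg + Offset. This relies on bit 0 of
             // FrameReg + Offset being clear; the assert below only checks
             // the offset.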
2912         assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2913         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2914             .addReg(FrameReg)
2915             .addImm(Offset);
2916         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2917             .addReg(TmpSReg)
2918             .addImm(0);
2919         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2920             .addImm(0)
2921             .addReg(TmpSReg);
2922       } else {
2923         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2924             .addReg(FrameReg)
2925             .addImm(Offset);
2926       }
2927 
2928       if (!UseSGPR)
2929         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2930             .addReg(TmpSReg, RegState::Kill);
2931 
2932       if (TmpSReg == FrameReg) {
2933         // Undo frame register modification.
2934         if (NeedSaveSCC &&
2935             !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2936           MachineBasicBlock::iterator I =
2937               BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2938                       TmpSReg)
2939                   .addReg(FrameReg)
2940                   .addImm(-Offset);
2941           I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2942                   .addReg(TmpSReg)
2943                   .addImm(0);
2944           BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2945                   TmpSReg)
2946               .addImm(0)
2947               .addReg(TmpSReg);
2948         } else {
2949           BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2950                   FrameReg)
2951               .addReg(FrameReg)
2952               .addImm(-Offset);
2953         }
2954       }
2955 
2956       return false;
2957     }
2958 
2959     bool IsMUBUF = TII->isMUBUF(*MI);
2960 
2961     if (!IsMUBUF && !MFI->isBottomOfStack()) {
2962       // Convert to a swizzled stack address by scaling by the wave size.
2963       // In an entry function/kernel the offset is already swizzled.
2964       bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
2965       bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2966                      !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2967       const TargetRegisterClass *RC = IsSALU && !LiveSCC
2968                                           ? &AMDGPU::SReg_32RegClass
2969                                           : &AMDGPU::VGPR_32RegClass;
2970       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2971                     MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
2972                     MI->getOpcode() == AMDGPU::S_MOV_B32;
2973       Register ResultReg =
2974           IsCopy ? MI->getOperand(0).getReg()
2975                  : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
2976 
2977       int64_t Offset = FrameInfo.getObjectOffset(Index);
2978       if (Offset == 0) {
2979         unsigned OpCode =
2980             IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
2981         Register TmpResultReg = ResultReg;
2982         if (IsSALU && LiveSCC) {
2983           TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2984                                                        MI, false, 0);
2985         }
2986 
2987         auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
2988         if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2989           // For V_LSHRREV, the operands are reversed (the shift count goes
2990           // first).
2991           Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2992         else
2993           Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2994         if (IsSALU && !LiveSCC)
2995           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
2996         if (IsSALU && LiveSCC) {
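               // SCC is live, so the shift above was done in a VGPR. Its
               // result is uniform (FrameReg is an SGPR), so copy it back
               // into an SGPR with v_readfirstlane.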
2997           Register NewDest =
2998               IsCopy ? ResultReg
2999                      : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
3000                                                      Shift, false, 0);
3001           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3002               .addReg(TmpResultReg);
3003           ResultReg = NewDest;
3004         }
3005       } else {
3006         MachineInstrBuilder MIB;
3007         if (!IsSALU) {
3008           if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3009               nullptr) {
3010             // Reuse ResultReg in intermediate step.
3011             Register ScaledReg = ResultReg;
3012 
3013             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3014                     ScaledReg)
3015                 .addImm(ST.getWavefrontSizeLog2())
3016                 .addReg(FrameReg);
3017 
3018             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3019 
3020             // TODO: Fold if use instruction is another add of a constant.
3021             if (IsVOP2 ||
3022                 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3023               // FIXME: This can fail
3024               MIB.addImm(Offset);
3025               MIB.addReg(ScaledReg, RegState::Kill);
3026               if (!IsVOP2)
3027                 MIB.addImm(0); // clamp bit
3028             } else {
3029               assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3030                      "Need to reuse carry out register");
3031 
3032               // Use scavenged unused carry out as offset register.
3033               Register ConstOffsetReg;
3034               if (!isWave32)
3035                 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3036               else
3037                 ConstOffsetReg = MIB.getReg(1);
3038 
3039               BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3040                       ConstOffsetReg)
3041                   .addImm(Offset);
3042               MIB.addReg(ConstOffsetReg, RegState::Kill);
3043               MIB.addReg(ScaledReg, RegState::Kill);
3044               MIB.addImm(0); // clamp bit
3045             }
3046           }
3047         }
3048         if (!MIB || IsSALU) {
3049           // We have to produce a carry out, and there isn't a free SGPR pair
3050           // for it. We can keep the whole computation on the SALU to avoid
3051           // clobbering an additional register at the cost of an extra mov.
3052 
3053           // We may have 1 free scratch SGPR even though a carry out is
3054           // unavailable. Only one additional mov is needed.
3055           Register TmpScaledReg = IsCopy && IsSALU
3056                                       ? ResultReg
3057                                       : RS->scavengeRegisterBackwards(
3058                                             AMDGPU::SReg_32_XM0RegClass, MI,
3059                                             false, 0, /*AllowSpill=*/false);
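               // If no SGPR could be scavenged, fall back to modifying the
               // frame register in place; it is restored by the undo
               // sequence further below.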
3060           Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3061           Register TmpResultReg = ScaledReg;
3062 
3063           if (!LiveSCC) {
3064             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3065                 .addReg(FrameReg)
3066                 .addImm(ST.getWavefrontSizeLog2());
3067             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3068                 .addReg(TmpResultReg, RegState::Kill)
3069                 .addImm(Offset);
3070           } else {
3071             TmpResultReg = RS->scavengeRegisterBackwards(
3072                 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3073 
3074             MachineInstrBuilder Add;
3075             if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3076               BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3077                       TmpResultReg)
3078                   .addImm(ST.getWavefrontSizeLog2())
3079                   .addReg(FrameReg);
3080               if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3081                 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3082                     .addImm(Offset);
3083                 Add.addReg(ResultReg, RegState::Kill)
3084                     .addReg(TmpResultReg, RegState::Kill)
3085                     .addImm(0);
3086               } else
3087                 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3088             } else {
3089               assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3090                      "offset is unsafe for v_mad_u32_u24");
3091 
3092               // We start with a frame pointer holding a wave-space value and an
3093               // offset in lane space, and we are materializing a lane-space
3094               // value. We can either right shift the frame pointer to get to
3095               // lane space, or left shift the offset to get to wave space and
3096               // right shift the result back to the desired per-lane value. We
3097               // take the latter route, using the mad_u32_u24 primarily as an
3098               // add with no carry-out clobber.
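                   // In effect (ignoring overflow):
                   //   ((Offset * WaveSize) + FrameReg) >> log2(WaveSize)
                   //     == Offset + (FrameReg >> log2(WaveSize)),
                   // the per-lane value being materialized.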
3099               bool IsInlinableLiteral =
3100                   AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3101               if (!IsInlinableLiteral) {
3102                 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3103                         TmpResultReg)
3104                     .addImm(Offset);
3105               }
3106 
3107               Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3108                             TmpResultReg);
3109 
3110               if (!IsInlinableLiteral) {
3111                 Add.addReg(TmpResultReg, RegState::Kill);
3112               } else {
3113                 // We fold the offset into the mad itself if it's inlinable.
3114                 Add.addImm(Offset);
3115               }
3116               Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3117               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3118                       TmpResultReg)
3119                   .addImm(ST.getWavefrontSizeLog2())
3120                   .addReg(TmpResultReg);
3121             }
3122 
3123             Register NewDest = IsCopy ? ResultReg
3124                                       : RS->scavengeRegisterBackwards(
3125                                             AMDGPU::SReg_32RegClass, *Add,
3126                                             false, 0, /*AllowSpill=*/true);
3127             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3128                     NewDest)
3129                 .addReg(TmpResultReg);
3130             ResultReg = NewDest;
3131           }
3132           if (!IsSALU)
3133             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3134                 .addReg(TmpResultReg, RegState::Kill);
3135           else
3136             ResultReg = TmpResultReg;
3137           // If there were truly no free SGPRs, we need to undo everything.
3138           if (!TmpScaledReg.isValid()) {
3139             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3140                 .addReg(ScaledReg, RegState::Kill)
3141                 .addImm(-Offset);
3142             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3143                 .addReg(FrameReg)
3144                 .addImm(ST.getWavefrontSizeLog2());
3145           }
3146         }
3147       }
3148 
3149       // Don't introduce an extra copy if we're just materializing into a mov.
3150       if (IsCopy) {
3151         MI->eraseFromParent();
3152         return true;
3153       }
3154       FIOp->ChangeToRegister(ResultReg, false, false, true);
3155       return false;
3156     }
3157 
3158     if (IsMUBUF) {
3159       // Disable offen so we don't need a 0 vgpr base.
3160       assert(
3161           static_cast<int>(FIOperandNum) ==
3162           AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3163 
3164       auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3165       assert((SOffset.isImm() && SOffset.getImm() == 0));
3166 
3167       if (FrameReg != AMDGPU::NoRegister)
3168         SOffset.ChangeToRegister(FrameReg, false);
3169 
3170       int64_t Offset = FrameInfo.getObjectOffset(Index);
3171       int64_t OldImm =
3172           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3173       int64_t NewOffset = OldImm + Offset;
3174 
3175       if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3176           buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3177         MI->eraseFromParent();
3178         return true;
3179       }
3180     }
3181 
3182     // If the offset is simply too big, don't convert to a scratch wave offset
3183     // relative index.
3184 
3185     FIOp->ChangeToImmediate(Offset);
3186     if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3187       Register TmpReg =
3188           RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3189       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3190           .addImm(Offset);
3191       FIOp->ChangeToRegister(TmpReg, false, false, true);
3192     }
3193   }
3194   return false;
3195 }
3196 
3197 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
3198   return AMDGPUInstPrinter::getRegisterName(Reg);
3199 }
3200 
3201 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
3202   return getRegBitWidth(RC.getID());
3203 }
3204 
3205 static const TargetRegisterClass *
3206 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3207   if (BitWidth == 64)
3208     return &AMDGPU::VReg_64RegClass;
3209   if (BitWidth == 96)
3210     return &AMDGPU::VReg_96RegClass;
3211   if (BitWidth == 128)
3212     return &AMDGPU::VReg_128RegClass;
3213   if (BitWidth == 160)
3214     return &AMDGPU::VReg_160RegClass;
3215   if (BitWidth == 192)
3216     return &AMDGPU::VReg_192RegClass;
3217   if (BitWidth == 224)
3218     return &AMDGPU::VReg_224RegClass;
3219   if (BitWidth == 256)
3220     return &AMDGPU::VReg_256RegClass;
3221   if (BitWidth == 288)
3222     return &AMDGPU::VReg_288RegClass;
3223   if (BitWidth == 320)
3224     return &AMDGPU::VReg_320RegClass;
3225   if (BitWidth == 352)
3226     return &AMDGPU::VReg_352RegClass;
3227   if (BitWidth == 384)
3228     return &AMDGPU::VReg_384RegClass;
3229   if (BitWidth == 512)
3230     return &AMDGPU::VReg_512RegClass;
3231   if (BitWidth == 1024)
3232     return &AMDGPU::VReg_1024RegClass;
3233 
3234   return nullptr;
3235 }
3236 
3237 static const TargetRegisterClass *
3238 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3239   if (BitWidth == 64)
3240     return &AMDGPU::VReg_64_Align2RegClass;
3241   if (BitWidth == 96)
3242     return &AMDGPU::VReg_96_Align2RegClass;
3243   if (BitWidth == 128)
3244     return &AMDGPU::VReg_128_Align2RegClass;
3245   if (BitWidth == 160)
3246     return &AMDGPU::VReg_160_Align2RegClass;
3247   if (BitWidth == 192)
3248     return &AMDGPU::VReg_192_Align2RegClass;
3249   if (BitWidth == 224)
3250     return &AMDGPU::VReg_224_Align2RegClass;
3251   if (BitWidth == 256)
3252     return &AMDGPU::VReg_256_Align2RegClass;
3253   if (BitWidth == 288)
3254     return &AMDGPU::VReg_288_Align2RegClass;
3255   if (BitWidth == 320)
3256     return &AMDGPU::VReg_320_Align2RegClass;
3257   if (BitWidth == 352)
3258     return &AMDGPU::VReg_352_Align2RegClass;
3259   if (BitWidth == 384)
3260     return &AMDGPU::VReg_384_Align2RegClass;
3261   if (BitWidth == 512)
3262     return &AMDGPU::VReg_512_Align2RegClass;
3263   if (BitWidth == 1024)
3264     return &AMDGPU::VReg_1024_Align2RegClass;
3265 
3266   return nullptr;
3267 }
3268 
3269 const TargetRegisterClass *
3270 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3271   if (BitWidth == 1)
3272     return &AMDGPU::VReg_1RegClass;
3273   if (BitWidth == 16)
3274     return &AMDGPU::VGPR_16RegClass;
3275   if (BitWidth == 32)
3276     return &AMDGPU::VGPR_32RegClass;
3277   return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3278                                 : getAnyVGPRClassForBitWidth(BitWidth);
3279 }
3280 
3281 static const TargetRegisterClass *
3282 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3283   if (BitWidth == 64)
3284     return &AMDGPU::AReg_64RegClass;
3285   if (BitWidth == 96)
3286     return &AMDGPU::AReg_96RegClass;
3287   if (BitWidth == 128)
3288     return &AMDGPU::AReg_128RegClass;
3289   if (BitWidth == 160)
3290     return &AMDGPU::AReg_160RegClass;
3291   if (BitWidth == 192)
3292     return &AMDGPU::AReg_192RegClass;
3293   if (BitWidth == 224)
3294     return &AMDGPU::AReg_224RegClass;
3295   if (BitWidth == 256)
3296     return &AMDGPU::AReg_256RegClass;
3297   if (BitWidth == 288)
3298     return &AMDGPU::AReg_288RegClass;
3299   if (BitWidth == 320)
3300     return &AMDGPU::AReg_320RegClass;
3301   if (BitWidth == 352)
3302     return &AMDGPU::AReg_352RegClass;
3303   if (BitWidth == 384)
3304     return &AMDGPU::AReg_384RegClass;
3305   if (BitWidth == 512)
3306     return &AMDGPU::AReg_512RegClass;
3307   if (BitWidth == 1024)
3308     return &AMDGPU::AReg_1024RegClass;
3309 
3310   return nullptr;
3311 }
3312 
3313 static const TargetRegisterClass *
3314 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3315   if (BitWidth == 64)
3316     return &AMDGPU::AReg_64_Align2RegClass;
3317   if (BitWidth == 96)
3318     return &AMDGPU::AReg_96_Align2RegClass;
3319   if (BitWidth == 128)
3320     return &AMDGPU::AReg_128_Align2RegClass;
3321   if (BitWidth == 160)
3322     return &AMDGPU::AReg_160_Align2RegClass;
3323   if (BitWidth == 192)
3324     return &AMDGPU::AReg_192_Align2RegClass;
3325   if (BitWidth == 224)
3326     return &AMDGPU::AReg_224_Align2RegClass;
3327   if (BitWidth == 256)
3328     return &AMDGPU::AReg_256_Align2RegClass;
3329   if (BitWidth == 288)
3330     return &AMDGPU::AReg_288_Align2RegClass;
3331   if (BitWidth == 320)
3332     return &AMDGPU::AReg_320_Align2RegClass;
3333   if (BitWidth == 352)
3334     return &AMDGPU::AReg_352_Align2RegClass;
3335   if (BitWidth == 384)
3336     return &AMDGPU::AReg_384_Align2RegClass;
3337   if (BitWidth == 512)
3338     return &AMDGPU::AReg_512_Align2RegClass;
3339   if (BitWidth == 1024)
3340     return &AMDGPU::AReg_1024_Align2RegClass;
3341 
3342   return nullptr;
3343 }
3344 
3345 const TargetRegisterClass *
3346 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3347   if (BitWidth == 16)
3348     return &AMDGPU::AGPR_LO16RegClass;
3349   if (BitWidth == 32)
3350     return &AMDGPU::AGPR_32RegClass;
3351   return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3352                                 : getAnyAGPRClassForBitWidth(BitWidth);
3353 }
3354 
3355 static const TargetRegisterClass *
3356 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3357   if (BitWidth == 64)
3358     return &AMDGPU::AV_64RegClass;
3359   if (BitWidth == 96)
3360     return &AMDGPU::AV_96RegClass;
3361   if (BitWidth == 128)
3362     return &AMDGPU::AV_128RegClass;
3363   if (BitWidth == 160)
3364     return &AMDGPU::AV_160RegClass;
3365   if (BitWidth == 192)
3366     return &AMDGPU::AV_192RegClass;
3367   if (BitWidth == 224)
3368     return &AMDGPU::AV_224RegClass;
3369   if (BitWidth == 256)
3370     return &AMDGPU::AV_256RegClass;
3371   if (BitWidth == 288)
3372     return &AMDGPU::AV_288RegClass;
3373   if (BitWidth == 320)
3374     return &AMDGPU::AV_320RegClass;
3375   if (BitWidth == 352)
3376     return &AMDGPU::AV_352RegClass;
3377   if (BitWidth == 384)
3378     return &AMDGPU::AV_384RegClass;
3379   if (BitWidth == 512)
3380     return &AMDGPU::AV_512RegClass;
3381   if (BitWidth == 1024)
3382     return &AMDGPU::AV_1024RegClass;
3383 
3384   return nullptr;
3385 }
3386 
3387 static const TargetRegisterClass *
3388 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3389   if (BitWidth == 64)
3390     return &AMDGPU::AV_64_Align2RegClass;
3391   if (BitWidth == 96)
3392     return &AMDGPU::AV_96_Align2RegClass;
3393   if (BitWidth == 128)
3394     return &AMDGPU::AV_128_Align2RegClass;
3395   if (BitWidth == 160)
3396     return &AMDGPU::AV_160_Align2RegClass;
3397   if (BitWidth == 192)
3398     return &AMDGPU::AV_192_Align2RegClass;
3399   if (BitWidth == 224)
3400     return &AMDGPU::AV_224_Align2RegClass;
3401   if (BitWidth == 256)
3402     return &AMDGPU::AV_256_Align2RegClass;
3403   if (BitWidth == 288)
3404     return &AMDGPU::AV_288_Align2RegClass;
3405   if (BitWidth == 320)
3406     return &AMDGPU::AV_320_Align2RegClass;
3407   if (BitWidth == 352)
3408     return &AMDGPU::AV_352_Align2RegClass;
3409   if (BitWidth == 384)
3410     return &AMDGPU::AV_384_Align2RegClass;
3411   if (BitWidth == 512)
3412     return &AMDGPU::AV_512_Align2RegClass;
3413   if (BitWidth == 1024)
3414     return &AMDGPU::AV_1024_Align2RegClass;
3415 
3416   return nullptr;
3417 }
3418 
3419 const TargetRegisterClass *
3420 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3421   if (BitWidth == 32)
3422     return &AMDGPU::AV_32RegClass;
3423   return ST.needsAlignedVGPRs()
3424              ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3425              : getAnyVectorSuperClassForBitWidth(BitWidth);
3426 }
3427 
3428 const TargetRegisterClass *
3429 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3430   if (BitWidth == 16)
3431     return &AMDGPU::SGPR_LO16RegClass;
3432   if (BitWidth == 32)
3433     return &AMDGPU::SReg_32RegClass;
3434   if (BitWidth == 64)
3435     return &AMDGPU::SReg_64RegClass;
3436   if (BitWidth == 96)
3437     return &AMDGPU::SGPR_96RegClass;
3438   if (BitWidth == 128)
3439     return &AMDGPU::SGPR_128RegClass;
3440   if (BitWidth == 160)
3441     return &AMDGPU::SGPR_160RegClass;
3442   if (BitWidth == 192)
3443     return &AMDGPU::SGPR_192RegClass;
3444   if (BitWidth == 224)
3445     return &AMDGPU::SGPR_224RegClass;
3446   if (BitWidth == 256)
3447     return &AMDGPU::SGPR_256RegClass;
3448   if (BitWidth == 288)
3449     return &AMDGPU::SGPR_288RegClass;
3450   if (BitWidth == 320)
3451     return &AMDGPU::SGPR_320RegClass;
3452   if (BitWidth == 352)
3453     return &AMDGPU::SGPR_352RegClass;
3454   if (BitWidth == 384)
3455     return &AMDGPU::SGPR_384RegClass;
3456   if (BitWidth == 512)
3457     return &AMDGPU::SGPR_512RegClass;
3458   if (BitWidth == 1024)
3459     return &AMDGPU::SGPR_1024RegClass;
3460 
3461   return nullptr;
3462 }
3463 
3464 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3465                                Register Reg) const {
3466   const TargetRegisterClass *RC;
3467   if (Reg.isVirtual())
3468     RC = MRI.getRegClass(Reg);
3469   else
3470     RC = getPhysRegBaseClass(Reg);
3471   return RC ? isSGPRClass(RC) : false;
3472 }
3473 
3474 const TargetRegisterClass *
3475 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3476   unsigned Size = getRegSizeInBits(*SRC);
3477   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
3478   assert(VRC && "Invalid register class size");
3479   return VRC;
3480 }
3481 
3482 const TargetRegisterClass *
3483 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3484   unsigned Size = getRegSizeInBits(*SRC);
3485   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3486   assert(ARC && "Invalid register class size");
3487   return ARC;
3488 }
3489 
3490 const TargetRegisterClass *
3491 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3492   unsigned Size = getRegSizeInBits(*VRC);
3493   if (Size == 32)
3494     return &AMDGPU::SGPR_32RegClass;
3495   const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3496   assert(SRC && "Invalid register class size");
3497   return SRC;
3498 }
3499 
3500 const TargetRegisterClass *
3501 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3502                                          const TargetRegisterClass *SubRC,
3503                                          unsigned SubIdx) const {
3504   // Ensure this subregister index is aligned in the super register.
3505   const TargetRegisterClass *MatchRC =
3506       getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3507   return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3508 }
3509 
3510 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3511   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3512       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3513     return !ST.hasMFMAInlineLiteralBug();
3514 
3515   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3516          OpType <= AMDGPU::OPERAND_SRC_LAST;
3517 }
3518 
3519 bool SIRegisterInfo::shouldRewriteCopySrc(
3520   const TargetRegisterClass *DefRC,
3521   unsigned DefSubReg,
3522   const TargetRegisterClass *SrcRC,
3523   unsigned SrcSubReg) const {
3524   // We want to prefer the smallest register class possible, so we don't want to
3525   // stop and rewrite on anything that looks like a subregister
3526   // extract. Operations mostly don't care about the super register class, so we
3527   // only want to stop on the most basic of copies between the same register
3528   // class.
3529   //
3530   // e.g. if we have something like
3531   // %0 = ...
3532   // %1 = ...
3533   // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
3534   // %3 = COPY %2, sub0
3535   //
3536   // We want to look through the COPY to find:
3537   //  => %3 = COPY %0
3538 
3539   // Plain copy.
3540   return getCommonSubClass(DefRC, SrcRC) != nullptr;
3541 }
3542 
3543 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3544   // TODO: 64-bit operands have extending behavior from 32-bit literal.
3545   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3546          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3547 }
3548 
3549 /// Returns the lowest register that is not used at any point in the function.
3550 /// If all registers are used, then this function will return
3551 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
3552 /// highest unused register.
3553 MCRegister SIRegisterInfo::findUnusedRegister(
3554     const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3555     const MachineFunction &MF, bool ReserveHighestRegister) const {
3556   if (ReserveHighestRegister) {
3557     for (MCRegister Reg : reverse(*RC))
3558       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3559         return Reg;
3560   } else {
3561     for (MCRegister Reg : *RC)
3562       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3563         return Reg;
3564   }
3565   return MCRegister();
3566 }
3567 
3568 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3569                                   const RegisterBankInfo &RBI,
3570                                   Register Reg) const {
3571   auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3572   if (!RB)
3573     return false;
3574 
3575   return !RBI.isDivergentRegBank(RB);
3576 }
3577 
3578 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3579                                                    unsigned EltSize) const {
3580   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3581   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
3582 
3583   const unsigned RegDWORDs = RegBitWidth / 32;
3584   const unsigned EltDWORDs = EltSize / 4;
3585   assert(RegSplitParts.size() + 1 >= EltDWORDs);
3586 
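       // Each RegSplitParts entry lists, in order, the sub-register indexes
       // that split the widest register into equal EltSize pieces; a register
       // of this class only needs the first RegDWORDs / EltDWORDs of them.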
3587   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
3588   const unsigned NumParts = RegDWORDs / EltDWORDs;
3589 
3590   return ArrayRef(Parts.data(), NumParts);
3591 }
3592 
3593 const TargetRegisterClass*
3594 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3595                                   Register Reg) const {
3596   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3597 }
3598 
3599 const TargetRegisterClass *
3600 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3601                                          const MachineOperand &MO) const {
3602   const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3603   return getSubRegisterClass(SrcRC, MO.getSubReg());
3604 }
3605 
3606 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3607                             Register Reg) const {
3608   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3609   // Registers without classes are unaddressable, SGPR-like registers.
3610   return RC && isVGPRClass(RC);
3611 }
3612 
3613 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3614                             Register Reg) const {
3615   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3616 
3617   // Registers without classes are unaddressable, SGPR-like registers.
3618   return RC && isAGPRClass(RC);
3619 }
3620 
3621 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3622                                     const TargetRegisterClass *SrcRC,
3623                                     unsigned SubReg,
3624                                     const TargetRegisterClass *DstRC,
3625                                     unsigned DstSubReg,
3626                                     const TargetRegisterClass *NewRC,
3627                                     LiveIntervals &LIS) const {
3628   unsigned SrcSize = getRegSizeInBits(*SrcRC);
3629   unsigned DstSize = getRegSizeInBits(*DstRC);
3630   unsigned NewSize = getRegSizeInBits(*NewRC);
3631 
3632   // Do not increase the size of registers beyond a dword; we would need to
3633   // allocate adjacent registers and constrain regalloc more than needed.
3634 
3635   // Always allow dword coalescing.
3636   if (SrcSize <= 32 || DstSize <= 32)
3637     return true;
3638 
3639   return NewSize <= DstSize || NewSize <= SrcSize;
3640 }
3641 
3642 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3643                                              MachineFunction &MF) const {
3644   unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
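       // Cap each class at both the register budget implied by the minimum
       // achievable occupancy and the function's own limit.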
3645   switch (RC->getID()) {
3646   default:
3647     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3648   case AMDGPU::VGPR_32RegClassID:
3649     return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
3650   case AMDGPU::SGPR_32RegClassID:
3651   case AMDGPU::SGPR_LO16RegClassID:
3652     return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3653   }
3654 }
3655 
3656 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3657                                                 unsigned Idx) const {
3658   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3659       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3660     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3661                                const_cast<MachineFunction &>(MF));
3662 
3663   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3664     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3665                                const_cast<MachineFunction &>(MF));
3666 
3667   llvm_unreachable("Unexpected register pressure set!");
3668 }
3669 
3670 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3671   static const int Empty[] = { -1 };
3672 
3673   if (RegPressureIgnoredUnits[RegUnit])
3674     return Empty;
3675 
3676   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3677 }
3678 
3679 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3680   // Not a callee saved register.
3681   return AMDGPU::SGPR30_SGPR31;
3682 }
3683 
3684 const TargetRegisterClass *
3685 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3686                                          const RegisterBank &RB) const {
3687   switch (RB.getID()) {
3688   case AMDGPU::VGPRRegBankID:
3689     return getVGPRClassForBitWidth(
3690         std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3691   case AMDGPU::VCCRegBankID:
3692     assert(Size == 1);
3693     return getWaveMaskRegClass();
3694   case AMDGPU::SGPRRegBankID:
3695     return getSGPRClassForBitWidth(std::max(32u, Size));
3696   case AMDGPU::AGPRRegBankID:
3697     return getAGPRClassForBitWidth(std::max(32u, Size));
3698   default:
3699     llvm_unreachable("unknown register bank");
3700   }
3701 }
3702 
3703 const TargetRegisterClass *
3704 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3705                                          const MachineRegisterInfo &MRI) const {
3706   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3707   if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3708     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3709 
3710   if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3711     return getAllocatableClass(RC);
3712 
3713   return nullptr;
3714 }
3715 
3716 MCRegister SIRegisterInfo::getVCC() const {
3717   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3718 }
3719 
3720 MCRegister SIRegisterInfo::getExec() const {
3721   return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3722 }
3723 
3724 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3725   // VGPR tuples have an alignment requirement on gfx90a variants.
3726   return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3727                                 : &AMDGPU::VReg_64RegClass;
3728 }
3729 
3730 const TargetRegisterClass *
3731 SIRegisterInfo::getRegClass(unsigned RCID) const {
3732   switch ((int)RCID) {
3733   case AMDGPU::SReg_1RegClassID:
3734     return getBoolRC();
3735   case AMDGPU::SReg_1_XEXECRegClassID:
3736     return getWaveMaskRegClass();
3737   case -1:
3738     return nullptr;
3739   default:
3740     return AMDGPUGenRegisterInfo::getRegClass(RCID);
3741   }
3742 }
3743 
3744 // Find the reaching register definition.
3745 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3746                                               MachineInstr &Use,
3747                                               MachineRegisterInfo &MRI,
3748                                               LiveIntervals *LIS) const {
3749   auto &MDT = LIS->getDomTree();
3750   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3751   SlotIndex DefIdx;
3752 
3753   if (Reg.isVirtual()) {
3754     if (!LIS->hasInterval(Reg))
3755       return nullptr;
3756     LiveInterval &LI = LIS->getInterval(Reg);
3757     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3758                                   : MRI.getMaxLaneMaskForVReg(Reg);
3759     VNInfo *V = nullptr;
3760     if (LI.hasSubRanges()) {
3761       for (auto &S : LI.subranges()) {
3762         if ((S.LaneMask & SubLanes) == SubLanes) {
3763           V = S.getVNInfoAt(UseIdx);
3764           break;
3765         }
3766       }
3767     } else {
3768       V = LI.getVNInfoAt(UseIdx);
3769     }
3770     if (!V)
3771       return nullptr;
3772     DefIdx = V->def;
3773   } else {
3774     // Find last def.
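         // Across all register units, keep the def that the others dominate
         // (i.e. the latest one); bail out if any unit has no value live at
         // the use.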
3775     for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3776       LiveRange &LR = LIS->getRegUnit(Unit);
3777       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3778         if (!DefIdx.isValid() ||
3779             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3780                           LIS->getInstructionFromIndex(V->def)))
3781           DefIdx = V->def;
3782       } else {
3783         return nullptr;
3784       }
3785     }
3786   }
3787 
3788   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3789 
3790   if (!Def || !MDT.dominates(Def, &Use))
3791     return nullptr;
3792 
3793   assert(Def->modifiesRegister(Reg, this));
3794 
3795   return Def;
3796 }
3797 
3798 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3799   assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3800 
3801   for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3802                                          AMDGPU::SReg_32RegClass,
3803                                          AMDGPU::AGPR_32RegClass } ) {
3804     if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3805       return Super;
3806   }
3807   if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3808                                             &AMDGPU::VGPR_32RegClass)) {
3809     return Super;
3810   }
3811 
3812   return AMDGPU::NoRegister;
3813 }
3814 
3815 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3816   if (!ST.needsAlignedVGPRs())
3817     return true;
3818 
3819   if (isVGPRClass(&RC))
3820     return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3821   if (isAGPRClass(&RC))
3822     return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3823   if (isVectorSuperClass(&RC))
3824     return RC.hasSuperClassEq(
3825         getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3826 
3827   return true;
3828 }
3829 
3830 const TargetRegisterClass *
3831 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3832   if (!RC || !ST.needsAlignedVGPRs())
3833     return RC;
3834 
3835   unsigned Size = getRegSizeInBits(*RC);
3836   if (Size <= 32)
3837     return RC;
3838 
3839   if (isVGPRClass(RC))
3840     return getAlignedVGPRClassForBitWidth(Size);
3841   if (isAGPRClass(RC))
3842     return getAlignedAGPRClassForBitWidth(Size);
3843   if (isVectorSuperClass(RC))
3844     return getAlignedVectorSuperClassForBitWidth(Size);
3845 
3846   return RC;
3847 }
3848 
3849 ArrayRef<MCPhysReg>
3850 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3851   return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3852 }
3853 
3854 ArrayRef<MCPhysReg>
3855 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3856   return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3857 }
3858 
3859 ArrayRef<MCPhysReg>
3860 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3861   return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3862 }
3863 
3864 unsigned
3865 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3866                                           unsigned SubReg) const {
3867   switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3868   case SIRCFlags::HasSGPR:
3869     return std::min(128u, getSubRegIdxSize(SubReg));
3870   case SIRCFlags::HasAGPR:
3871   case SIRCFlags::HasVGPR:
3872   case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
3873     return std::min(32u, getSubRegIdxSize(SubReg));
3874   default:
3875     break;
3876   }
3877   return 0;
3878 }
3879 
3880 unsigned
3881 SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
3882                                    const TargetRegisterClass &RC) const {
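       // Walk from the highest register in the class downwards; the first one
       // marked used determines, via its hardware index, how many registers
       // are in use.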
3883   for (MCPhysReg Reg : reverse(RC.getRegisters()))
3884     if (MRI.isPhysRegUsed(Reg))
3885       return getHWRegIndex(Reg) + 1;
3886   return 0;
3887 }
3888 
3889 SmallVector<StringLiteral>
3890 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
3891                                   const MachineFunction &MF) const {
3892   SmallVector<StringLiteral> RegFlags;
3893   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3894   if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
3895     RegFlags.push_back("WWM_REG");
3896   return RegFlags;
3897 }
3898