xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (revision 3a29dfe37c585355dc70c7c614f5bbf071cd7efb)
1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) requires special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. There is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   MachineIRBuilder &B;
101   const AMDGPURegisterBankInfo &RBI;
102   MachineRegisterInfo &MRI;
103   const RegisterBank *NewBank;
104   SmallVector<MachineInstr *, 4> NewInsts;
105 
106 public:
107   ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
108                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
109       : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
110     assert(!B.isObservingChanges());
111     B.setChangeObserver(*this);
112   }
113 
114   ~ApplyRegBankMapping() override {
115     for (MachineInstr *MI : NewInsts)
116       applyBank(*MI);
117 
118     B.stopObservingChanges();
119   }
120 
121   /// Set any registers that don't have a set register class or bank to SALU.
122   void applyBank(MachineInstr &MI) {
123     const unsigned Opc = MI.getOpcode();
124     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
125         Opc == AMDGPU::G_SEXT) {
126       // LegalizerHelper wants to use the basic legalization artifacts when
127       // widening etc. We don't handle selection with vcc in artifact sources,
128       // so we need to use a select instead to handle these properly.
129       Register DstReg = MI.getOperand(0).getReg();
130       Register SrcReg = MI.getOperand(1).getReg();
131       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
132       if (SrcBank == &AMDGPU::VCCRegBank) {
133         const LLT S32 = LLT::scalar(32);
134         assert(MRI.getType(SrcReg) == LLT::scalar(1));
135         assert(MRI.getType(DstReg) == S32);
136         assert(NewBank == &AMDGPU::VGPRRegBank);
137 
138         // Replace the extension with a select, which really uses the boolean
139         // source.
140         B.setInsertPt(*MI.getParent(), MI);
141 
142         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
143         auto False = B.buildConstant(S32, 0);
144         B.buildSelect(DstReg, SrcReg, True, False);
145         MRI.setRegBank(True.getReg(0), *NewBank);
146         MRI.setRegBank(False.getReg(0), *NewBank);
147         MI.eraseFromParent();
148       }
149 
150       assert(!MRI.getRegClassOrRegBank(DstReg));
151       MRI.setRegBank(DstReg, *NewBank);
152       return;
153     }
154 
155 #ifndef NDEBUG
156     if (Opc == AMDGPU::G_TRUNC) {
157       Register DstReg = MI.getOperand(0).getReg();
158       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
159       assert(DstBank != &AMDGPU::VCCRegBank);
160     }
161 #endif
162 
163     for (MachineOperand &Op : MI.operands()) {
164       if (!Op.isReg())
165         continue;
166 
167       // We may see physical registers if building a real MI
168       Register Reg = Op.getReg();
169       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
170         continue;
171 
172       const RegisterBank *RB = NewBank;
173       if (MRI.getType(Reg) == LLT::scalar(1)) {
174         assert(NewBank == &AMDGPU::VGPRRegBank &&
175                "s1 operands should only be used for vector bools");
176         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
177                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
178                "not expecting legalization artifacts here");
179         RB = &AMDGPU::VCCRegBank;
180       }
181 
182       MRI.setRegBank(Reg, *RB);
183     }
184   }
185 
186   void erasingInstr(MachineInstr &MI) override {}
187 
188   void createdInstr(MachineInstr &MI) override {
189     // At this point, the instruction was just inserted and has no operands.
190     NewInsts.push_back(&MI);
191   }
192 
193   void changingInstr(MachineInstr &MI) override {}
194   void changedInstr(MachineInstr &MI) override {
195     // FIXME: In principle we should probably add the instruction to NewInsts,
196     // but the way the LegalizerHelper uses the observer, we will always see the
197     // registers we need to set the regbank on also referenced in a new
198     // instruction.
199   }
200 };
201 
202 } // anonymous namespace
203 
204 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
205     : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206       TII(Subtarget.getInstrInfo()) {
207 
208   // HACK: Until this is fully tablegen'd.
209   static llvm::once_flag InitializeRegisterBankFlag;
210 
211   static auto InitializeRegisterBankOnce = [this]() {
212     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215     (void)this;
216   };
217 
218   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219 }
220 
221 static bool isVectorRegisterBank(const RegisterBank &Bank) {
222   unsigned BankID = Bank.getID();
223   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224 }
225 
226 bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
227   return RB != &AMDGPU::SGPRRegBank;
228 }
229 
230 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
231                                           const RegisterBank &Src,
232                                           TypeSize Size) const {
233   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236     return std::numeric_limits<unsigned>::max();
237   }
238 
239   // Bool values are tricky, because the meaning is based on context. The SCC
240   // and VCC banks are for the natural scalar and vector conditions produced by
241   // a compare.
242   //
243   // Legalization doesn't know about the necessary context, so an s1 use may
244   // have been a truncate from an arbitrary value, in which case a copy (lowered
245   // as a compare with 0) needs to be inserted.
246   if (Size == 1 &&
247       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248       (isVectorRegisterBank(Src) ||
249        Src.getID() == AMDGPU::SGPRRegBankID ||
250        Src.getID() == AMDGPU::VCCRegBankID))
251     return std::numeric_limits<unsigned>::max();
252 
253   // There is no direct copy between AGPRs.
254   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255       Src.getID() == AMDGPU::AGPRRegBankID)
256     return 4;
257 
258   return RegisterBankInfo::copyCost(Dst, Src, Size);
259 }
260 
261 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
262   const ValueMapping &ValMapping,
263   const RegisterBank *CurBank) const {
264   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265   // VGPR.
266   // FIXME: Is there a better way to do this?
267   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268     return 10; // This is expensive.
269 
270   assert(ValMapping.NumBreakDowns == 2 &&
271          ValMapping.BreakDown[0].Length == 32 &&
272          ValMapping.BreakDown[0].StartIdx == 0 &&
273          ValMapping.BreakDown[1].Length == 32 &&
274          ValMapping.BreakDown[1].StartIdx == 32 &&
275          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276 
277   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279   // want.
280 
281   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282   // alignment restrictions, but this probably isn't important.
283   return 1;
284 }
285 
286 const RegisterBank &
287 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
288                                                LLT Ty) const {
289   if (&RC == &AMDGPU::SReg_1RegClass)
290     return AMDGPU::VCCRegBank;
291 
292   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293   // VCC-like use.
294   if (TRI->isSGPRClass(&RC)) {
295     // FIXME: This probably came from a copy from a physical register, which
296     // should be inferable from the copied to-type. We don't have many boolean
297     // physical register constraints so just assume a normal SGPR for now.
298     if (!Ty.isValid())
299       return AMDGPU::SGPRRegBank;
300 
301     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302   }
303 
304   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305 }
306 
307 template <unsigned NumOps>
308 RegisterBankInfo::InstructionMappings
309 AMDGPURegisterBankInfo::addMappingFromTable(
310     const MachineInstr &MI, const MachineRegisterInfo &MRI,
311     const std::array<unsigned, NumOps> RegSrcOpIdx,
312     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313 
314   InstructionMappings AltMappings;
315 
316   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
317 
318   unsigned Sizes[NumOps];
319   for (unsigned I = 0; I < NumOps; ++I) {
320     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322   }
323 
324   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327   }
328 
329   // getInstrMapping's default mapping uses ID 1, so start at 2.
330   unsigned MappingID = 2;
331   for (const auto &Entry : Table) {
332     for (unsigned I = 0; I < NumOps; ++I) {
333       int OpIdx = RegSrcOpIdx[I];
334       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335     }
336 
337     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
338                                                  getOperandsMapping(Operands),
339                                                  Operands.size()));
340   }
341 
342   return AltMappings;
343 }
344 
345 RegisterBankInfo::InstructionMappings
346 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
347     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349   case Intrinsic::amdgcn_readlane: {
350     static const OpRegBankEntry<3> Table[2] = {
351       // Perfectly legal.
352       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353 
354       // Need a readfirstlane for the index.
355       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356     };
357 
358     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360   }
361   case Intrinsic::amdgcn_writelane: {
362     static const OpRegBankEntry<4> Table[4] = {
363       // Perfectly legal.
364       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365 
366       // Need readfirstlane of first op
367       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368 
369       // Need readfirstlane of second op
370       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371 
372       // Need readfirstlane of both ops
373       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374     };
375 
376     // rsrc, voffset, offset
377     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379   }
380   default:
381     return RegisterBankInfo::getInstrAlternativeMappings(MI);
382   }
383 }
384 
385 RegisterBankInfo::InstructionMappings
386 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
387     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388 
389   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
390   case Intrinsic::amdgcn_s_buffer_load: {
391     static const OpRegBankEntry<2> Table[4] = {
392       // Perfectly legal.
393       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394 
395       // Only need 1 register in loop
396       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397 
398       // Have to waterfall the resource.
399       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400 
401       // Have to waterfall the resource, and the offset.
402       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403     };
404 
405     // rsrc, offset
406     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408   }
409   case Intrinsic::amdgcn_ds_ordered_add:
410   case Intrinsic::amdgcn_ds_ordered_swap: {
411     // VGPR = M0, VGPR
412     static const OpRegBankEntry<3> Table[2] = {
413       // Perfectly legal.
414       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
415 
416       // Need a readfirstlane for m0
417       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418     };
419 
420     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422   }
423   case Intrinsic::amdgcn_s_sendmsg:
424   case Intrinsic::amdgcn_s_sendmsghalt: {
425     // FIXME: Should have no register for immediate
426     static const OpRegBankEntry<1> Table[2] = {
427       // Perfectly legal.
428       { { AMDGPU::SGPRRegBankID }, 1 },
429 
430       // Need readlane
431       { { AMDGPU::VGPRRegBankID }, 3 }
432     };
433 
434     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436   }
437   default:
438     return RegisterBankInfo::getInstrAlternativeMappings(MI);
439   }
440 }
441 
442 // FIXME: Returns uniform if there's no source value information. This is
443 // probably wrong.
444 bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
445   if (!MI.hasOneMemOperand())
446     return false;
447 
448   const MachineMemOperand *MMO = *MI.memoperands_begin();
449   const unsigned AS = MMO->getAddrSpace();
450   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452   const unsigned MemSize = 8 * MMO->getSize().getValue();
453 
454   // Require 4-byte alignment.
455   return (MMO->getAlign() >= Align(4) ||
456           (Subtarget.hasScalarSubwordLoads() &&
457            ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
458             (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
459          // Can't do a scalar atomic load.
460          !MMO->isAtomic() &&
461          // Don't use scalar loads for volatile accesses to non-constant address
462          // spaces.
463          (IsConst || !MMO->isVolatile()) &&
464          // Memory must be known constant, or not written before this load.
465          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
466          AMDGPUInstrInfo::isUniformMMO(MMO);
467 }
468 
469 RegisterBankInfo::InstructionMappings
470 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
471     const MachineInstr &MI) const {
472 
473   const MachineFunction &MF = *MI.getParent()->getParent();
474   const MachineRegisterInfo &MRI = MF.getRegInfo();
475 
476 
477   InstructionMappings AltMappings;
478   switch (MI.getOpcode()) {
479   case TargetOpcode::G_CONSTANT:
480   case TargetOpcode::G_IMPLICIT_DEF: {
481     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
482     if (Size == 1) {
483       static const OpRegBankEntry<1> Table[3] = {
484         { { AMDGPU::VGPRRegBankID }, 1 },
485         { { AMDGPU::SGPRRegBankID }, 1 },
486         { { AMDGPU::VCCRegBankID }, 1 }
487       };
488 
489       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490     }
491 
492     [[fallthrough]];
493   }
494   case TargetOpcode::G_FCONSTANT:
495   case TargetOpcode::G_FRAME_INDEX:
496   case TargetOpcode::G_GLOBAL_VALUE: {
497     static const OpRegBankEntry<1> Table[2] = {
498       { { AMDGPU::VGPRRegBankID }, 1 },
499       { { AMDGPU::SGPRRegBankID }, 1 }
500     };
501 
502     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
503   }
504   case TargetOpcode::G_AND:
505   case TargetOpcode::G_OR:
506   case TargetOpcode::G_XOR: {
507     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
508 
509     if (Size == 1) {
510       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
511       const InstructionMapping &SCCMapping = getInstructionMapping(
512         1, 1, getOperandsMapping(
513           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
514            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
515            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
516         3); // Num Operands
517       AltMappings.push_back(&SCCMapping);
518 
519       const InstructionMapping &VCCMapping0 = getInstructionMapping(
520         2, 1, getOperandsMapping(
521           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
522            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
523            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
524         3); // Num Operands
525       AltMappings.push_back(&VCCMapping0);
526       return AltMappings;
527     }
528 
529     if (Size != 64)
530       break;
531 
532     const InstructionMapping &SSMapping = getInstructionMapping(
533       1, 1, getOperandsMapping(
534         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
535          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
536          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
537       3); // Num Operands
538     AltMappings.push_back(&SSMapping);
539 
540     const InstructionMapping &VVMapping = getInstructionMapping(
541       2, 2, getOperandsMapping(
542         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
543          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
544          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
545       3); // Num Operands
546     AltMappings.push_back(&VVMapping);
547     break;
548   }
549   case TargetOpcode::G_LOAD:
550   case TargetOpcode::G_ZEXTLOAD:
551   case TargetOpcode::G_SEXTLOAD: {
552     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
553     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
554     unsigned PtrSize = PtrTy.getSizeInBits();
555     unsigned AS = PtrTy.getAddressSpace();
556 
557     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
558          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
559         isScalarLoadLegal(MI)) {
560       const InstructionMapping &SSMapping = getInstructionMapping(
561           1, 1, getOperandsMapping(
562                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
563                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
564           2); // Num Operands
565       AltMappings.push_back(&SSMapping);
566     }
567 
568     const InstructionMapping &VVMapping = getInstructionMapping(
569         2, 1,
570         getOperandsMapping(
571             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
572              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
573         2); // Num Operands
574     AltMappings.push_back(&VVMapping);
575 
576     // It may be possible to have a vgpr = load sgpr mapping here, because
577     // the mubuf instructions support this kind of load, but probably for only
578     // gfx7 and older.  However, the addressing mode matching in the instruction
579     // selector should be able to do a better job of detecting and selecting
580     // these kinds of loads from the vgpr = load vgpr mapping.
581 
582     return AltMappings;
583 
584   }
585   case TargetOpcode::G_SELECT: {
586     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
587     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
588       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
589                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
590                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
591                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
592       4); // Num Operands
593     AltMappings.push_back(&SSMapping);
594 
595     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
596       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
597                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
598                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
599                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
600       4); // Num Operands
601     AltMappings.push_back(&VVMapping);
602 
603     return AltMappings;
604   }
605   case TargetOpcode::G_UADDE:
606   case TargetOpcode::G_USUBE:
607   case TargetOpcode::G_SADDE:
608   case TargetOpcode::G_SSUBE: {
609     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
610     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
611       getOperandsMapping(
612         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
613          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
614          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
615          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
616          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
617       5); // Num Operands
618     AltMappings.push_back(&SSMapping);
619 
620     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
621       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
622                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
623                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
624                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
625                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
626       5); // Num Operands
627     AltMappings.push_back(&VVMapping);
628     return AltMappings;
629   }
630   case AMDGPU::G_BRCOND: {
631     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
632 
633     // TODO: Change type to 32 for scalar
634     const InstructionMapping &SMapping = getInstructionMapping(
635       1, 1, getOperandsMapping(
636         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
637       2); // Num Operands
638     AltMappings.push_back(&SMapping);
639 
640     const InstructionMapping &VMapping = getInstructionMapping(
641       1, 1, getOperandsMapping(
642         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
643       2); // Num Operands
644     AltMappings.push_back(&VMapping);
645     return AltMappings;
646   }
647   case AMDGPU::G_INTRINSIC:
648   case AMDGPU::G_INTRINSIC_CONVERGENT:
649     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
650   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
651   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
652     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
653   default:
654     break;
655   }
656   return RegisterBankInfo::getInstrAlternativeMappings(MI);
657 }
658 
659 void AMDGPURegisterBankInfo::split64BitValueForMapping(
660   MachineIRBuilder &B,
661   SmallVector<Register, 2> &Regs,
662   LLT HalfTy,
663   Register Reg) const {
664   assert(HalfTy.getSizeInBits() == 32);
665   MachineRegisterInfo *MRI = B.getMRI();
666   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
667   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
668   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
669   MRI->setRegBank(LoLHS, *Bank);
670   MRI->setRegBank(HiLHS, *Bank);
671 
672   Regs.push_back(LoLHS);
673   Regs.push_back(HiLHS);
674 
675   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
676     .addDef(LoLHS)
677     .addDef(HiLHS)
678     .addUse(Reg);
679 }
680 
681 /// Replace the current type each register in \p Regs has with \p NewTy
682 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
683                           LLT NewTy) {
684   for (Register Reg : Regs) {
685     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
686     MRI.setType(Reg, NewTy);
687   }
688 }
689 
690 static LLT getHalfSizedType(LLT Ty) {
691   if (Ty.isVector()) {
692     assert(Ty.getElementCount().isKnownMultipleOf(2));
693     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
694                                Ty.getElementType());
695   }
696 
697   assert(Ty.getScalarSizeInBits() % 2 == 0);
698   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
699 }
700 
701 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
702 // source value into a scalar register.
703 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
704                                                     MachineRegisterInfo &MRI,
705                                                     Register Src) const {
706   LLT Ty = MRI.getType(Src);
707   const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
708 
709   if (Bank == &AMDGPU::SGPRRegBank)
710     return Src;
711 
712   unsigned Bits = Ty.getSizeInBits();
713   assert(Bits % 32 == 0);
714 
715   if (Bank != &AMDGPU::VGPRRegBank) {
716     // We need to copy from AGPR to VGPR
717     Src = B.buildCopy(Ty, Src).getReg(0);
718     MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
719   }
720 
721   LLT S32 = LLT::scalar(32);
722   unsigned NumParts = Bits / 32;
723   SmallVector<Register, 8> SrcParts;
724   SmallVector<Register, 8> DstParts;
725 
726   if (Bits == 32) {
727     SrcParts.push_back(Src);
728   } else {
729     auto Unmerge = B.buildUnmerge(S32, Src);
730     for (unsigned i = 0; i < NumParts; ++i)
731       SrcParts.push_back(Unmerge.getReg(i));
732   }
733 
734   for (unsigned i = 0; i < NumParts; ++i) {
735     Register SrcPart = SrcParts[i];
736     Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
737     MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738 
739     const TargetRegisterClass *Constrained =
740         constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
741     (void)Constrained;
742     assert(Constrained && "Failed to constrain readfirstlane src reg");
743 
744     B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
745 
746     DstParts.push_back(DstPart);
747   }
748 
749   if (Bits == 32)
750     return DstParts[0];
751 
752   Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
753   MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
754   return Dst;
755 }
756 
757 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
758 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
759 /// execute the instruction for each unique combination of values in all lanes
760 /// in the wave. The block will be split such that rest of the instructions are
761 /// moved to a new block.
762 ///
763 /// Essentially performs this loop:
764 //
765 /// Save Execution Mask
766 /// For (Lane : Wavefront) {
767 ///   Enable Lane, Disable all other lanes
768 ///   SGPR = read SGPR value for current lane from VGPR
769 ///   VGPRResult[Lane] = use_op SGPR
770 /// }
771 /// Restore Execution Mask
772 ///
773 /// There is additional complexity to try for compare values to identify the
774 /// unique values used.
775 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
776     MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
777     SmallSet<Register, 4> &SGPROperandRegs) const {
778   // Track use registers which have already been expanded with a readfirstlane
779   // sequence. This may have multiple uses if moving a sequence.
780   DenseMap<Register, Register> WaterfalledRegMap;
781 
782   MachineBasicBlock &MBB = B.getMBB();
783   MachineFunction *MF = &B.getMF();
784 
785   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786   const unsigned MovExecOpc =
787       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788   const unsigned MovExecTermOpc =
789       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790 
791   const unsigned XorTermOpc = Subtarget.isWave32() ?
792     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
793   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
794     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
795   const unsigned ExecReg =  Subtarget.isWave32() ?
796     AMDGPU::EXEC_LO : AMDGPU::EXEC;
797 
798 #ifndef NDEBUG
799   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
800 #endif
801 
802   MachineRegisterInfo &MRI = *B.getMRI();
803   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
804   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
805 
806   // Don't bother using generic instructions/registers for the exec mask.
807   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
808     .addDef(InitSaveExecReg);
809 
810   Register PhiExec = MRI.createVirtualRegister(WaveRC);
811   Register NewExec = MRI.createVirtualRegister(WaveRC);
812 
813   // To insert the loop we need to split the block. Move everything before this
814   // point to a new block, and insert a new empty block before this instruction.
815   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
816   MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
817   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
818   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
819   MachineFunction::iterator MBBI(MBB);
820   ++MBBI;
821   MF->insert(MBBI, LoopBB);
822   MF->insert(MBBI, BodyBB);
823   MF->insert(MBBI, RestoreExecBB);
824   MF->insert(MBBI, RemainderBB);
825 
826   LoopBB->addSuccessor(BodyBB);
827   BodyBB->addSuccessor(RestoreExecBB);
828   BodyBB->addSuccessor(LoopBB);
829 
830   // Move the rest of the block into a new block.
831   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
832   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
833 
834   MBB.addSuccessor(LoopBB);
835   RestoreExecBB->addSuccessor(RemainderBB);
836 
837   B.setInsertPt(*LoopBB, LoopBB->end());
838 
839   B.buildInstr(TargetOpcode::PHI)
840       .addDef(PhiExec)
841       .addReg(InitSaveExecReg)
842       .addMBB(&MBB)
843       .addReg(NewExec)
844       .addMBB(BodyBB);
845 
846   const DebugLoc &DL = B.getDL();
847 
848   MachineInstr &FirstInst = *Range.begin();
849 
850   // Move the instruction into the loop body. Note we moved everything after
851   // Range.end() already into a new block, so Range.end() is no longer valid.
852   BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
853 
854   // Figure out the iterator range after splicing the instructions.
855   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
856   auto NewEnd = BodyBB->end();
857 
858   B.setMBB(*LoopBB);
859 
860   LLT S1 = LLT::scalar(1);
861   Register CondReg;
862 
863   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
864 
865   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
866     for (MachineOperand &Op : MI.all_uses()) {
867       Register OldReg = Op.getReg();
868       if (!SGPROperandRegs.count(OldReg))
869         continue;
870 
871       // See if we already processed this register in another instruction in the
872       // sequence.
873       auto OldVal = WaterfalledRegMap.find(OldReg);
874       if (OldVal != WaterfalledRegMap.end()) {
875         Op.setReg(OldVal->second);
876         continue;
877       }
878 
879       Register OpReg = Op.getReg();
880       LLT OpTy = MRI.getType(OpReg);
881 
882       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
883       if (OpBank != &AMDGPU::VGPRRegBank) {
884         // Insert copy from AGPR to VGPR before the loop.
885         B.setMBB(MBB);
886         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
887         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
888         B.setMBB(*LoopBB);
889       }
890 
891       Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
892 
893       // Build the comparison(s).
894       unsigned OpSize = OpTy.getSizeInBits();
895       bool Is64 = OpSize % 64 == 0;
896       unsigned PartSize = Is64 ? 64 : 32;
897       LLT PartTy = LLT::scalar(PartSize);
898       unsigned NumParts = OpSize / PartSize;
899       SmallVector<Register, 8> OpParts;
900       SmallVector<Register, 8> CurrentLaneParts;
901 
902       if (NumParts == 1) {
903         OpParts.push_back(OpReg);
904         CurrentLaneParts.push_back(CurrentLaneReg);
905       } else {
906         auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
907         auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
908         for (unsigned i = 0; i < NumParts; ++i) {
909           OpParts.push_back(UnmergeOp.getReg(i));
910           CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
911           MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
912           MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
913         }
914       }
915 
916       for (unsigned i = 0; i < NumParts; ++i) {
917         auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
918                                   OpParts[i]).getReg(0);
919         MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
920 
921         if (!CondReg) {
922           CondReg = CmpReg;
923         } else {
924           CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
925           MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
926         }
927       }
928 
929       Op.setReg(CurrentLaneReg);
930 
931       // Make sure we don't re-process this register again.
932       WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
933     }
934   }
935 
936   // The ballot becomes a no-op during instruction selection.
937   CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
938                              {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
939                 .addReg(CondReg)
940                 .getReg(0);
941   MRI.setRegClass(CondReg, WaveRC);
942 
943   // Update EXEC, save the original EXEC value to VCC.
944   B.buildInstr(AndSaveExecOpc)
945     .addDef(NewExec)
946     .addReg(CondReg, RegState::Kill);
947 
948   MRI.setSimpleHint(NewExec, CondReg);
949 
950   B.setInsertPt(*BodyBB, BodyBB->end());
951 
952   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
953   B.buildInstr(XorTermOpc)
954     .addDef(ExecReg)
955     .addReg(ExecReg)
956     .addReg(NewExec);
957 
958   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
959   // s_cbranch_scc0?
960 
961   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
962   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
963 
964   // Save the EXEC mask before the loop.
965   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
966     .addReg(ExecReg);
967 
968   // Restore the EXEC mask after the loop.
969   B.setMBB(*RestoreExecBB);
970   B.buildInstr(MovExecTermOpc)
971     .addDef(ExecReg)
972     .addReg(SaveExecReg);
973 
974   // Set the insert point after the original instruction, so any new
975   // instructions will be in the remainder.
976   B.setInsertPt(*RemainderBB, RemainderBB->begin());
977 
978   return true;
979 }
980 
981 // Return any unique registers used by \p MI at \p OpIndices that need to be
982 // handled in a waterfall loop. Returns these registers in \p
983 // SGPROperandRegs. Returns true if there are any operands to handle and a
984 // waterfall loop is necessary.
985 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
986   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
987   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
988   for (unsigned Op : OpIndices) {
989     assert(MI.getOperand(Op).isUse());
990     Register Reg = MI.getOperand(Op).getReg();
991     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
992     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
993       SGPROperandRegs.insert(Reg);
994   }
995 
996   // No operands need to be replaced, so no need to loop.
997   return !SGPROperandRegs.empty();
998 }
999 
1000 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1001     MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
1002   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1003   // are the same register.
1004   SmallSet<Register, 4> SGPROperandRegs;
1005 
1006   if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1007     return false;
1008 
1009   MachineBasicBlock::iterator I = MI.getIterator();
1010   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1011                                 SGPROperandRegs);
1012 }
1013 
1014 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1015 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1016     MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1017   Register Reg = MI.getOperand(OpIdx).getReg();
1018   MachineRegisterInfo &MRI = *B.getMRI();
1019   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1020   if (Bank == &AMDGPU::SGPRRegBank)
1021     return;
1022 
1023   Reg = buildReadFirstLane(B, MRI, Reg);
1024   MI.getOperand(OpIdx).setReg(Reg);
1025 }
1026 
1027 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1028 /// rest will be in the remainder.
1029 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1030   unsigned TotalSize = Ty.getSizeInBits();
1031   if (!Ty.isVector())
1032     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1033 
1034   LLT EltTy = Ty.getElementType();
1035   unsigned EltSize = EltTy.getSizeInBits();
1036   assert(FirstSize % EltSize == 0);
1037 
1038   unsigned FirstPartNumElts = FirstSize / EltSize;
1039   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1040 
1041   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1042           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1043 }
1044 
1045 static LLT widen96To128(LLT Ty) {
1046   if (!Ty.isVector())
1047     return LLT::scalar(128);
1048 
1049   LLT EltTy = Ty.getElementType();
1050   assert(128 % EltTy.getSizeInBits() == 0);
1051   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1052 }
1053 
1054 bool AMDGPURegisterBankInfo::applyMappingLoad(
1055     MachineIRBuilder &B,
1056     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1057     MachineInstr &MI) const {
1058   MachineRegisterInfo &MRI = *B.getMRI();
1059   Register DstReg = MI.getOperand(0).getReg();
1060   const LLT LoadTy = MRI.getType(DstReg);
1061   unsigned LoadSize = LoadTy.getSizeInBits();
1062   MachineMemOperand *MMO = *MI.memoperands_begin();
1063   const unsigned MaxNonSmrdLoadSize = 128;
1064 
1065   const RegisterBank *DstBank =
1066       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1067   if (DstBank == &AMDGPU::SGPRRegBank) {
1068     // There are some special cases that we need to look at for 32 bit and 96
1069     // bit SGPR loads otherwise we have nothing to do.
1070     if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1071       return false;
1072 
1073     const unsigned MemSize = 8 * MMO->getSize().getValue();
1074     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1075     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1076     // scalar loads should have a load size of 32 but memory access size of less
1077     // than 32.
1078     if (LoadSize == 32 &&
1079         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1080       return false;
1081 
1082     if (LoadSize == 32 &&
1083         ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1084          (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1085         isScalarLoadLegal(MI) &&
1086         Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
1087       return false;
1088 
1089     Register PtrReg = MI.getOperand(1).getReg();
1090 
1091     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1092 
1093     if (LoadSize == 32) {
1094       // This is an extending load from a sub-dword size. Widen the memory
1095       // access size to 4 bytes and clear the extra high bits appropriately
1096       const LLT S32 = LLT::scalar(32);
1097       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1098         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1099         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1100         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1101       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1102         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1103         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1104         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1105       } else
1106         // We do not need to touch the higher bits for regular loads.
1107         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1108     } else {
1109       // 96-bit loads are only available for vector loads. We need to split this
1110       // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1111       if (MMO->getAlign() < Align(16)) {
1112         LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1113         LLT Part64, Part32;
1114         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1115         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1116             LegalizerHelper::Legalized)
1117           return false;
1118         return true;
1119       }
1120       LLT WiderTy = widen96To128(LoadTy);
1121       auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1122       if (WiderTy.isScalar()) {
1123         B.buildTrunc(MI.getOperand(0), WideLoad);
1124       } else {
1125         B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1126                                             WideLoad);
1127       }
1128     }
1129 
1130     MI.eraseFromParent();
1131     return true;
1132   }
1133 
1134   // 128-bit loads are supported for all instruction types.
1135   if (LoadSize <= MaxNonSmrdLoadSize)
1136     return false;
1137 
1138   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1139   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1140 
1141   if (SrcRegs.empty())
1142     SrcRegs.push_back(MI.getOperand(1).getReg());
1143 
1144   // RegBankSelect only emits scalar types, so we need to reset the pointer
1145   // operand to a pointer type.
1146   Register BasePtrReg = SrcRegs[0];
1147   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1148   MRI.setType(BasePtrReg, PtrTy);
1149 
1150   // The following are the loads not splitted enough during legalization
1151   // because it was not clear they are smem-load or vmem-load
1152   if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
1153       MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
1154     assert(LoadSize % MaxNonSmrdLoadSize == 0);
1155     unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1156     const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1157     ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1158     LegalizerHelper Helper(B.getMF(), O, B);
1159     if (LoadTy.isVector()) {
1160       if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
1161           LegalizerHelper::Legalized)
1162         return false;
1163     } else {
1164       if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1165         return false;
1166     }
1167   }
1168 
1169   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1170   return true;
1171 }
1172 
1173 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1174     MachineIRBuilder &B,
1175     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1176     MachineInstr &MI) const {
1177   MachineRegisterInfo &MRI = *B.getMRI();
1178   const MachineFunction &MF = B.getMF();
1179   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1180   const auto &TFI = *ST.getFrameLowering();
1181 
1182   // Guard in case the stack growth direction ever changes with scratch
1183   // instructions.
1184   assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
1185          "Stack grows upwards for AMDGPU");
1186 
1187   Register Dst = MI.getOperand(0).getReg();
1188   Register AllocSize = MI.getOperand(1).getReg();
1189   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1190 
1191   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1192 
1193   if (SizeBank != &AMDGPU::SGPRRegBank) {
1194     auto WaveReduction =
1195         B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
1196             .addUse(AllocSize)
1197             .addImm(0);
1198     AllocSize = WaveReduction.getReg(0);
1199   }
1200 
1201   LLT PtrTy = MRI.getType(Dst);
1202   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1203 
1204   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1205   Register SPReg = Info->getStackPtrOffsetReg();
1206   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1207 
1208   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1209   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1210 
1211   auto OldSP = B.buildCopy(PtrTy, SPReg);
1212   if (Alignment > TFI.getStackAlign()) {
1213     auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
1214     auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
1215                               B.buildConstant(LLT::scalar(32), StackAlignMask));
1216     B.buildMaskLowPtrBits(Dst, Tmp1,
1217                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1218   } else {
1219     B.buildCopy(Dst, OldSP);
1220   }
1221   auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
1222   B.buildCopy(SPReg, PtrAdd);
1223   MI.eraseFromParent();
1224   return true;
1225 }
1226 
1227 bool AMDGPURegisterBankInfo::applyMappingImage(
1228     MachineIRBuilder &B, MachineInstr &MI,
1229     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1230     int RsrcIdx) const {
1231   const int NumDefs = MI.getNumExplicitDefs();
1232 
1233   // The reported argument index is relative to the IR intrinsic call arguments,
1234   // so we need to shift by the number of defs and the intrinsic ID.
1235   RsrcIdx += NumDefs + 1;
1236 
1237   // Insert copies to VGPR arguments.
1238   applyDefaultMapping(OpdMapper);
1239 
1240   // Fixup any SGPR arguments.
1241   SmallVector<unsigned, 4> SGPRIndexes;
1242   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1243     if (!MI.getOperand(I).isReg())
1244       continue;
1245 
1246     // If this intrinsic has a sampler, it immediately follows rsrc.
1247     if (I == RsrcIdx || I == RsrcIdx + 1)
1248       SGPRIndexes.push_back(I);
1249   }
1250 
1251   executeInWaterfallLoop(B, MI, SGPRIndexes);
1252   return true;
1253 }
1254 
1255 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1256 // the three offsets (voffset, soffset and instoffset)
1257 unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1258     MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1259     Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1260   const LLT S32 = LLT::scalar(32);
1261   MachineRegisterInfo *MRI = B.getMRI();
1262 
1263   if (std::optional<int64_t> Imm =
1264           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1265     uint32_t SOffset, ImmOffset;
1266     if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1267       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1268       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1269       InstOffsetVal = ImmOffset;
1270 
1271       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1272       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1273       return SOffset + ImmOffset;
1274     }
1275   }
1276 
1277   Register Base;
1278   unsigned Offset;
1279 
1280   std::tie(Base, Offset) =
1281       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1282 
1283   uint32_t SOffset, ImmOffset;
1284   if ((int)Offset > 0 &&
1285       TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1286     if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1287       VOffsetReg = Base;
1288       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1289       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1290       InstOffsetVal = ImmOffset;
1291       return 0; // XXX - Why is this 0?
1292     }
1293 
1294     // If we have SGPR base, we can use it for soffset.
1295     if (SOffset == 0) {
1296       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1297       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1298       SOffsetReg = Base;
1299       InstOffsetVal = ImmOffset;
1300       return 0; // XXX - Why is this 0?
1301     }
1302   }
1303 
1304   // Handle the variable sgpr + vgpr case.
1305   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1306   if (Add && (int)Offset >= 0) {
1307     Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1308     Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1309 
1310     const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1311     const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1312 
1313     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1314       VOffsetReg = Src0;
1315       SOffsetReg = Src1;
1316       return 0;
1317     }
1318 
1319     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1320       VOffsetReg = Src1;
1321       SOffsetReg = Src0;
1322       return 0;
1323     }
1324   }
1325 
1326   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1327   // have an SGPR offset and a VGPR resource.
1328   if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1329     VOffsetReg = CombinedOffset;
1330   } else {
1331     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1332     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1333   }
1334 
1335   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1336   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1337   return 0;
1338 }
1339 
1340 static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
1341   switch (Opc) {
1342   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
1343     return AMDGPU::G_AMDGPU_BUFFER_LOAD;
1344   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
1345     return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
1346   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
1347     return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
1348   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
1349     return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
1350   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
1351     return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
1352   default:
1353     break;
1354   }
1355   llvm_unreachable("Unexpected s_buffer_load opcode");
1356 }
1357 
1358 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1359     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1360   MachineInstr &MI = OpdMapper.getMI();
1361   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1362 
1363   const LLT S32 = LLT::scalar(32);
1364   Register Dst = MI.getOperand(0).getReg();
1365   LLT Ty = MRI.getType(Dst);
1366 
1367   const RegisterBank *RSrcBank =
1368     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1369   const RegisterBank *OffsetBank =
1370     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1371   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1372       OffsetBank == &AMDGPU::SGPRRegBank)
1373     return true; // Legal mapping
1374 
1375   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1376   // here but don't have an MMO.
1377 
1378   unsigned LoadSize = Ty.getSizeInBits();
1379   int NumLoads = 1;
1380   if (LoadSize == 256 || LoadSize == 512) {
1381     NumLoads = LoadSize / 128;
1382     Ty = Ty.divide(NumLoads);
1383   }
1384 
1385   // Use the alignment to ensure that the required offsets will fit into the
1386   // immediate offsets.
1387   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1388 
1389   MachineFunction &MF = B.getMF();
1390 
1391   Register SOffset;
1392   Register VOffset;
1393   int64_t ImmOffset = 0;
1394 
1395   unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1396                                         SOffset, ImmOffset, Alignment);
1397 
1398   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1399   // can, but we need to track an MMO for that.
1400   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1401   const Align MemAlign(4); // FIXME: ABI type alignment?
1402   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1403     MachinePointerInfo(),
1404     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1405     MachineMemOperand::MOInvariant,
1406     MemSize, MemAlign);
1407   if (MMOOffset != 0)
1408     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1409 
1410   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1411   // assume that the buffer is unswizzled.
1412 
1413   Register RSrc = MI.getOperand(1).getReg();
1414   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1415   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1416 
1417   SmallVector<Register, 4> LoadParts(NumLoads);
1418 
1419   MachineBasicBlock::iterator MII = MI.getIterator();
1420   MachineInstrSpan Span(MII, &B.getMBB());
1421 
1422   for (int i = 0; i < NumLoads; ++i) {
1423     if (NumLoads == 1) {
1424       LoadParts[i] = Dst;
1425     } else {
1426       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1427       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1428     }
1429 
1430     MachineMemOperand *MMO = BaseMMO;
1431     if (i != 0)
1432       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1433 
1434     B.buildInstr(getSBufferLoadCorrespondingBufferLoadOpcode(MI.getOpcode()))
1435         .addDef(LoadParts[i])       // vdata
1436         .addUse(RSrc)               // rsrc
1437         .addUse(VIndex)             // vindex
1438         .addUse(VOffset)            // voffset
1439         .addUse(SOffset)            // soffset
1440         .addImm(ImmOffset + 16 * i) // offset(imm)
1441         .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1442         .addImm(0)                  // idxen(imm)
1443         .addMemOperand(MMO);
1444   }
1445 
1446   // TODO: If only the resource is a VGPR, it may be better to execute the
1447   // scalar load in the waterfall loop if the resource is expected to frequently
1448   // be dynamically uniform.
1449   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1450     // Remove the original instruction to avoid potentially confusing the
1451     // waterfall loop logic.
1452     B.setInstr(*Span.begin());
1453     MI.eraseFromParent();
1454 
1455     SmallSet<Register, 4> OpsToWaterfall;
1456 
1457     OpsToWaterfall.insert(RSrc);
1458     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1459                            OpsToWaterfall);
1460   }
1461 
1462   if (NumLoads != 1) {
1463     if (Ty.isVector())
1464       B.buildConcatVectors(Dst, LoadParts);
1465     else
1466       B.buildMergeLikeInstr(Dst, LoadParts);
1467   }
1468 
1469   // We removed the instruction earlier with a waterfall loop.
1470   if (RSrcBank == &AMDGPU::SGPRRegBank)
1471     MI.eraseFromParent();
1472 
1473   return true;
1474 }
1475 
1476 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1477                                              const OperandsMapper &OpdMapper,
1478                                              bool Signed) const {
1479   MachineInstr &MI = OpdMapper.getMI();
1480   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1481 
1482   // Insert basic copies
1483   applyDefaultMapping(OpdMapper);
1484 
1485   Register DstReg = MI.getOperand(0).getReg();
1486   LLT Ty = MRI.getType(DstReg);
1487 
1488   const LLT S32 = LLT::scalar(32);
1489 
1490   unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1491   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1492   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1493   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1494 
1495   const RegisterBank *DstBank =
1496     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1497   if (DstBank == &AMDGPU::VGPRRegBank) {
1498     if (Ty == S32)
1499       return true;
1500 
1501     // There is no 64-bit vgpr bitfield extract instructions so the operation
1502     // is expanded to a sequence of instructions that implement the operation.
1503     ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1504 
1505     const LLT S64 = LLT::scalar(64);
1506     // Shift the source operand so that extracted bits start at bit 0.
1507     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1508                               : B.buildLShr(S64, SrcReg, OffsetReg);
1509     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1510 
1511     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1512     // if the width is a constant.
1513     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1514       // Use the 32-bit bitfield extract instruction if the width is a constant.
1515       // Depending on the width size, use either the low or high 32-bits.
1516       auto Zero = B.buildConstant(S32, 0);
1517       auto WidthImm = ConstWidth->Value.getZExtValue();
1518       if (WidthImm <= 32) {
1519         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1520         // or clear the upper 32-bits.
1521         auto Extract =
1522             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1523                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1524         auto Extend =
1525             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1526         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1527       } else {
1528         // Use bitfield extract on upper 32-bit source, and combine with lower
1529         // 32-bit source.
1530         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1531         auto Extract =
1532             Signed
1533                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1534                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1535         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1536       }
1537       MI.eraseFromParent();
1538       return true;
1539     }
1540 
1541     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1542     // operations.
1543     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1544     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1545     if (Signed)
1546       B.buildAShr(S64, SignBit, ExtShift);
1547     else
1548       B.buildLShr(S64, SignBit, ExtShift);
1549     MI.eraseFromParent();
1550     return true;
1551   }
1552 
1553   // The scalar form packs the offset and width in a single operand.
1554 
1555   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1556 
1557   // Ensure the high bits are clear to insert the offset.
1558   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1559   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1560 
1561   // Zeros out the low bits, so don't bother clamping the input value.
1562   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1563 
1564   // Transformation function, pack the offset and width of a BFE into
1565   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1566   // source, bits [5:0] contain the offset and bits [22:16] the width.
1567   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1568 
1569   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1570   // register class constraints.
1571   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1572                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1573 
1574   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1575   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1576     llvm_unreachable("failed to constrain BFE");
1577 
1578   MI.eraseFromParent();
1579   return true;
1580 }
1581 
1582 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1583     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1584   MachineInstr &MI = OpdMapper.getMI();
1585   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1586 
1587   // Insert basic copies.
1588   applyDefaultMapping(OpdMapper);
1589 
1590   Register Dst0 = MI.getOperand(0).getReg();
1591   Register Dst1 = MI.getOperand(1).getReg();
1592   Register Src0 = MI.getOperand(2).getReg();
1593   Register Src1 = MI.getOperand(3).getReg();
1594   Register Src2 = MI.getOperand(4).getReg();
1595 
1596   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1597     return true;
1598 
1599   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1600   LLT S1 = LLT::scalar(1);
1601   LLT S32 = LLT::scalar(32);
1602 
1603   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1604   bool Accumulate = true;
1605 
1606   if (!DstOnValu) {
1607     if (mi_match(Src2, MRI, m_ZeroInt()))
1608       Accumulate = false;
1609   }
1610 
1611   // Keep the multiplication on the SALU.
1612   Register DstHi;
1613   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1614   bool MulHiInVgpr = false;
1615 
1616   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1617 
1618   if (Subtarget.hasSMulHi()) {
1619     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1620                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1621     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1622   } else {
1623     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1624     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1625 
1626     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1627     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1628 
1629     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1630                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1631     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1632 
1633     if (!DstOnValu) {
1634       DstHi = buildReadFirstLane(B, MRI, DstHi);
1635     } else {
1636       MulHiInVgpr = true;
1637     }
1638   }
1639 
1640   // Accumulate and produce the "carry-out" bit.
1641   //
1642   // The "carry-out" is defined as bit 64 of the result when computed as a
1643   // big integer. For unsigned multiply-add, this matches the usual definition
1644   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1645   // result, which is determined as:
1646   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1647   LLT CarryType = DstOnValu ? S1 : S32;
1648   const RegisterBank &CarryBank =
1649       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1650   const RegisterBank &DstBank =
1651       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1652   Register Carry;
1653   Register Zero;
1654 
1655   if (!IsUnsigned) {
1656     Zero = B.buildConstant(S32, 0).getReg(0);
1657     MRI.setRegBank(Zero,
1658                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1659 
1660     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1661                 .getReg(0);
1662     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1663                                       : AMDGPU::SGPRRegBank);
1664 
1665     if (DstOnValu && !MulHiInVgpr) {
1666       Carry = B.buildTrunc(S1, Carry).getReg(0);
1667       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1668     }
1669   }
1670 
1671   if (Accumulate) {
1672     if (DstOnValu) {
1673       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1674       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1675       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1676       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1677     }
1678 
1679     auto Unmerge = B.buildUnmerge(S32, Src2);
1680     Register Src2Lo = Unmerge.getReg(0);
1681     Register Src2Hi = Unmerge.getReg(1);
1682     MRI.setRegBank(Src2Lo, DstBank);
1683     MRI.setRegBank(Src2Hi, DstBank);
1684 
1685     if (!IsUnsigned) {
1686       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1687       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1688 
1689       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1690       MRI.setRegBank(Carry, CarryBank);
1691     }
1692 
1693     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1694     DstLo = AddLo.getReg(0);
1695     Register CarryLo = AddLo.getReg(1);
1696     MRI.setRegBank(DstLo, DstBank);
1697     MRI.setRegBank(CarryLo, CarryBank);
1698 
1699     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1700     DstHi = AddHi.getReg(0);
1701     MRI.setRegBank(DstHi, DstBank);
1702 
1703     Register CarryHi = AddHi.getReg(1);
1704     MRI.setRegBank(CarryHi, CarryBank);
1705 
1706     if (IsUnsigned) {
1707       Carry = CarryHi;
1708     } else {
1709       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1710       MRI.setRegBank(Carry, CarryBank);
1711     }
1712   } else {
1713     if (IsUnsigned) {
1714       Carry = B.buildConstant(CarryType, 0).getReg(0);
1715       MRI.setRegBank(Carry, CarryBank);
1716     }
1717   }
1718 
1719   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1720 
1721   if (DstOnValu) {
1722     B.buildCopy(Dst1, Carry);
1723   } else {
1724     B.buildTrunc(Dst1, Carry);
1725   }
1726 
1727   MI.eraseFromParent();
1728   return true;
1729 }
1730 
1731 // Return a suitable opcode for extending the operands of Opc when widening.
1732 static unsigned getExtendOp(unsigned Opc) {
1733   switch (Opc) {
1734   case TargetOpcode::G_ASHR:
1735   case TargetOpcode::G_SMIN:
1736   case TargetOpcode::G_SMAX:
1737     return TargetOpcode::G_SEXT;
1738   case TargetOpcode::G_LSHR:
1739   case TargetOpcode::G_UMIN:
1740   case TargetOpcode::G_UMAX:
1741     return TargetOpcode::G_ZEXT;
1742   default:
1743     return TargetOpcode::G_ANYEXT;
1744   }
1745 }
1746 
1747 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1748 // any illegal vector extend or unmerge operations.
1749 static std::pair<Register, Register>
1750 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1751   const LLT S32 = LLT::scalar(32);
1752   auto Bitcast = B.buildBitcast(S32, Src);
1753 
1754   if (ExtOpcode == TargetOpcode::G_SEXT) {
1755     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1756     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1757     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1758   }
1759 
1760   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1761   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1762     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1763     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1764   }
1765 
1766   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1767   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1768 }
1769 
1770 // For cases where only a single copy is inserted for matching register banks.
1771 // Replace the register in the instruction operand
1772 static bool substituteSimpleCopyRegs(
1773   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1774   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1775   if (!SrcReg.empty()) {
1776     assert(SrcReg.size() == 1);
1777     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1778     return true;
1779   }
1780 
1781   return false;
1782 }
1783 
1784 /// Handle register layout difference for f16 images for some subtargets.
1785 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1786                                                 MachineRegisterInfo &MRI,
1787                                                 Register Reg) const {
1788   if (!Subtarget.hasUnpackedD16VMem())
1789     return Reg;
1790 
1791   const LLT S16 = LLT::scalar(16);
1792   LLT StoreVT = MRI.getType(Reg);
1793   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1794     return Reg;
1795 
1796   auto Unmerge = B.buildUnmerge(S16, Reg);
1797 
1798 
1799   SmallVector<Register, 4> WideRegs;
1800   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1801     WideRegs.push_back(Unmerge.getReg(I));
1802 
1803   const LLT S32 = LLT::scalar(32);
1804   int NumElts = StoreVT.getNumElements();
1805 
1806   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1807       .getReg(0);
1808 }
1809 
1810 static std::pair<Register, unsigned>
1811 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1812   int64_t Const;
1813   if (mi_match(Reg, MRI, m_ICst(Const)))
1814     return std::pair(Register(), Const);
1815 
1816   Register Base;
1817   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1818     return std::pair(Base, Const);
1819 
1820   // TODO: Handle G_OR used for add case
1821   return std::pair(Reg, 0);
1822 }
1823 
1824 std::pair<Register, unsigned>
1825 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1826                                            Register OrigOffset) const {
1827   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1828   Register BaseReg;
1829   unsigned ImmOffset;
1830   const LLT S32 = LLT::scalar(32);
1831 
1832   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1833   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1834                                                            OrigOffset);
1835 
1836   unsigned C1 = 0;
1837   if (ImmOffset != 0) {
1838     // If the immediate value is too big for the immoffset field, put only bits
1839     // that would normally fit in the immoffset field. The remaining value that
1840     // is copied/added for the voffset field is a large power of 2, and it
1841     // stands more chance of being CSEd with the copy/add for another similar
1842     // load/store.
1843     // However, do not do that rounding down if that is a negative
1844     // number, as it appears to be illegal to have a negative offset in the
1845     // vgpr, even if adding the immediate offset makes it positive.
1846     unsigned Overflow = ImmOffset & ~MaxImm;
1847     ImmOffset -= Overflow;
1848     if ((int32_t)Overflow < 0) {
1849       Overflow += ImmOffset;
1850       ImmOffset = 0;
1851     }
1852 
1853     C1 = ImmOffset;
1854     if (Overflow != 0) {
1855       if (!BaseReg)
1856         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1857       else {
1858         auto OverflowVal = B.buildConstant(S32, Overflow);
1859         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1860       }
1861     }
1862   }
1863 
1864   if (!BaseReg)
1865     BaseReg = B.buildConstant(S32, 0).getReg(0);
1866 
1867   return {BaseReg, C1};
1868 }
1869 
1870 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1871                                         Register SrcReg) const {
1872   MachineRegisterInfo &MRI = *B.getMRI();
1873   LLT SrcTy = MRI.getType(SrcReg);
1874   if (SrcTy.getSizeInBits() == 32) {
1875     // Use a v_mov_b32 here to make the exec dependency explicit.
1876     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1877       .addDef(DstReg)
1878       .addUse(SrcReg);
1879     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1880            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1881   }
1882 
1883   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1885 
1886   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1887     .addDef(TmpReg0)
1888     .addUse(SrcReg, 0, AMDGPU::sub0);
1889   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1890     .addDef(TmpReg1)
1891     .addUse(SrcReg, 0, AMDGPU::sub1);
1892   B.buildInstr(AMDGPU::REG_SEQUENCE)
1893     .addDef(DstReg)
1894     .addUse(TmpReg0)
1895     .addImm(AMDGPU::sub0)
1896     .addUse(TmpReg1)
1897     .addImm(AMDGPU::sub1);
1898 
1899   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1900          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1901 }
1902 
1903 /// Utility function for pushing dynamic vector indexes with a constant offset
1904 /// into waterfall loops.
1905 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1906                                    MachineInstr &IdxUseInstr,
1907                                    unsigned OpIdx,
1908                                    unsigned ConstOffset) {
1909   MachineRegisterInfo &MRI = *B.getMRI();
1910   const LLT S32 = LLT::scalar(32);
1911   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1912   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1913 
1914   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1915 
1916   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1917   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1918   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1919   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1920 }
1921 
1922 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1923 /// original 32-bit source value (to be inserted in the low part of the combined
1924 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1925 /// value.
1926 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1927                                   Register Hi32Reg, Register Lo32Reg,
1928                                   unsigned ExtOpc,
1929                                   const RegisterBank &RegBank,
1930                                   bool IsBooleanSrc = false) {
1931   if (ExtOpc == AMDGPU::G_ZEXT) {
1932     B.buildConstant(Hi32Reg, 0);
1933   } else if (ExtOpc == AMDGPU::G_SEXT) {
1934     if (IsBooleanSrc) {
1935       // If we know the original source was an s1, the high half is the same as
1936       // the low.
1937       B.buildCopy(Hi32Reg, Lo32Reg);
1938     } else {
1939       // Replicate sign bit from 32-bit extended part.
1940       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1941       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1942       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1943     }
1944   } else {
1945     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1946     B.buildUndef(Hi32Reg);
1947   }
1948 }
1949 
1950 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1951     MachineIRBuilder &B, MachineInstr &MI,
1952     const OperandsMapper &OpdMapper) const {
1953   MachineRegisterInfo &MRI = *B.getMRI();
1954 
1955   Register VecReg = MI.getOperand(1).getReg();
1956   Register Idx = MI.getOperand(2).getReg();
1957 
1958   const RegisterBank &IdxBank =
1959     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1960 
1961   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1962 
1963   LLT VecTy = MRI.getType(VecReg);
1964   unsigned EltSize = VecTy.getScalarSizeInBits();
1965   unsigned NumElem = VecTy.getNumElements();
1966 
1967   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1968                                                   IsDivergentIdx, &Subtarget))
1969     return false;
1970 
1971   LLT S32 = LLT::scalar(32);
1972 
1973   const RegisterBank &DstBank =
1974     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1975   const RegisterBank &SrcBank =
1976     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1977 
1978   const RegisterBank &CCBank =
1979     (DstBank == AMDGPU::SGPRRegBank &&
1980      SrcBank == AMDGPU::SGPRRegBank &&
1981      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1982                                      : AMDGPU::VCCRegBank;
1983   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1984 
1985   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1986     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1987     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1988   }
1989 
1990   LLT EltTy = VecTy.getScalarType();
1991   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1992   unsigned NumLanes = DstRegs.size();
1993   if (!NumLanes)
1994     NumLanes = 1;
1995   else
1996     EltTy = MRI.getType(DstRegs[0]);
1997 
1998   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1999   SmallVector<Register, 2> Res(NumLanes);
2000   for (unsigned L = 0; L < NumLanes; ++L)
2001     Res[L] = UnmergeToEltTy.getReg(L);
2002 
2003   for (unsigned I = 1; I < NumElem; ++I) {
2004     auto IC = B.buildConstant(S32, I);
2005     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2006     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2007     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2008 
2009     for (unsigned L = 0; L < NumLanes; ++L) {
2010       auto S = B.buildSelect(EltTy, Cmp,
2011                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
2012 
2013       for (unsigned N : { 0, 2, 3 })
2014         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2015 
2016       Res[L] = S->getOperand(0).getReg();
2017     }
2018   }
2019 
2020   for (unsigned L = 0; L < NumLanes; ++L) {
2021     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2022     B.buildCopy(DstReg, Res[L]);
2023     MRI.setRegBank(DstReg, DstBank);
2024   }
2025 
2026   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2027   MI.eraseFromParent();
2028 
2029   return true;
2030 }
2031 
2032 // Insert a cross regbank copy for a register if it already has a bank that
2033 // differs from the one we want to set.
2034 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2035                                    MachineIRBuilder &B, Register &Reg,
2036                                    const RegisterBank &Bank) {
2037   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2038   if (CurrBank && *CurrBank != Bank) {
2039     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2040     MRI.setRegBank(Copy, Bank);
2041     return Copy;
2042   }
2043 
2044   MRI.setRegBank(Reg, Bank);
2045   return Reg;
2046 }
2047 
2048 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2049     MachineIRBuilder &B, MachineInstr &MI,
2050     const OperandsMapper &OpdMapper) const {
2051 
2052   MachineRegisterInfo &MRI = *B.getMRI();
2053   Register VecReg = MI.getOperand(1).getReg();
2054   Register Idx = MI.getOperand(3).getReg();
2055 
2056   const RegisterBank &IdxBank =
2057     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2058 
2059   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2060 
2061   LLT VecTy = MRI.getType(VecReg);
2062   unsigned EltSize = VecTy.getScalarSizeInBits();
2063   unsigned NumElem = VecTy.getNumElements();
2064 
2065   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2066                                                   IsDivergentIdx, &Subtarget))
2067     return false;
2068 
2069   LLT S32 = LLT::scalar(32);
2070 
2071   const RegisterBank &DstBank =
2072     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2073   const RegisterBank &SrcBank =
2074     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2075   const RegisterBank &InsBank =
2076     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2077 
2078   const RegisterBank &CCBank =
2079     (DstBank == AMDGPU::SGPRRegBank &&
2080      SrcBank == AMDGPU::SGPRRegBank &&
2081      InsBank == AMDGPU::SGPRRegBank &&
2082      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2083                                      : AMDGPU::VCCRegBank;
2084   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2085 
2086   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2087     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2088     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2089   }
2090 
2091   LLT EltTy = VecTy.getScalarType();
2092   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2093   unsigned NumLanes = InsRegs.size();
2094   if (!NumLanes) {
2095     NumLanes = 1;
2096     InsRegs.push_back(MI.getOperand(2).getReg());
2097   } else {
2098     EltTy = MRI.getType(InsRegs[0]);
2099   }
2100 
2101   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2102   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2103 
2104   for (unsigned I = 0; I < NumElem; ++I) {
2105     auto IC = B.buildConstant(S32, I);
2106     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2107     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2108     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2109 
2110     for (unsigned L = 0; L < NumLanes; ++L) {
2111       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2112       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2113       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2114 
2115       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2116       MRI.setRegBank(Select, DstBank);
2117 
2118       Ops[I * NumLanes + L] = Select;
2119     }
2120   }
2121 
2122   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2123   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2124     B.buildBuildVector(MI.getOperand(0), Ops);
2125   } else {
2126     auto Vec = B.buildBuildVector(MergeTy, Ops);
2127     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2128     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2129   }
2130 
2131   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2132   MI.eraseFromParent();
2133 
2134   return true;
2135 }
2136 
2137 // Break s_mul_u64 into 32-bit vector operations.
2138 void AMDGPURegisterBankInfo::applyMappingSMULU64(
2139     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2140   SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2141   SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2142   SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2143 
2144   // All inputs are SGPRs, nothing special to do.
2145   if (DefRegs.empty()) {
2146     assert(Src0Regs.empty() && Src1Regs.empty());
2147     applyDefaultMapping(OpdMapper);
2148     return;
2149   }
2150 
2151   assert(DefRegs.size() == 2);
2152   assert(Src0Regs.size() == Src1Regs.size() &&
2153          (Src0Regs.empty() || Src0Regs.size() == 2));
2154 
2155   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2156   MachineInstr &MI = OpdMapper.getMI();
2157   Register DstReg = MI.getOperand(0).getReg();
2158   LLT HalfTy = LLT::scalar(32);
2159 
2160   // Depending on where the source registers came from, the generic code may
2161   // have decided to split the inputs already or not. If not, we still need to
2162   // extract the values.
2163 
2164   if (Src0Regs.empty())
2165     split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2166   else
2167     setRegsToType(MRI, Src0Regs, HalfTy);
2168 
2169   if (Src1Regs.empty())
2170     split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2171   else
2172     setRegsToType(MRI, Src1Regs, HalfTy);
2173 
2174   setRegsToType(MRI, DefRegs, HalfTy);
2175 
2176   // The multiplication is done as follows:
2177   //
2178   //                            Op1H  Op1L
2179   //                          * Op0H  Op0L
2180   //                       --------------------
2181   //                       Op1H*Op0L  Op1L*Op0L
2182   //          + Op1H*Op0H  Op1L*Op0H
2183   // -----------------------------------------
2184   // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
2185   //
2186   //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2187   //  value and that would overflow.
2188   //  The low 32-bit value is Op1L*Op0L.
2189   //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2190   //  Op1L*Op0L).
2191 
2192   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2193 
2194   Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
2195   Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
2196   Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
2197   Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
2198   B.buildAdd(DefRegs[1], Add, MulHiLo);
2199   B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
2200 
2201   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2202   MI.eraseFromParent();
2203 }
2204 
2205 void AMDGPURegisterBankInfo::applyMappingImpl(
2206     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2207   MachineInstr &MI = OpdMapper.getMI();
2208   B.setInstrAndDebugLoc(MI);
2209   unsigned Opc = MI.getOpcode();
2210   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2211   switch (Opc) {
2212   case AMDGPU::G_CONSTANT:
2213   case AMDGPU::G_IMPLICIT_DEF: {
2214     Register DstReg = MI.getOperand(0).getReg();
2215     LLT DstTy = MRI.getType(DstReg);
2216     if (DstTy != LLT::scalar(1))
2217       break;
2218 
2219     const RegisterBank *DstBank =
2220         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2221     if (DstBank == &AMDGPU::VCCRegBank)
2222       break;
2223     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2224     if (DefRegs.empty())
2225       DefRegs.push_back(DstReg);
2226 
2227     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2228 
2229     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2230     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2231 
2232     MI.getOperand(0).setReg(NewDstReg);
2233     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2234       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2235       MI.getOperand(1).setCImm(
2236           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2237     }
2238 
2239     MRI.setRegBank(NewDstReg, *DstBank);
2240     B.buildTrunc(DefRegs[0], NewDstReg);
2241     return;
2242   }
2243   case AMDGPU::G_PHI: {
2244     Register DstReg = MI.getOperand(0).getReg();
2245     LLT DstTy = MRI.getType(DstReg);
2246     if (DstTy != LLT::scalar(1))
2247       break;
2248 
2249     const LLT S32 = LLT::scalar(32);
2250     const RegisterBank *DstBank =
2251       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2252     if (DstBank == &AMDGPU::VCCRegBank) {
2253       applyDefaultMapping(OpdMapper);
2254       // The standard handling only considers the result register bank for
2255       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2256       // produce an invalid copy. We can only copy with some kind of compare to
2257       // get a vector boolean result. Insert a register bank copy that will be
2258       // correctly lowered to a compare.
2259       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2260         Register SrcReg = MI.getOperand(I).getReg();
2261         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2262 
2263         if (SrcBank != &AMDGPU::VCCRegBank) {
2264           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2265           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2266 
2267           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2268           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2269           MI.getOperand(I).setReg(Copy.getReg(0));
2270         }
2271       }
2272 
2273       return;
2274     }
2275 
2276     // Phi handling is strange and only considers the bank of the destination.
2277     substituteSimpleCopyRegs(OpdMapper, 0);
2278 
2279     // Promote SGPR/VGPR booleans to s32
2280     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2281     B.setInsertPt(B.getMBB(), MI);
2282     LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2283 
2284     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2285       llvm_unreachable("widen scalar should have succeeded");
2286 
2287     return;
2288   }
2289   case AMDGPU::G_FCMP:
2290     if (!Subtarget.hasSALUFloatInsts())
2291       break;
2292     [[fallthrough]];
2293   case AMDGPU::G_ICMP:
2294   case AMDGPU::G_UADDO:
2295   case AMDGPU::G_USUBO:
2296   case AMDGPU::G_UADDE:
2297   case AMDGPU::G_SADDE:
2298   case AMDGPU::G_USUBE:
2299   case AMDGPU::G_SSUBE: {
2300     unsigned BoolDstOp =
2301         (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2302     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2303 
2304     const RegisterBank *DstBank =
2305       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2306     if (DstBank != &AMDGPU::SGPRRegBank)
2307       break;
2308 
2309     const bool HasCarryIn = MI.getNumOperands() == 5;
2310 
2311     // If this is a scalar compare, promote the result to s32, as the selection
2312     // will end up using a copy to a 32-bit vreg.
2313     const LLT S32 = LLT::scalar(32);
2314     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2315     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2316     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2317 
2318     if (HasCarryIn) {
2319       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2320       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2321       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2322       MI.getOperand(4).setReg(NewSrcReg);
2323     }
2324 
2325     MachineBasicBlock *MBB = MI.getParent();
2326     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2327 
2328     // If we had a constrained VCC result register, a copy was inserted to VCC
2329     // from SGPR.
2330     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2331     if (DefRegs.empty())
2332       DefRegs.push_back(DstReg);
2333     B.buildTrunc(DefRegs[0], NewDstReg);
2334     return;
2335   }
2336   case AMDGPU::G_SELECT: {
2337     Register DstReg = MI.getOperand(0).getReg();
2338     LLT DstTy = MRI.getType(DstReg);
2339 
2340     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2341     if (CondRegs.empty())
2342       CondRegs.push_back(MI.getOperand(1).getReg());
2343     else {
2344       assert(CondRegs.size() == 1);
2345     }
2346 
2347     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2348     if (CondBank == &AMDGPU::SGPRRegBank) {
2349       const LLT S32 = LLT::scalar(32);
2350       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2351       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2352 
2353       MI.getOperand(1).setReg(NewCondReg);
2354       B.buildZExt(NewCondReg, CondRegs[0]);
2355     }
2356 
2357     if (DstTy.getSizeInBits() != 64)
2358       break;
2359 
2360     LLT HalfTy = getHalfSizedType(DstTy);
2361 
2362     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2363     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2364     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2365 
2366     // All inputs are SGPRs, nothing special to do.
2367     if (DefRegs.empty()) {
2368       assert(Src1Regs.empty() && Src2Regs.empty());
2369       break;
2370     }
2371 
2372     if (Src1Regs.empty())
2373       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2374     else {
2375       setRegsToType(MRI, Src1Regs, HalfTy);
2376     }
2377 
2378     if (Src2Regs.empty())
2379       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2380     else
2381       setRegsToType(MRI, Src2Regs, HalfTy);
2382 
2383     setRegsToType(MRI, DefRegs, HalfTy);
2384 
2385     auto Flags = MI.getFlags();
2386     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0], Flags);
2387     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1], Flags);
2388 
2389     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2390     MI.eraseFromParent();
2391     return;
2392   }
2393   case AMDGPU::G_BRCOND: {
2394     Register CondReg = MI.getOperand(0).getReg();
2395     // FIXME: Should use legalizer helper, but should change bool ext type.
2396     const RegisterBank *CondBank =
2397       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2398 
2399     if (CondBank == &AMDGPU::SGPRRegBank) {
2400       const LLT S32 = LLT::scalar(32);
2401       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2402       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2403 
2404       MI.getOperand(0).setReg(NewCondReg);
2405       B.buildZExt(NewCondReg, CondReg);
2406       return;
2407     }
2408 
2409     break;
2410   }
2411   case AMDGPU::G_AND:
2412   case AMDGPU::G_OR:
2413   case AMDGPU::G_XOR: {
2414     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2415     // there is a VGPR input.
2416     Register DstReg = MI.getOperand(0).getReg();
2417     LLT DstTy = MRI.getType(DstReg);
2418 
2419     if (DstTy.getSizeInBits() == 1) {
2420       const RegisterBank *DstBank =
2421         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2422       if (DstBank == &AMDGPU::VCCRegBank)
2423         break;
2424 
2425       MachineFunction *MF = MI.getParent()->getParent();
2426       ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2427       LegalizerHelper Helper(*MF, ApplyBank, B);
2428 
2429       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2430           LegalizerHelper::Legalized)
2431         llvm_unreachable("widen scalar should have succeeded");
2432       return;
2433     }
2434 
2435     if (DstTy.getSizeInBits() != 64)
2436       break;
2437 
2438     LLT HalfTy = getHalfSizedType(DstTy);
2439     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2440     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2441     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2442 
2443     // All inputs are SGPRs, nothing special to do.
2444     if (DefRegs.empty()) {
2445       assert(Src0Regs.empty() && Src1Regs.empty());
2446       break;
2447     }
2448 
2449     assert(DefRegs.size() == 2);
2450     assert(Src0Regs.size() == Src1Regs.size() &&
2451            (Src0Regs.empty() || Src0Regs.size() == 2));
2452 
2453     // Depending on where the source registers came from, the generic code may
2454     // have decided to split the inputs already or not. If not, we still need to
2455     // extract the values.
2456 
2457     if (Src0Regs.empty())
2458       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2459     else
2460       setRegsToType(MRI, Src0Regs, HalfTy);
2461 
2462     if (Src1Regs.empty())
2463       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2464     else
2465       setRegsToType(MRI, Src1Regs, HalfTy);
2466 
2467     setRegsToType(MRI, DefRegs, HalfTy);
2468 
2469     auto Flags = MI.getFlags();
2470     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}, Flags);
2471     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}, Flags);
2472 
2473     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2474     MI.eraseFromParent();
2475     return;
2476   }
2477   case AMDGPU::G_ABS: {
2478     Register SrcReg = MI.getOperand(1).getReg();
2479     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2480 
2481     // There is no VALU abs instruction so we need to replace it with a sub and
2482     // max combination.
2483     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2484       MachineFunction *MF = MI.getParent()->getParent();
2485       ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2486       LegalizerHelper Helper(*MF, Apply, B);
2487 
2488       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2489         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2490       return;
2491     }
2492     [[fallthrough]];
2493   }
2494   case AMDGPU::G_ADD:
2495   case AMDGPU::G_SUB:
2496   case AMDGPU::G_MUL:
2497   case AMDGPU::G_SHL:
2498   case AMDGPU::G_LSHR:
2499   case AMDGPU::G_ASHR:
2500   case AMDGPU::G_SMIN:
2501   case AMDGPU::G_SMAX:
2502   case AMDGPU::G_UMIN:
2503   case AMDGPU::G_UMAX: {
2504     Register DstReg = MI.getOperand(0).getReg();
2505     LLT DstTy = MRI.getType(DstReg);
2506 
2507     // Special case for s_mul_u64. There is not a vector equivalent of
2508     // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
2509     // multiplications.
2510     if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2511       applyMappingSMULU64(B, OpdMapper);
2512       return;
2513     }
2514 
2515     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2516     // Packed 16-bit operations need to be scalarized and promoted.
2517     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2518       break;
2519 
2520     const RegisterBank *DstBank =
2521         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2522     if (DstBank == &AMDGPU::VGPRRegBank)
2523       break;
2524 
2525     const LLT S32 = LLT::scalar(32);
2526     MachineBasicBlock *MBB = MI.getParent();
2527     MachineFunction *MF = MBB->getParent();
2528     ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2529 
2530     if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2531       Register WideSrcLo, WideSrcHi;
2532 
2533       std::tie(WideSrcLo, WideSrcHi) =
2534           unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2535       auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2536       auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2537       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2538       MI.eraseFromParent();
2539       return;
2540     }
2541 
2542     if (DstTy.isVector()) {
2543       Register WideSrc0Lo, WideSrc0Hi;
2544       Register WideSrc1Lo, WideSrc1Hi;
2545 
2546       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2547       std::tie(WideSrc0Lo, WideSrc0Hi)
2548         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2549       std::tie(WideSrc1Lo, WideSrc1Hi)
2550         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2551       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2552       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2553       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2554       MI.eraseFromParent();
2555     } else {
2556       LegalizerHelper Helper(*MF, ApplySALU, B);
2557 
2558       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2559         llvm_unreachable("widen scalar should have succeeded");
2560 
2561       // FIXME: s16 shift amounts should be legal.
2562       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2563           Opc == AMDGPU::G_ASHR) {
2564         B.setInsertPt(*MBB, MI.getIterator());
2565         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2566           llvm_unreachable("widen scalar should have succeeded");
2567       }
2568     }
2569 
2570     return;
2571   }
2572   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2573   case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2574     // This is a special case for s_mul_u64. We use
2575     // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2576     // where the 33 higher bits are sign-extended and
2577     // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2578     // where the 32 higher bits are zero-extended. In case scalar registers are
2579     // selected, both opcodes are lowered as s_mul_u64. If the vector registers
2580     // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2581     // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2582 
2583     // Insert basic copies.
2584     applyDefaultMapping(OpdMapper);
2585 
2586     Register DstReg = MI.getOperand(0).getReg();
2587     Register SrcReg0 = MI.getOperand(1).getReg();
2588     Register SrcReg1 = MI.getOperand(2).getReg();
2589     const LLT S32 = LLT::scalar(32);
2590     const LLT S64 = LLT::scalar(64);
2591     assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2592                                          "that handles only 64-bit operands.");
2593     const RegisterBank *DstBank =
2594         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2595 
2596     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2597     // with s_mul_u64 operation.
2598     if (DstBank == &AMDGPU::SGPRRegBank) {
2599       MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2600       MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2601       MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2602       MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2603       return;
2604     }
2605 
2606     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2607     // with a vector mad.
2608     assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2609            "The destination operand should be in vector registers.");
2610 
2611     DebugLoc DL = MI.getDebugLoc();
2612 
2613     // Extract the lower subregister from the first operand.
2614     Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2615     MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2616     MRI.setType(Op0L, S32);
2617     B.buildTrunc(Op0L, SrcReg0);
2618 
2619     // Extract the lower subregister from the second operand.
2620     Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2621     MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2622     MRI.setType(Op1L, S32);
2623     B.buildTrunc(Op1L, SrcReg1);
2624 
2625     unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2626                           ? AMDGPU::G_AMDGPU_MAD_U64_U32
2627                           : AMDGPU::G_AMDGPU_MAD_I64_I32;
2628 
2629     MachineIRBuilder B(MI);
2630     Register Zero64 = B.buildConstant(S64, 0).getReg(0);
2631     MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2632     Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2633     MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2634     B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2635     MI.eraseFromParent();
2636     return;
2637   }
2638   case AMDGPU::G_SEXT_INREG: {
2639     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2640     if (SrcRegs.empty())
2641       break; // Nothing to repair
2642 
2643     const LLT S32 = LLT::scalar(32);
2644     ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2645 
2646     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2647     // we would need to further expand, and doesn't let us directly set the
2648     // result registers.
2649     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2650 
2651     int Amt = MI.getOperand(2).getImm();
2652     if (Amt <= 32) {
2653       // Downstream users have expectations for the high bit behavior, so freeze
2654       // incoming undefined bits.
2655       if (Amt == 32) {
2656         // The low bits are unchanged.
2657         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2658       } else {
2659         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2660         // Extend in the low bits and propagate the sign bit to the high half.
2661         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2662       }
2663 
2664       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2665     } else {
2666       // The low bits are unchanged, and extend in the high bits.
2667       // No freeze required
2668       B.buildCopy(DstRegs[0], SrcRegs[0]);
2669       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2670     }
2671 
2672     Register DstReg = MI.getOperand(0).getReg();
2673     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2674     MI.eraseFromParent();
2675     return;
2676   }
2677   case AMDGPU::G_CTPOP:
2678   case AMDGPU::G_BITREVERSE: {
2679     const RegisterBank *DstBank =
2680       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2681     if (DstBank == &AMDGPU::SGPRRegBank)
2682       break;
2683 
2684     Register SrcReg = MI.getOperand(1).getReg();
2685     const LLT S32 = LLT::scalar(32);
2686     LLT Ty = MRI.getType(SrcReg);
2687     if (Ty == S32)
2688       break;
2689 
2690     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2691 
2692     MachineFunction &MF = B.getMF();
2693     LegalizerHelper Helper(MF, ApplyVALU, B);
2694 
2695     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2696       llvm_unreachable("narrowScalar should have succeeded");
2697     return;
2698   }
2699   case AMDGPU::G_AMDGPU_FFBH_U32:
2700   case AMDGPU::G_AMDGPU_FFBL_B32:
2701   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2702   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2703     const RegisterBank *DstBank =
2704         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2705     if (DstBank == &AMDGPU::SGPRRegBank)
2706       break;
2707 
2708     Register SrcReg = MI.getOperand(1).getReg();
2709     const LLT S32 = LLT::scalar(32);
2710     LLT Ty = MRI.getType(SrcReg);
2711     if (Ty == S32)
2712       break;
2713 
2714     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2715     // which return -1 when the input is zero:
2716     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2717     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2718     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2719     // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2720     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2721     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2722     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2723                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2724                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2725                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2726                                 : Opc;
2727     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2728     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2729     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2730     unsigned AddOpc =
2731         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2732             ? AMDGPU::G_ADD
2733             : AMDGPU::G_UADDSAT;
2734     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2735     Register DstReg = MI.getOperand(0).getReg();
2736     B.buildUMin(DstReg, X, Y);
2737     MI.eraseFromParent();
2738     return;
2739   }
2740   case AMDGPU::G_SEXT:
2741   case AMDGPU::G_ZEXT:
2742   case AMDGPU::G_ANYEXT: {
2743     Register SrcReg = MI.getOperand(1).getReg();
2744     LLT SrcTy = MRI.getType(SrcReg);
2745     const bool Signed = Opc == AMDGPU::G_SEXT;
2746 
2747     assert(OpdMapper.getVRegs(1).empty());
2748 
2749     const RegisterBank *SrcBank =
2750       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2751 
2752     Register DstReg = MI.getOperand(0).getReg();
2753     LLT DstTy = MRI.getType(DstReg);
2754     if (DstTy.isScalar() &&
2755         SrcBank != &AMDGPU::SGPRRegBank &&
2756         SrcBank != &AMDGPU::VCCRegBank &&
2757         // FIXME: Should handle any type that round to s64 when irregular
2758         // breakdowns supported.
2759         DstTy.getSizeInBits() == 64 &&
2760         SrcTy.getSizeInBits() <= 32) {
2761       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2762 
2763       // Extend to 32-bit, and then extend the low half.
2764       if (Signed) {
2765         // TODO: Should really be buildSExtOrCopy
2766         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2767       } else if (Opc == AMDGPU::G_ZEXT) {
2768         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2769       } else {
2770         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2771       }
2772 
2773       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2774       MRI.setRegBank(DstReg, *SrcBank);
2775       MI.eraseFromParent();
2776       return;
2777     }
2778 
2779     if (SrcTy != LLT::scalar(1))
2780       return;
2781 
2782     // It is not legal to have a legalization artifact with a VCC source. Rather
2783     // than introducing a copy, insert the select we would have to select the
2784     // copy to.
2785     if (SrcBank == &AMDGPU::VCCRegBank) {
2786       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2787 
2788       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2789 
2790       unsigned DstSize = DstTy.getSizeInBits();
2791       // 64-bit select is SGPR only
2792       const bool UseSel64 = DstSize > 32 &&
2793         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2794 
2795       // TODO: Should s16 select be legal?
2796       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2797       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2798       auto False = B.buildConstant(SelType, 0);
2799 
2800       MRI.setRegBank(True.getReg(0), *DstBank);
2801       MRI.setRegBank(False.getReg(0), *DstBank);
2802       MRI.setRegBank(DstReg, *DstBank);
2803 
2804       if (DstSize > 32) {
2805         B.buildSelect(DefRegs[0], SrcReg, True, False);
2806         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2807       } else if (DstSize < 32) {
2808         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2809         MRI.setRegBank(Sel.getReg(0), *DstBank);
2810         B.buildTrunc(DstReg, Sel);
2811       } else {
2812         B.buildSelect(DstReg, SrcReg, True, False);
2813       }
2814 
2815       MI.eraseFromParent();
2816       return;
2817     }
2818 
2819     break;
2820   }
2821   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2822     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2823 
2824     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2825 
2826     Register DstReg = MI.getOperand(0).getReg();
2827     Register SrcReg = MI.getOperand(1).getReg();
2828 
2829     const LLT S32 = LLT::scalar(32);
2830     LLT DstTy = MRI.getType(DstReg);
2831     LLT SrcTy = MRI.getType(SrcReg);
2832 
2833     if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2834       return;
2835 
2836     const ValueMapping &DstMapping
2837       = OpdMapper.getInstrMapping().getOperandMapping(0);
2838     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2839     const RegisterBank *SrcBank =
2840       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2841     const RegisterBank *IdxBank =
2842         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2843 
2844     Register BaseIdxReg;
2845     unsigned ConstOffset;
2846     std::tie(BaseIdxReg, ConstOffset) =
2847         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2848 
2849     // See if the index is an add of a constant which will be foldable by moving
2850     // the base register of the index later if this is going to be executed in a
2851     // waterfall loop. This is essentially to reassociate the add of a constant
2852     // with the readfirstlane.
2853     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2854                                    ConstOffset > 0 &&
2855                                    ConstOffset < SrcTy.getNumElements();
2856 
2857     // Move the base register. We'll re-insert the add later.
2858     if (ShouldMoveIndexIntoLoop)
2859       MI.getOperand(2).setReg(BaseIdxReg);
2860 
2861     // If this is a VGPR result only because the index was a VGPR result, the
2862     // actual indexing will be done on the SGPR source vector, which will
2863     // produce a scalar result. We need to copy to the VGPR result inside the
2864     // waterfall loop.
2865     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2866                                 SrcBank == &AMDGPU::SGPRRegBank;
2867     if (DstRegs.empty()) {
2868       applyDefaultMapping(OpdMapper);
2869 
2870       executeInWaterfallLoop(B, MI, {2});
2871 
2872       if (NeedCopyToVGPR) {
2873         // We don't want a phi for this temporary reg.
2874         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2875         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2876         MI.getOperand(0).setReg(TmpReg);
2877         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2878 
2879         // Use a v_mov_b32 here to make the exec dependency explicit.
2880         buildVCopy(B, DstReg, TmpReg);
2881       }
2882 
2883       // Re-insert the constant offset add inside the waterfall loop.
2884       if (ShouldMoveIndexIntoLoop)
2885         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2886 
2887       return;
2888     }
2889 
2890     assert(DstTy.getSizeInBits() == 64);
2891 
2892     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2893 
2894     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2895     auto One = B.buildConstant(S32, 1);
2896 
2897     MachineBasicBlock::iterator MII = MI.getIterator();
2898 
2899     // Split the vector index into 32-bit pieces. Prepare to move all of the
2900     // new instructions into a waterfall loop if necessary.
2901     //
2902     // Don't put the bitcast or constant in the loop.
2903     MachineInstrSpan Span(MII, &B.getMBB());
2904 
2905     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2906     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2907     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2908 
2909     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2910     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2911 
2912     MRI.setRegBank(DstReg, *DstBank);
2913     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2914     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2915     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2916     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2917 
2918     SmallSet<Register, 4> OpsToWaterfall;
2919     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2920       MI.eraseFromParent();
2921       return;
2922     }
2923 
2924     // Remove the original instruction to avoid potentially confusing the
2925     // waterfall loop logic.
2926     B.setInstr(*Span.begin());
2927     MI.eraseFromParent();
2928     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2929                            OpsToWaterfall);
2930 
2931     if (NeedCopyToVGPR) {
2932       MachineBasicBlock *LoopBB = Extract1->getParent();
2933       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2934       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2935       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2936       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2937 
2938       Extract0->getOperand(0).setReg(TmpReg0);
2939       Extract1->getOperand(0).setReg(TmpReg1);
2940 
2941       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2942 
2943       buildVCopy(B, DstRegs[0], TmpReg0);
2944       buildVCopy(B, DstRegs[1], TmpReg1);
2945     }
2946 
2947     if (ShouldMoveIndexIntoLoop)
2948       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2949 
2950     return;
2951   }
2952   case AMDGPU::G_INSERT_VECTOR_ELT: {
2953     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2954 
2955     Register DstReg = MI.getOperand(0).getReg();
2956     LLT VecTy = MRI.getType(DstReg);
2957 
2958     assert(OpdMapper.getVRegs(0).empty());
2959     assert(OpdMapper.getVRegs(3).empty());
2960 
2961     if (substituteSimpleCopyRegs(OpdMapper, 1))
2962       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2963 
2964     if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2965       return;
2966 
2967     const RegisterBank *IdxBank =
2968       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2969 
2970     Register SrcReg = MI.getOperand(1).getReg();
2971     Register InsReg = MI.getOperand(2).getReg();
2972     LLT InsTy = MRI.getType(InsReg);
2973     (void)InsTy;
2974 
2975     Register BaseIdxReg;
2976     unsigned ConstOffset;
2977     std::tie(BaseIdxReg, ConstOffset) =
2978         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2979 
2980     // See if the index is an add of a constant which will be foldable by moving
2981     // the base register of the index later if this is going to be executed in a
2982     // waterfall loop. This is essentially to reassociate the add of a constant
2983     // with the readfirstlane.
2984     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2985       ConstOffset > 0 &&
2986       ConstOffset < VecTy.getNumElements();
2987 
2988     // Move the base register. We'll re-insert the add later.
2989     if (ShouldMoveIndexIntoLoop)
2990       MI.getOperand(3).setReg(BaseIdxReg);
2991 
2992 
2993     if (InsRegs.empty()) {
2994       executeInWaterfallLoop(B, MI, {3});
2995 
2996       // Re-insert the constant offset add inside the waterfall loop.
2997       if (ShouldMoveIndexIntoLoop) {
2998         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2999       }
3000 
3001       return;
3002     }
3003 
3004     assert(InsTy.getSizeInBits() == 64);
3005 
3006     const LLT S32 = LLT::scalar(32);
3007     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
3008 
3009     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
3010     auto One = B.buildConstant(S32, 1);
3011 
3012     // Split the vector index into 32-bit pieces. Prepare to move all of the
3013     // new instructions into a waterfall loop if necessary.
3014     //
3015     // Don't put the bitcast or constant in the loop.
3016     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
3017 
3018     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
3019     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
3020     auto IdxHi = B.buildAdd(S32, IdxLo, One);
3021 
3022     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
3023     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
3024 
3025     const RegisterBank *DstBank =
3026       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
3027     const RegisterBank *SrcBank =
3028       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
3029     const RegisterBank *InsSrcBank =
3030       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
3031 
3032     MRI.setRegBank(InsReg, *InsSrcBank);
3033     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
3034     MRI.setRegBank(InsLo.getReg(0), *DstBank);
3035     MRI.setRegBank(InsHi.getReg(0), *DstBank);
3036     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3037     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3038     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3039 
3040 
3041     SmallSet<Register, 4> OpsToWaterfall;
3042     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3043       B.setInsertPt(B.getMBB(), MI);
3044       B.buildBitcast(DstReg, InsHi);
3045       MI.eraseFromParent();
3046       return;
3047     }
3048 
3049     B.setInstr(*Span.begin());
3050     MI.eraseFromParent();
3051 
3052     // Figure out the point after the waterfall loop before mangling the control
3053     // flow.
3054     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3055                            OpsToWaterfall);
3056 
3057     // The insertion point is now right after the original instruction.
3058     //
3059     // Keep the bitcast to the original vector type out of the loop. Doing this
3060     // saved an extra phi we don't need inside the loop.
3061     B.buildBitcast(DstReg, InsHi);
3062 
3063     // Re-insert the constant offset add inside the waterfall loop.
3064     if (ShouldMoveIndexIntoLoop)
3065       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3066 
3067     return;
3068   }
3069   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3070   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3071   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3072   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3073   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3074   case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3075   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3076   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3077   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3078   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3079   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3080   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3081   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3082   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3083   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3084   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3085   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3086   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3087   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3088   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3089   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3090   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3091     applyDefaultMapping(OpdMapper);
3092     executeInWaterfallLoop(B, MI, {1, 4});
3093     return;
3094   }
3095   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3096   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3097   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3098   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3099   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3100   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3101   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3102   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3103   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3104   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3105   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3106   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3107   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3108   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3109   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3110     applyDefaultMapping(OpdMapper);
3111     executeInWaterfallLoop(B, MI, {2, 5});
3112     return;
3113   }
3114   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3115     applyDefaultMapping(OpdMapper);
3116     executeInWaterfallLoop(B, MI, {3, 6});
3117     return;
3118   }
3119   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3120   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3121   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3122   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3123   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3124     applyMappingSBufferLoad(B, OpdMapper);
3125     return;
3126   }
3127   case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
3128     constrainOpWithReadfirstlane(B, MI, 0);
3129     constrainOpWithReadfirstlane(B, MI, 2);
3130     return;
3131   case AMDGPU::G_INTRINSIC:
3132   case AMDGPU::G_INTRINSIC_CONVERGENT: {
3133     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3134     case Intrinsic::amdgcn_readlane: {
3135       substituteSimpleCopyRegs(OpdMapper, 2);
3136 
3137       assert(OpdMapper.getVRegs(0).empty());
3138       assert(OpdMapper.getVRegs(3).empty());
3139 
3140       // Make sure the index is an SGPR. It doesn't make sense to run this in a
3141       // waterfall loop, so assume it's a uniform value.
3142       constrainOpWithReadfirstlane(B, MI, 3); // Index
3143       return;
3144     }
3145     case Intrinsic::amdgcn_writelane: {
3146       assert(OpdMapper.getVRegs(0).empty());
3147       assert(OpdMapper.getVRegs(2).empty());
3148       assert(OpdMapper.getVRegs(3).empty());
3149 
3150       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3151       constrainOpWithReadfirstlane(B, MI, 2); // Source value
3152       constrainOpWithReadfirstlane(B, MI, 3); // Index
3153       return;
3154     }
3155     case Intrinsic::amdgcn_interp_p1:
3156     case Intrinsic::amdgcn_interp_p2:
3157     case Intrinsic::amdgcn_interp_mov:
3158     case Intrinsic::amdgcn_interp_p1_f16:
3159     case Intrinsic::amdgcn_interp_p2_f16:
3160     case Intrinsic::amdgcn_lds_param_load: {
3161       applyDefaultMapping(OpdMapper);
3162 
3163       // Readlane for m0 value, which is always the last operand.
3164       // FIXME: Should this be a waterfall loop instead?
3165       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3166       return;
3167     }
3168     case Intrinsic::amdgcn_interp_inreg_p10:
3169     case Intrinsic::amdgcn_interp_inreg_p2:
3170     case Intrinsic::amdgcn_interp_inreg_p10_f16:
3171     case Intrinsic::amdgcn_interp_inreg_p2_f16:
3172     case Intrinsic::amdgcn_interp_p10_rtz_f16:
3173     case Intrinsic::amdgcn_interp_p2_rtz_f16:
3174     case Intrinsic::amdgcn_permlane16_swap:
3175     case Intrinsic::amdgcn_permlane32_swap:
3176       applyDefaultMapping(OpdMapper);
3177       return;
3178     case Intrinsic::amdgcn_permlane16:
3179     case Intrinsic::amdgcn_permlanex16: {
3180       // Doing a waterfall loop over these wouldn't make any sense.
3181       substituteSimpleCopyRegs(OpdMapper, 2);
3182       substituteSimpleCopyRegs(OpdMapper, 3);
3183       constrainOpWithReadfirstlane(B, MI, 4);
3184       constrainOpWithReadfirstlane(B, MI, 5);
3185       return;
3186     }
3187     case Intrinsic::amdgcn_sbfe:
3188       applyMappingBFE(B, OpdMapper, true);
3189       return;
3190     case Intrinsic::amdgcn_ubfe:
3191       applyMappingBFE(B, OpdMapper, false);
3192       return;
3193     case Intrinsic::amdgcn_inverse_ballot:
3194     case Intrinsic::amdgcn_s_bitreplicate:
3195     case Intrinsic::amdgcn_s_quadmask:
3196     case Intrinsic::amdgcn_s_wqm:
3197       applyDefaultMapping(OpdMapper);
3198       constrainOpWithReadfirstlane(B, MI, 2); // Mask
3199       return;
3200     case Intrinsic::amdgcn_ballot:
3201       // Use default handling and insert copy to vcc source.
3202       break;
3203     }
3204     break;
3205   }
3206   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3207   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3208   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3209   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3210   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3211     const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3212         AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3213     assert(RSrcIntrin && RSrcIntrin->IsImage);
3214     // Non-images can have complications from operands that allow both SGPR
3215     // and VGPR. For now it's too complicated to figure out the final opcode
3216     // to derive the register bank from the MCInstrDesc.
3217     applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3218     return;
3219   }
3220   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3221     unsigned N = MI.getNumExplicitOperands() - 2;
3222     applyDefaultMapping(OpdMapper);
3223     executeInWaterfallLoop(B, MI, {N});
3224     return;
3225   }
3226   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3227   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3228     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3229     switch (IntrID) {
3230     case Intrinsic::amdgcn_ds_ordered_add:
3231     case Intrinsic::amdgcn_ds_ordered_swap: {
3232       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3233       assert(OpdMapper.getVRegs(0).empty());
3234       substituteSimpleCopyRegs(OpdMapper, 3);
3235       constrainOpWithReadfirstlane(B, MI, 2); // M0
3236       return;
3237     }
3238     case Intrinsic::amdgcn_ds_gws_init:
3239     case Intrinsic::amdgcn_ds_gws_barrier:
3240     case Intrinsic::amdgcn_ds_gws_sema_br: {
3241       // Only the first lane is executes, so readfirstlane is safe.
3242       substituteSimpleCopyRegs(OpdMapper, 1);
3243       constrainOpWithReadfirstlane(B, MI, 2); // M0
3244       return;
3245     }
3246     case Intrinsic::amdgcn_ds_gws_sema_v:
3247     case Intrinsic::amdgcn_ds_gws_sema_p:
3248     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3249       // Only the first lane is executes, so readfirstlane is safe.
3250       constrainOpWithReadfirstlane(B, MI, 1); // M0
3251       return;
3252     }
3253     case Intrinsic::amdgcn_ds_append:
3254     case Intrinsic::amdgcn_ds_consume: {
3255       constrainOpWithReadfirstlane(B, MI, 2); // M0
3256       return;
3257     }
3258     case Intrinsic::amdgcn_s_sendmsg:
3259     case Intrinsic::amdgcn_s_sendmsghalt: {
3260       // FIXME: Should this use a waterfall loop?
3261       constrainOpWithReadfirstlane(B, MI, 2); // M0
3262       return;
3263     }
3264     case Intrinsic::amdgcn_s_setreg: {
3265       constrainOpWithReadfirstlane(B, MI, 2);
3266       return;
3267     }
3268     case Intrinsic::amdgcn_s_ttracedata:
3269       constrainOpWithReadfirstlane(B, MI, 1); // M0
3270       return;
3271     case Intrinsic::amdgcn_raw_buffer_load_lds:
3272     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3273       applyDefaultMapping(OpdMapper);
3274       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3275       constrainOpWithReadfirstlane(B, MI, 2); // M0
3276       constrainOpWithReadfirstlane(B, MI, 5); // soffset
3277       return;
3278     }
3279     case Intrinsic::amdgcn_struct_buffer_load_lds:
3280     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3281       applyDefaultMapping(OpdMapper);
3282       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3283       constrainOpWithReadfirstlane(B, MI, 2); // M0
3284       constrainOpWithReadfirstlane(B, MI, 6); // soffset
3285       return;
3286     }
3287     case Intrinsic::amdgcn_global_load_lds: {
3288       applyDefaultMapping(OpdMapper);
3289       constrainOpWithReadfirstlane(B, MI, 2);
3290       return;
3291     }
3292     case Intrinsic::amdgcn_lds_direct_load: {
3293       applyDefaultMapping(OpdMapper);
3294       // Readlane for m0 value, which is always the last operand.
3295       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3296       return;
3297     }
3298     case Intrinsic::amdgcn_exp_row:
3299       applyDefaultMapping(OpdMapper);
3300       constrainOpWithReadfirstlane(B, MI, 8); // M0
3301       return;
3302     case Intrinsic::amdgcn_s_sleep_var:
3303       assert(OpdMapper.getVRegs(1).empty());
3304       constrainOpWithReadfirstlane(B, MI, 1);
3305       return;
3306     case Intrinsic::amdgcn_s_barrier_join:
3307       constrainOpWithReadfirstlane(B, MI, 1);
3308       return;
3309     case Intrinsic::amdgcn_s_barrier_init:
3310     case Intrinsic::amdgcn_s_barrier_signal_var:
3311       constrainOpWithReadfirstlane(B, MI, 1);
3312       constrainOpWithReadfirstlane(B, MI, 2);
3313       return;
3314     case Intrinsic::amdgcn_s_get_barrier_state:
3315     case Intrinsic::amdgcn_s_get_named_barrier_state: {
3316       constrainOpWithReadfirstlane(B, MI, 2);
3317       return;
3318     }
3319     case Intrinsic::amdgcn_s_prefetch_data: {
3320       Register PtrReg = MI.getOperand(1).getReg();
3321       unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3322       if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3323         constrainOpWithReadfirstlane(B, MI, 1);
3324         constrainOpWithReadfirstlane(B, MI, 2);
3325       } else
3326         MI.eraseFromParent();
3327       return;
3328     }
3329     default: {
3330       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3331               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3332         // Non-images can have complications from operands that allow both SGPR
3333         // and VGPR. For now it's too complicated to figure out the final opcode
3334         // to derive the register bank from the MCInstrDesc.
3335         if (RSrcIntrin->IsImage) {
3336           applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3337           return;
3338         }
3339       }
3340 
3341       break;
3342     }
3343     }
3344     break;
3345   }
3346   case AMDGPU::G_SI_CALL: {
3347     // Use a set to avoid extra readfirstlanes in the case where multiple
3348     // operands are the same register.
3349     SmallSet<Register, 4> SGPROperandRegs;
3350 
3351     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3352       break;
3353 
3354     // Move all copies to physical SGPRs that are used by the call instruction
3355     // into the loop block. Start searching for these copies until the
3356     // ADJCALLSTACKUP.
3357     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3358     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3359 
3360     // Move all non-copies before the copies, so that a complete range can be
3361     // moved into the waterfall loop.
3362     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3363     // Count of NonCopyInstrs found until the current LastCopy.
3364     unsigned NonCopyInstrsLen = 0;
3365     MachineBasicBlock::iterator Start(&MI);
3366     MachineBasicBlock::iterator LastCopy = Start;
3367     MachineBasicBlock *MBB = MI.getParent();
3368     const SIMachineFunctionInfo *Info =
3369         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3370     while (Start->getOpcode() != FrameSetupOpcode) {
3371       --Start;
3372       bool IsCopy = false;
3373       if (Start->getOpcode() == AMDGPU::COPY) {
3374         auto &Dst = Start->getOperand(0);
3375         if (Dst.isReg()) {
3376           Register Reg = Dst.getReg();
3377           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3378             IsCopy = true;
3379           } else {
3380             // Also move the copy from the scratch rsrc descriptor into the loop
3381             // to allow it to be optimized away.
3382             auto &Src = Start->getOperand(1);
3383             if (Src.isReg()) {
3384               Reg = Src.getReg();
3385               IsCopy = Info->getScratchRSrcReg() == Reg;
3386             }
3387           }
3388         }
3389       }
3390 
3391       if (IsCopy) {
3392         LastCopy = Start;
3393         NonCopyInstrsLen = NonCopyInstrs.size();
3394       } else {
3395         NonCopyInstrs.push_back(&*Start);
3396       }
3397     }
3398     NonCopyInstrs.resize(NonCopyInstrsLen);
3399 
3400     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3401       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3402     }
3403     Start = LastCopy;
3404 
3405     // Do the same for copies after the loop
3406     NonCopyInstrs.clear();
3407     NonCopyInstrsLen = 0;
3408     MachineBasicBlock::iterator End(&MI);
3409     LastCopy = End;
3410     while (End->getOpcode() != FrameDestroyOpcode) {
3411       ++End;
3412       bool IsCopy = false;
3413       if (End->getOpcode() == AMDGPU::COPY) {
3414         auto &Src = End->getOperand(1);
3415         if (Src.isReg()) {
3416           Register Reg = Src.getReg();
3417           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3418         }
3419       }
3420 
3421       if (IsCopy) {
3422         LastCopy = End;
3423         NonCopyInstrsLen = NonCopyInstrs.size();
3424       } else {
3425         NonCopyInstrs.push_back(&*End);
3426       }
3427     }
3428     NonCopyInstrs.resize(NonCopyInstrsLen);
3429 
3430     End = LastCopy;
3431     ++LastCopy;
3432     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3433       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3434     }
3435 
3436     ++End;
3437     B.setInsertPt(B.getMBB(), Start);
3438     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3439     break;
3440   }
3441   case AMDGPU::G_LOAD:
3442   case AMDGPU::G_ZEXTLOAD:
3443   case AMDGPU::G_SEXTLOAD: {
3444     if (applyMappingLoad(B, OpdMapper, MI))
3445       return;
3446     break;
3447   }
3448   case AMDGPU::G_DYN_STACKALLOC:
3449     applyMappingDynStackAlloc(B, OpdMapper, MI);
3450     return;
3451   case AMDGPU::G_STACKRESTORE: {
3452     applyDefaultMapping(OpdMapper);
3453     constrainOpWithReadfirstlane(B, MI, 0);
3454     return;
3455   }
3456   case AMDGPU::G_SBFX:
3457     applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3458     return;
3459   case AMDGPU::G_UBFX:
3460     applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3461     return;
3462   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3463   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3464     applyMappingMAD_64_32(B, OpdMapper);
3465     return;
3466   case AMDGPU::G_PREFETCH: {
3467     if (!Subtarget.hasPrefetch()) {
3468       MI.eraseFromParent();
3469       return;
3470     }
3471     Register PtrReg = MI.getOperand(0).getReg();
3472     unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3473     if (PtrBank == AMDGPU::VGPRRegBankID) {
3474       MI.eraseFromParent();
3475       return;
3476     }
3477     unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3478     if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3479         AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3480       MI.eraseFromParent();
3481       return;
3482     }
3483     applyDefaultMapping(OpdMapper);
3484     return;
3485   }
3486   default:
3487     break;
3488   }
3489 
3490   return applyDefaultMapping(OpdMapper);
3491 }
3492 
3493 // vgpr, sgpr -> vgpr
3494 // vgpr, agpr -> vgpr
3495 // agpr, agpr -> agpr
3496 // agpr, sgpr -> vgpr
3497 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3498   if (RB0 == AMDGPU::InvalidRegBankID)
3499     return RB1;
3500   if (RB1 == AMDGPU::InvalidRegBankID)
3501     return RB0;
3502 
3503   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3504     return AMDGPU::SGPRRegBankID;
3505 
3506   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3507     return AMDGPU::AGPRRegBankID;
3508 
3509   return AMDGPU::VGPRRegBankID;
3510 }
3511 
3512 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3513   if (RB0 == AMDGPU::InvalidRegBankID)
3514     return RB1;
3515   if (RB1 == AMDGPU::InvalidRegBankID)
3516     return RB0;
3517 
3518   // vcc, vcc -> vcc
3519   // vcc, sgpr -> vcc
3520   // vcc, vgpr -> vcc
3521   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3522     return AMDGPU::VCCRegBankID;
3523 
3524   // vcc, vgpr -> vgpr
3525   return regBankUnion(RB0, RB1);
3526 }
3527 
3528 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3529                                                 const MachineInstr &MI) const {
3530   unsigned RegBank = AMDGPU::InvalidRegBankID;
3531 
3532   for (const MachineOperand &MO : MI.operands()) {
3533     if (!MO.isReg())
3534       continue;
3535     Register Reg = MO.getReg();
3536     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3537       RegBank = regBankUnion(RegBank, Bank->getID());
3538       if (RegBank == AMDGPU::VGPRRegBankID)
3539         break;
3540     }
3541   }
3542 
3543   return RegBank;
3544 }
3545 
3546 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3547   const MachineFunction &MF = *MI.getParent()->getParent();
3548   const MachineRegisterInfo &MRI = MF.getRegInfo();
3549   for (const MachineOperand &MO : MI.operands()) {
3550     if (!MO.isReg())
3551       continue;
3552     Register Reg = MO.getReg();
3553     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3554       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3555         return false;
3556     }
3557   }
3558   return true;
3559 }
3560 
3561 const RegisterBankInfo::InstructionMapping &
3562 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3563   const MachineFunction &MF = *MI.getParent()->getParent();
3564   const MachineRegisterInfo &MRI = MF.getRegInfo();
3565   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3566 
3567   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3568     const MachineOperand &SrcOp = MI.getOperand(i);
3569     if (!SrcOp.isReg())
3570       continue;
3571 
3572     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3573     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3574   }
3575   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3576                                MI.getNumOperands());
3577 }
3578 
3579 const RegisterBankInfo::InstructionMapping &
3580 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3581   const MachineFunction &MF = *MI.getParent()->getParent();
3582   const MachineRegisterInfo &MRI = MF.getRegInfo();
3583   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3584 
3585   // Even though we technically could use SGPRs, this would require knowledge of
3586   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3587   //
3588   // TODO: Unary ops are trivially OK, so accept SGPRs?
3589   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3590     const MachineOperand &Src = MI.getOperand(i);
3591     if (!Src.isReg())
3592       continue;
3593 
3594     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3595     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3596     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3597   }
3598 
3599   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3600                                MI.getNumOperands());
3601 }
3602 
3603 const RegisterBankInfo::InstructionMapping &
3604 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3605   const MachineFunction &MF = *MI.getParent()->getParent();
3606   const MachineRegisterInfo &MRI = MF.getRegInfo();
3607   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3608 
3609   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3610     const MachineOperand &Op = MI.getOperand(I);
3611     if (!Op.isReg())
3612       continue;
3613 
3614     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3615     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3616   }
3617 
3618   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3619                                MI.getNumOperands());
3620 }
3621 
3622 const RegisterBankInfo::InstructionMapping &
3623 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3624                                         const MachineInstr &MI,
3625                                         int RsrcIdx) const {
3626   // The reported argument index is relative to the IR intrinsic call arguments,
3627   // so we need to shift by the number of defs and the intrinsic ID.
3628   RsrcIdx += MI.getNumExplicitDefs() + 1;
3629 
3630   const int NumOps = MI.getNumOperands();
3631   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3632 
3633   // TODO: Should packed/unpacked D16 difference be reported here as part of
3634   // the value mapping?
3635   for (int I = 0; I != NumOps; ++I) {
3636     if (!MI.getOperand(I).isReg())
3637       continue;
3638 
3639     Register OpReg = MI.getOperand(I).getReg();
3640     // We replace some dead address operands with $noreg
3641     if (!OpReg)
3642       continue;
3643 
3644     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3645 
3646     // FIXME: Probably need a new intrinsic register bank searchable table to
3647     // handle arbitrary intrinsics easily.
3648     //
3649     // If this has a sampler, it immediately follows rsrc.
3650     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3651 
3652     if (MustBeSGPR) {
3653       // If this must be an SGPR, so we must report whatever it is as legal.
3654       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3655       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3656     } else {
3657       // Some operands must be VGPR, and these are easy to copy to.
3658       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3659     }
3660   }
3661 
3662   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3663 }
3664 
3665 /// Return the mapping for a pointer argument.
3666 const RegisterBankInfo::ValueMapping *
3667 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3668                                               Register PtrReg) const {
3669   LLT PtrTy = MRI.getType(PtrReg);
3670   unsigned Size = PtrTy.getSizeInBits();
3671   if (Subtarget.useFlatForGlobal() ||
3672       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3673     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3674 
3675   // If we're using MUBUF instructions for global memory, an SGPR base register
3676   // is possible. Otherwise this needs to be a VGPR.
3677   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3678   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3679 }
3680 
3681 const RegisterBankInfo::InstructionMapping &
3682 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3683 
3684   const MachineFunction &MF = *MI.getParent()->getParent();
3685   const MachineRegisterInfo &MRI = MF.getRegInfo();
3686   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3687   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3688   Register PtrReg = MI.getOperand(1).getReg();
3689   LLT PtrTy = MRI.getType(PtrReg);
3690   unsigned AS = PtrTy.getAddressSpace();
3691   unsigned PtrSize = PtrTy.getSizeInBits();
3692 
3693   const ValueMapping *ValMapping;
3694   const ValueMapping *PtrMapping;
3695 
3696   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3697 
3698   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3699     if (isScalarLoadLegal(MI)) {
3700       // We have a uniform instruction so we want to use an SMRD load
3701       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3702       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3703     } else {
3704       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3705 
3706       // If we're using MUBUF instructions for global memory, an SGPR base
3707       // register is possible. Otherwise this needs to be a VGPR.
3708       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3709         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3710 
3711       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3712     }
3713   } else {
3714     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3715     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3716   }
3717 
3718   OpdsMapping[0] = ValMapping;
3719   OpdsMapping[1] = PtrMapping;
3720   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3721       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3722   return Mapping;
3723 
3724   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3725   // handle that during instruction selection?
3726 }
3727 
3728 unsigned
3729 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3730                                      const MachineRegisterInfo &MRI,
3731                                      unsigned Default) const {
3732   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3733   return Bank ? Bank->getID() : Default;
3734 }
3735 
3736 const RegisterBankInfo::ValueMapping *
3737 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3738                                          const MachineRegisterInfo &MRI,
3739                                          const TargetRegisterInfo &TRI) const {
3740   // Lie and claim anything is legal, even though this needs to be an SGPR
3741   // applyMapping will have to deal with it as a waterfall loop.
3742   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3743   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3744   return AMDGPU::getValueMapping(Bank, Size);
3745 }
3746 
3747 const RegisterBankInfo::ValueMapping *
3748 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3749                                          const MachineRegisterInfo &MRI,
3750                                          const TargetRegisterInfo &TRI) const {
3751   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3752   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3753 }
3754 
3755 const RegisterBankInfo::ValueMapping *
3756 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3757                                          const MachineRegisterInfo &MRI,
3758                                          const TargetRegisterInfo &TRI) const {
3759   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3760   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3761 }
3762 
3763 ///
3764 /// This function must return a legal mapping, because
3765 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3766 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3767 /// VGPR to SGPR generated is illegal.
3768 ///
3769 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3770 // legal. These will be dealt with in applyMappingImpl.
3771 //
3772 const RegisterBankInfo::InstructionMapping &
3773 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3774   const MachineFunction &MF = *MI.getParent()->getParent();
3775   const MachineRegisterInfo &MRI = MF.getRegInfo();
3776 
3777   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3778     Register DstReg = MI.getOperand(0).getReg();
3779     Register SrcReg = MI.getOperand(1).getReg();
3780 
3781     // The default logic bothers to analyze impossible alternative mappings. We
3782     // want the most straightforward mapping, so just directly handle this.
3783     const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
3784     const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
3785     assert(SrcBank && "src bank should have been assigned already");
3786 
3787     // For COPY between a physical reg and an s1, there is no type associated so
3788     // we need to take the virtual register's type as a hint on how to interpret
3789     // s1 values.
3790     if (!SrcReg.isVirtual() && !DstBank &&
3791         MRI.getType(DstReg) == LLT::scalar(1))
3792       DstBank = &AMDGPU::VCCRegBank;
3793     else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
3794       DstBank = &AMDGPU::VCCRegBank;
3795 
3796     if (!DstBank)
3797       DstBank = SrcBank;
3798 
3799     unsigned Size = getSizeInBits(DstReg, MRI, *TRI);
3800     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3801         cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3802       return getInvalidInstructionMapping();
3803 
3804     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3805     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3806     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3807     OpdsMapping[0] = &ValMap;
3808     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3809       OpdsMapping[1] = &ValMap;
3810 
3811     return getInstructionMapping(
3812         1, /*Cost*/ 1,
3813         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3814   }
3815 
3816   if (MI.isRegSequence()) {
3817     // If any input is a VGPR, the result must be a VGPR. The default handling
3818     // assumes any copy between banks is legal.
3819     unsigned BankID = AMDGPU::SGPRRegBankID;
3820 
3821     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3822       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3823       // It doesn't make sense to use vcc or scc banks here, so just ignore
3824       // them.
3825       if (OpBank != AMDGPU::SGPRRegBankID) {
3826         BankID = AMDGPU::VGPRRegBankID;
3827         break;
3828       }
3829     }
3830     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3831 
3832     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3833     return getInstructionMapping(
3834         1, /*Cost*/ 1,
3835         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3836   }
3837 
3838   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3839   // properly.
3840   //
3841   // TODO: There are additional exec masking dependencies to analyze.
3842   if (auto *PHI = dyn_cast<GPhi>(&MI)) {
3843     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3844     Register DstReg = PHI->getReg(0);
3845 
3846     // Sometimes the result may have already been assigned a bank.
3847     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3848       ResultBank = DstBank->getID();
3849 
3850     for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3851       Register Reg = PHI->getIncomingValue(I);
3852       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3853 
3854       // FIXME: Assuming VGPR for any undetermined inputs.
3855       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3856         ResultBank = AMDGPU::VGPRRegBankID;
3857         break;
3858       }
3859 
3860       // FIXME: Need to promote SGPR case to s32
3861       unsigned OpBank = Bank->getID();
3862       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3863     }
3864 
3865     assert(ResultBank != AMDGPU::InvalidRegBankID);
3866 
3867     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3868 
3869     const ValueMapping &ValMap =
3870         getValueMapping(0, Size, getRegBank(ResultBank));
3871     return getInstructionMapping(
3872         1, /*Cost*/ 1,
3873         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3874   }
3875 
3876   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3877   if (Mapping.isValid())
3878     return Mapping;
3879 
3880   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3881 
3882   switch (MI.getOpcode()) {
3883   default:
3884     return getInvalidInstructionMapping();
3885 
3886   case AMDGPU::G_AND:
3887   case AMDGPU::G_OR:
3888   case AMDGPU::G_XOR:
3889   case AMDGPU::G_MUL: {
3890     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3891     if (Size == 1) {
3892       const RegisterBank *DstBank
3893         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3894 
3895       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3896       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3897       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3898       if (DstBank) {
3899         TargetBankID = DstBank->getID();
3900         if (DstBank == &AMDGPU::VCCRegBank) {
3901           TargetBankID = AMDGPU::VCCRegBankID;
3902           BankLHS = AMDGPU::VCCRegBankID;
3903           BankRHS = AMDGPU::VCCRegBankID;
3904         } else {
3905           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3906                                  AMDGPU::SGPRRegBankID);
3907           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3908                                  AMDGPU::SGPRRegBankID);
3909         }
3910       } else {
3911         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3912                                AMDGPU::VCCRegBankID);
3913         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3914                                AMDGPU::VCCRegBankID);
3915 
3916         // Both inputs should be true booleans to produce a boolean result.
3917         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3918           TargetBankID = AMDGPU::VGPRRegBankID;
3919         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3920           TargetBankID = AMDGPU::VCCRegBankID;
3921           BankLHS = AMDGPU::VCCRegBankID;
3922           BankRHS = AMDGPU::VCCRegBankID;
3923         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3924           TargetBankID = AMDGPU::SGPRRegBankID;
3925         }
3926       }
3927 
3928       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3929       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3930       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3931       break;
3932     }
3933 
3934     if (Size == 64) {
3935 
3936       if (isSALUMapping(MI)) {
3937         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3938         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3939       } else {
3940         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3941         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3942         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3943 
3944         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3945         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3946       }
3947 
3948       break;
3949     }
3950 
3951     [[fallthrough]];
3952   }
3953   case AMDGPU::G_PTR_ADD:
3954   case AMDGPU::G_PTRMASK:
3955   case AMDGPU::G_ADD:
3956   case AMDGPU::G_SUB:
3957   case AMDGPU::G_SHL:
3958   case AMDGPU::G_LSHR:
3959   case AMDGPU::G_ASHR:
3960   case AMDGPU::G_UADDO:
3961   case AMDGPU::G_USUBO:
3962   case AMDGPU::G_UADDE:
3963   case AMDGPU::G_SADDE:
3964   case AMDGPU::G_USUBE:
3965   case AMDGPU::G_SSUBE:
3966   case AMDGPU::G_SMIN:
3967   case AMDGPU::G_SMAX:
3968   case AMDGPU::G_UMIN:
3969   case AMDGPU::G_UMAX:
3970   case AMDGPU::G_ABS:
3971   case AMDGPU::G_SHUFFLE_VECTOR:
3972   case AMDGPU::G_SBFX:
3973   case AMDGPU::G_UBFX:
3974   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3975   case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3976     if (isSALUMapping(MI))
3977       return getDefaultMappingSOP(MI);
3978     return getDefaultMappingVOP(MI);
3979   case AMDGPU::G_FADD:
3980   case AMDGPU::G_FSUB:
3981   case AMDGPU::G_FMUL:
3982   case AMDGPU::G_FMA:
3983   case AMDGPU::G_FFLOOR:
3984   case AMDGPU::G_FCEIL:
3985   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3986   case AMDGPU::G_FMINNUM:
3987   case AMDGPU::G_FMAXNUM:
3988   case AMDGPU::G_FMINIMUM:
3989   case AMDGPU::G_FMAXIMUM:
3990   case AMDGPU::G_INTRINSIC_TRUNC:
3991   case AMDGPU::G_STRICT_FADD:
3992   case AMDGPU::G_STRICT_FSUB:
3993   case AMDGPU::G_STRICT_FMUL:
3994   case AMDGPU::G_STRICT_FMA: {
3995     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3996     unsigned Size = Ty.getSizeInBits();
3997     if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3998         (Size == 32 || Size == 16) && isSALUMapping(MI))
3999       return getDefaultMappingSOP(MI);
4000     return getDefaultMappingVOP(MI);
4001   }
4002   case AMDGPU::G_FPTOSI:
4003   case AMDGPU::G_FPTOUI:
4004   case AMDGPU::G_SITOFP:
4005   case AMDGPU::G_UITOFP: {
4006     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4007     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4008     if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
4009         isSALUMapping(MI))
4010       return getDefaultMappingSOP(MI);
4011     return getDefaultMappingVOP(MI);
4012   }
4013   case AMDGPU::G_FPTRUNC:
4014   case AMDGPU::G_FPEXT: {
4015     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4016     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4017     if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
4018         isSALUMapping(MI))
4019       return getDefaultMappingSOP(MI);
4020     return getDefaultMappingVOP(MI);
4021   }
4022   case AMDGPU::G_FSQRT:
4023   case AMDGPU::G_FEXP2:
4024   case AMDGPU::G_FLOG2: {
4025     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4026     if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4027         isSALUMapping(MI))
4028       return getDefaultMappingSOP(MI);
4029     return getDefaultMappingVOP(MI);
4030   }
4031   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
4032   case AMDGPU::G_SSUBSAT:
4033   case AMDGPU::G_UADDSAT:
4034   case AMDGPU::G_USUBSAT:
4035   case AMDGPU::G_FMAD:
4036   case AMDGPU::G_FLDEXP:
4037   case AMDGPU::G_FMINNUM_IEEE:
4038   case AMDGPU::G_FMAXNUM_IEEE:
4039   case AMDGPU::G_FCANONICALIZE:
4040   case AMDGPU::G_STRICT_FLDEXP:
4041   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4042   case AMDGPU::G_FSHR: // TODO: Expand for scalar
4043   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4044   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4045   case AMDGPU::G_AMDGPU_RCP_IFLAG:
4046   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4047   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4048   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4049   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4050   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4051   case AMDGPU::G_AMDGPU_SMED3:
4052   case AMDGPU::G_AMDGPU_FMED3:
4053     return getDefaultMappingVOP(MI);
4054   case AMDGPU::G_UMULH:
4055   case AMDGPU::G_SMULH: {
4056     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4057       return getDefaultMappingSOP(MI);
4058     return getDefaultMappingVOP(MI);
4059   }
4060   case AMDGPU::G_AMDGPU_MAD_U64_U32:
4061   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4062     // Three possible mappings:
4063     //
4064     //  - Default SOP
4065     //  - Default VOP
4066     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4067     //
4068     // This allows instruction selection to keep the multiplication part of the
4069     // instruction on the SALU.
4070     bool AllSalu = true;
4071     bool MulSalu = true;
4072     for (unsigned i = 0; i < 5; ++i) {
4073       Register Reg = MI.getOperand(i).getReg();
4074       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4075         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4076           AllSalu = false;
4077           if (i == 2 || i == 3) {
4078             MulSalu = false;
4079             break;
4080           }
4081         }
4082       }
4083     }
4084 
4085     if (AllSalu)
4086       return getDefaultMappingSOP(MI);
4087 
4088     // If the multiply-add is full-rate in VALU, use that even if the
4089     // multiplication part is scalar. Accumulating separately on the VALU would
4090     // take two instructions.
4091     if (!MulSalu || Subtarget.hasFullRate64Ops())
4092       return getDefaultMappingVOP(MI);
4093 
4094     // Keep the multiplication on the SALU, then accumulate on the VALU.
4095     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4096     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4097     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4098     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4099     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4100     break;
4101   }
4102   case AMDGPU::G_IMPLICIT_DEF: {
4103     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4104     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4105     break;
4106   }
4107   case AMDGPU::G_FCONSTANT:
4108   case AMDGPU::G_CONSTANT:
4109   case AMDGPU::G_GLOBAL_VALUE:
4110   case AMDGPU::G_FRAME_INDEX:
4111   case AMDGPU::G_BLOCK_ADDR:
4112   case AMDGPU::G_READSTEADYCOUNTER:
4113   case AMDGPU::G_READCYCLECOUNTER: {
4114     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4115     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4116     break;
4117   }
4118   case AMDGPU::G_DYN_STACKALLOC: {
4119     // Result is always uniform, and a wave reduction is needed for the source.
4120     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4121     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4122     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4123     break;
4124   }
4125   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4126     // This case is weird because we expect a physical register in the source,
4127     // but need to set a bank anyway.
4128     //
4129     // TODO: We could select the result to SGPR or VGPR
4130     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4131     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4132     break;
4133   }
4134   case AMDGPU::G_INSERT: {
4135     unsigned BankID = getMappingType(MRI, MI);
4136     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4137     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4138     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4139     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4140     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4141     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4142     OpdsMapping[3] = nullptr;
4143     break;
4144   }
4145   case AMDGPU::G_EXTRACT: {
4146     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4147     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4148     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4149     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4150     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4151     OpdsMapping[2] = nullptr;
4152     break;
4153   }
4154   case AMDGPU::G_BUILD_VECTOR:
4155   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4156     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4157     if (DstTy == LLT::fixed_vector(2, 16)) {
4158       unsigned DstSize = DstTy.getSizeInBits();
4159       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4160       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4161       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4162       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4163 
4164       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4165       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4166       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4167       break;
4168     }
4169 
4170     [[fallthrough]];
4171   }
4172   case AMDGPU::G_MERGE_VALUES:
4173   case AMDGPU::G_CONCAT_VECTORS: {
4174     unsigned Bank = getMappingType(MRI, MI);
4175     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4176     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4177 
4178     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4179     // Op1 and Dst should use the same register bank.
4180     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4181       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4182     break;
4183   }
4184   case AMDGPU::G_BITREVERSE:
4185   case AMDGPU::G_BITCAST:
4186   case AMDGPU::G_INTTOPTR:
4187   case AMDGPU::G_PTRTOINT:
4188   case AMDGPU::G_FABS:
4189   case AMDGPU::G_FNEG: {
4190     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4191     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4192     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4193     break;
4194   }
4195   case AMDGPU::G_AMDGPU_FFBH_U32:
4196   case AMDGPU::G_AMDGPU_FFBL_B32:
4197   case AMDGPU::G_CTLZ_ZERO_UNDEF:
4198   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4199     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4200     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4201     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4202     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4203     break;
4204   }
4205   case AMDGPU::G_CTPOP: {
4206     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4207     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4208     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4209 
4210     // This should really be getValueMappingSGPR64Only, but allowing the generic
4211     // code to handle the register split just makes using LegalizerHelper more
4212     // difficult.
4213     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4214     break;
4215   }
4216   case AMDGPU::G_TRUNC: {
4217     Register Dst = MI.getOperand(0).getReg();
4218     Register Src = MI.getOperand(1).getReg();
4219     unsigned Bank = getRegBankID(Src, MRI);
4220     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4221     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4222     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4223     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4224     break;
4225   }
4226   case AMDGPU::G_ZEXT:
4227   case AMDGPU::G_SEXT:
4228   case AMDGPU::G_ANYEXT:
4229   case AMDGPU::G_SEXT_INREG: {
4230     Register Dst = MI.getOperand(0).getReg();
4231     Register Src = MI.getOperand(1).getReg();
4232     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4233     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4234 
4235     unsigned DstBank;
4236     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4237     assert(SrcBank);
4238     switch (SrcBank->getID()) {
4239     case AMDGPU::SGPRRegBankID:
4240       DstBank = AMDGPU::SGPRRegBankID;
4241       break;
4242     default:
4243       DstBank = AMDGPU::VGPRRegBankID;
4244       break;
4245     }
4246 
4247     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4248     // 32-bits, and then to 64.
4249     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4250     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4251                                                        SrcSize);
4252     break;
4253   }
4254   case AMDGPU::G_IS_FPCLASS: {
4255     Register SrcReg = MI.getOperand(1).getReg();
4256     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4257     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4258     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4259     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4260     break;
4261   }
4262   case AMDGPU::G_STORE: {
4263     assert(MI.getOperand(0).isReg());
4264     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4265 
4266     // FIXME: We need to specify a different reg bank once scalar stores are
4267     // supported.
4268     const ValueMapping *ValMapping =
4269         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4270     OpdsMapping[0] = ValMapping;
4271     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4272     break;
4273   }
4274   case AMDGPU::G_ICMP:
4275   case AMDGPU::G_FCMP: {
4276     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4277 
4278     // See if the result register has already been constrained to vcc, which may
4279     // happen due to control flow intrinsic lowering.
4280     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4281                                     AMDGPU::SGPRRegBankID);
4282     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4283     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4284 
4285     auto canUseSCCICMP = [&]() {
4286       auto Pred =
4287           static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4288       return Size == 32 ||
4289              (Size == 64 &&
4290               (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4291               Subtarget.hasScalarCompareEq64());
4292     };
4293     auto canUseSCCFCMP = [&]() {
4294       return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4295     };
4296 
4297     bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4298     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4299                      Op2Bank == AMDGPU::SGPRRegBankID &&
4300                      Op3Bank == AMDGPU::SGPRRegBankID &&
4301                      (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4302 
4303     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4304     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4305 
4306     // TODO: Use 32-bit for scalar output size.
4307     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4308     const unsigned ResultSize = 1;
4309 
4310     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4311     OpdsMapping[1] = nullptr; // Predicate Operand.
4312     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4313     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4314     break;
4315   }
4316   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4317     // VGPR index can be used for waterfall when indexing a SGPR vector.
4318     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4319     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4320     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4321     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4322     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4323     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4324 
4325     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4326     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4327 
4328     // The index can be either if the source vector is VGPR.
4329     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4330     break;
4331   }
4332   case AMDGPU::G_INSERT_VECTOR_ELT: {
4333     unsigned OutputBankID = isSALUMapping(MI) ?
4334       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4335 
4336     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4337     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4338     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4339     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4340     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4341 
4342     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4343     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4344 
4345     // This is a weird case, because we need to break down the mapping based on
4346     // the register bank of a different operand.
4347     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4348       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4349                                                       InsertSize);
4350     } else {
4351       assert(InsertSize == 32 || InsertSize == 64);
4352       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4353     }
4354 
4355     // The index can be either if the source vector is VGPR.
4356     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4357     break;
4358   }
4359   case AMDGPU::G_UNMERGE_VALUES: {
4360     unsigned Bank = getMappingType(MRI, MI);
4361 
4362     // Op1 and Dst should use the same register bank.
4363     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4364     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4365       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4366       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4367     }
4368     break;
4369   }
4370   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4371   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4372   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4373   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4374   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4375   case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4376   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4377   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4378   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4379   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4380   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4381   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4382   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4383   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4384   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4385   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4386   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4387   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4388   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4389   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4390   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4391   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4392     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4393 
4394     // rsrc
4395     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4396 
4397     // vindex
4398     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4399 
4400     // voffset
4401     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4402 
4403     // soffset
4404     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4405 
4406     // Any remaining operands are immediates and were correctly null
4407     // initialized.
4408     break;
4409   }
4410   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4411   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4412   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4413   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4414   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4415   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4416   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4417   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4418   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4419   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4420   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4421   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4422   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4423   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4424   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4425     // vdata_out
4426     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4427 
4428     // vdata_in
4429     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4430 
4431     // rsrc
4432     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4433 
4434     // vindex
4435     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4436 
4437     // voffset
4438     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4439 
4440     // soffset
4441     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4442 
4443     // Any remaining operands are immediates and were correctly null
4444     // initialized.
4445     break;
4446   }
4447   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4448     // vdata_out
4449     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4450 
4451     // vdata_in
4452     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4453 
4454     // cmp
4455     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4456 
4457     // rsrc
4458     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4459 
4460     // vindex
4461     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4462 
4463     // voffset
4464     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4465 
4466     // soffset
4467     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4468 
4469     // Any remaining operands are immediates and were correctly null
4470     // initialized.
4471     break;
4472   }
4473   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4474   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4475   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4476   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4477   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4478     // Lie and claim everything is legal, even though some need to be
4479     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4480     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4481     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4482 
4483     // We need to convert this to a MUBUF if either the resource of offset is
4484     // VGPR.
4485     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4486     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4487     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4488 
4489     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4490     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4491     break;
4492   }
4493   case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4494     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4495     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4496     break;
4497   case AMDGPU::G_INTRINSIC:
4498   case AMDGPU::G_INTRINSIC_CONVERGENT: {
4499     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4500     default:
4501       return getInvalidInstructionMapping();
4502     case Intrinsic::amdgcn_div_fmas:
4503     case Intrinsic::amdgcn_div_fixup:
4504     case Intrinsic::amdgcn_trig_preop:
4505     case Intrinsic::amdgcn_sin:
4506     case Intrinsic::amdgcn_cos:
4507     case Intrinsic::amdgcn_log_clamp:
4508     case Intrinsic::amdgcn_rcp_legacy:
4509     case Intrinsic::amdgcn_rsq_legacy:
4510     case Intrinsic::amdgcn_rsq_clamp:
4511     case Intrinsic::amdgcn_fmul_legacy:
4512     case Intrinsic::amdgcn_fma_legacy:
4513     case Intrinsic::amdgcn_frexp_mant:
4514     case Intrinsic::amdgcn_frexp_exp:
4515     case Intrinsic::amdgcn_fract:
4516     case Intrinsic::amdgcn_cvt_pknorm_i16:
4517     case Intrinsic::amdgcn_cvt_pknorm_u16:
4518     case Intrinsic::amdgcn_cvt_pk_i16:
4519     case Intrinsic::amdgcn_cvt_pk_u16:
4520     case Intrinsic::amdgcn_fmed3:
4521     case Intrinsic::amdgcn_cubeid:
4522     case Intrinsic::amdgcn_cubema:
4523     case Intrinsic::amdgcn_cubesc:
4524     case Intrinsic::amdgcn_cubetc:
4525     case Intrinsic::amdgcn_sffbh:
4526     case Intrinsic::amdgcn_fmad_ftz:
4527     case Intrinsic::amdgcn_mbcnt_lo:
4528     case Intrinsic::amdgcn_mbcnt_hi:
4529     case Intrinsic::amdgcn_mul_u24:
4530     case Intrinsic::amdgcn_mul_i24:
4531     case Intrinsic::amdgcn_mulhi_u24:
4532     case Intrinsic::amdgcn_mulhi_i24:
4533     case Intrinsic::amdgcn_lerp:
4534     case Intrinsic::amdgcn_sad_u8:
4535     case Intrinsic::amdgcn_msad_u8:
4536     case Intrinsic::amdgcn_sad_hi_u8:
4537     case Intrinsic::amdgcn_sad_u16:
4538     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4539     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4540     case Intrinsic::amdgcn_mqsad_u32_u8:
4541     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4542     case Intrinsic::amdgcn_alignbyte:
4543     case Intrinsic::amdgcn_perm:
4544     case Intrinsic::amdgcn_prng_b32:
4545     case Intrinsic::amdgcn_fdot2:
4546     case Intrinsic::amdgcn_sdot2:
4547     case Intrinsic::amdgcn_udot2:
4548     case Intrinsic::amdgcn_sdot4:
4549     case Intrinsic::amdgcn_udot4:
4550     case Intrinsic::amdgcn_sdot8:
4551     case Intrinsic::amdgcn_udot8:
4552     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4553     case Intrinsic::amdgcn_fdot2_f16_f16:
4554     case Intrinsic::amdgcn_fdot2_f32_bf16:
4555     case Intrinsic::amdgcn_fdot2c_f32_bf16:
4556     case Intrinsic::amdgcn_sudot4:
4557     case Intrinsic::amdgcn_sudot8:
4558     case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4559     case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4560     case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4561     case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4562     case Intrinsic::amdgcn_cvt_f32_fp8:
4563     case Intrinsic::amdgcn_cvt_f32_bf8:
4564     case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4565     case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4566     case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4567     case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4568     case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4569     case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4570     case Intrinsic::amdgcn_cvt_sr_bf16_f32:
4571     case Intrinsic::amdgcn_cvt_sr_f16_f32:
4572     case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4573     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4574     case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4575     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4576     case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:
4577     case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
4578     case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
4579     case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
4580     case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
4581     case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4582     case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4583     case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
4584     case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:
4585     case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:
4586     case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:
4587     case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:
4588     case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:
4589     case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:
4590     case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:
4591     case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
4592     case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
4593     case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4594     case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4595     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4596     case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4597     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
4598     case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:
4599     case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:
4600     case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:
4601     case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:
4602     case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:
4603     case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:
4604     case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
4605     case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
4606     case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
4607     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
4608     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
4609     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
4610     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
4611     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
4612     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
4613     case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:
4614     case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:
4615     case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:
4616     case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:
4617     case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:
4618     case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:
4619     case Intrinsic::amdgcn_ashr_pk_i8_i32:
4620     case Intrinsic::amdgcn_ashr_pk_u8_i32:
4621     case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4622     case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
4623     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4624     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4625     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4626     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4627     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4628     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4629     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4630     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4631     case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4632     case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4633     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4634     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4635     case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4636     case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4637     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4638     case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4639     case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4640     case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4641     case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4642     case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4643     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4644     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4645     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4646     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4647       return getDefaultMappingVOP(MI);
4648     case Intrinsic::amdgcn_log:
4649     case Intrinsic::amdgcn_exp2:
4650     case Intrinsic::amdgcn_rcp:
4651     case Intrinsic::amdgcn_rsq:
4652     case Intrinsic::amdgcn_sqrt: {
4653       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4654       if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4655           isSALUMapping(MI))
4656         return getDefaultMappingSOP(MI);
4657       return getDefaultMappingVOP(MI);
4658     }
4659     case Intrinsic::amdgcn_sbfe:
4660     case Intrinsic::amdgcn_ubfe:
4661       if (isSALUMapping(MI))
4662         return getDefaultMappingSOP(MI);
4663       return getDefaultMappingVOP(MI);
4664     case Intrinsic::amdgcn_ds_swizzle:
4665     case Intrinsic::amdgcn_ds_permute:
4666     case Intrinsic::amdgcn_ds_bpermute:
4667     case Intrinsic::amdgcn_update_dpp:
4668     case Intrinsic::amdgcn_mov_dpp8:
4669     case Intrinsic::amdgcn_mov_dpp:
4670     case Intrinsic::amdgcn_strict_wwm:
4671     case Intrinsic::amdgcn_wwm:
4672     case Intrinsic::amdgcn_strict_wqm:
4673     case Intrinsic::amdgcn_wqm:
4674     case Intrinsic::amdgcn_softwqm:
4675     case Intrinsic::amdgcn_set_inactive:
4676     case Intrinsic::amdgcn_set_inactive_chain_arg:
4677     case Intrinsic::amdgcn_permlane64:
4678     case Intrinsic::amdgcn_ds_bpermute_fi_b32:
4679       return getDefaultMappingAllVGPR(MI);
4680     case Intrinsic::amdgcn_cvt_pkrtz:
4681       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4682         return getDefaultMappingSOP(MI);
4683       return getDefaultMappingVOP(MI);
4684     case Intrinsic::amdgcn_kernarg_segment_ptr:
4685     case Intrinsic::amdgcn_s_getpc:
4686     case Intrinsic::amdgcn_groupstaticsize:
4687     case Intrinsic::amdgcn_reloc_constant:
4688     case Intrinsic::returnaddress: {
4689       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4690       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4691       break;
4692     }
4693     case Intrinsic::amdgcn_wqm_vote: {
4694       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4695       OpdsMapping[0] = OpdsMapping[2]
4696         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4697       break;
4698     }
4699     case Intrinsic::amdgcn_ps_live: {
4700       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4701       break;
4702     }
4703     case Intrinsic::amdgcn_div_scale: {
4704       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4705       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4706       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4707       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4708 
4709       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4710       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4711       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4712       break;
4713     }
4714     case Intrinsic::amdgcn_class: {
4715       Register Src0Reg = MI.getOperand(2).getReg();
4716       Register Src1Reg = MI.getOperand(3).getReg();
4717       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4718       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4719       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4720       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4721       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4722       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4723       break;
4724     }
4725     case Intrinsic::amdgcn_icmp:
4726     case Intrinsic::amdgcn_fcmp: {
4727       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4728       // This is not VCCRegBank because this is not used in boolean contexts.
4729       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4730       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4731       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4732       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4733       break;
4734     }
4735     case Intrinsic::amdgcn_readlane: {
4736       // This must be an SGPR, but accept a VGPR.
4737       Register IdxReg = MI.getOperand(3).getReg();
4738       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4739       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4740       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4741       [[fallthrough]];
4742     }
4743     case Intrinsic::amdgcn_readfirstlane: {
4744       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4745       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4746       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4747       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4748       break;
4749     }
4750     case Intrinsic::amdgcn_writelane: {
4751       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4752       Register SrcReg = MI.getOperand(2).getReg();
4753       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4754       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4755       Register IdxReg = MI.getOperand(3).getReg();
4756       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4757       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4758       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4759 
4760       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4761       // to legalize.
4762       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4763       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4764       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4765       break;
4766     }
4767     case Intrinsic::amdgcn_if_break: {
4768       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4769       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4770       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4771       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4772       break;
4773     }
4774     case Intrinsic::amdgcn_permlane16:
4775     case Intrinsic::amdgcn_permlanex16: {
4776       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4777       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4778       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4779       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4780       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4781       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4782       break;
4783     }
4784     case Intrinsic::amdgcn_permlane16_var:
4785     case Intrinsic::amdgcn_permlanex16_var: {
4786       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4787       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4788       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4789       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4790       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4791       break;
4792     }
4793     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4794     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4795     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4796     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4797     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4798     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4799     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4800     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4801     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4802     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4803     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4804     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4805     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4806     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4807     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4808     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4809     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4810     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4811     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4812     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4813     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4814     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4815     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4816     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4817     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4818     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4819     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4820     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4821     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4822     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4823     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4824     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4825     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4826     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4827     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4828     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4829     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4830     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4831     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
4832     case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
4833     case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
4834     case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
4835     case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
4836     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
4837       // Default for MAI intrinsics.
4838       // srcC can also be an immediate which can be folded later.
4839       // FIXME: Should we eventually add an alternative mapping with AGPR src
4840       // for srcA/srcB?
4841       //
4842       // vdst, srcA, srcB, srcC
4843       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4844       OpdsMapping[0] =
4845           Info->mayNeedAGPRs()
4846               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4847               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4848       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4849       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4850       OpdsMapping[4] =
4851           Info->mayNeedAGPRs()
4852               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4853               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4854       break;
4855     }
4856     case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
4857     case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
4858       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4859       OpdsMapping[0] =
4860           Info->mayNeedAGPRs()
4861               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4862               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4863 
4864       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4865       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4866       OpdsMapping[4] =
4867           Info->mayNeedAGPRs()
4868               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4869               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4870 
4871       OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4872       OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);
4873       break;
4874     }
4875     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4876     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4877     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4878     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4879     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4880     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4881     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4882     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4883     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4884     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4885     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4886     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4887     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4888     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
4889     case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
4890     case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
4891     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
4892     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4893     case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4894     case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4895     case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4896     case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4897     case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4898     case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4899     case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4900     case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4901     case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4902     case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
4903       // vdst, srcA, srcB, srcC, idx
4904       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4905       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4906       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4907       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4908       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4909       break;
4910     }
4911     case Intrinsic::amdgcn_interp_p1:
4912     case Intrinsic::amdgcn_interp_p2:
4913     case Intrinsic::amdgcn_interp_mov:
4914     case Intrinsic::amdgcn_interp_p1_f16:
4915     case Intrinsic::amdgcn_interp_p2_f16:
4916     case Intrinsic::amdgcn_lds_param_load: {
4917       const int M0Idx = MI.getNumOperands() - 1;
4918       Register M0Reg = MI.getOperand(M0Idx).getReg();
4919       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4920       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4921 
4922       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4923       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4924         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4925 
4926       // Must be SGPR, but we must take whatever the original bank is and fix it
4927       // later.
4928       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4929       break;
4930     }
4931     case Intrinsic::amdgcn_interp_inreg_p10:
4932     case Intrinsic::amdgcn_interp_inreg_p2:
4933     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4934     case Intrinsic::amdgcn_interp_inreg_p2_f16:
4935     case Intrinsic::amdgcn_interp_p10_rtz_f16:
4936     case Intrinsic::amdgcn_interp_p2_rtz_f16: {
4937       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4938       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4939       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4940       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4941       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4942       break;
4943     }
4944     case Intrinsic::amdgcn_permlane16_swap:
4945     case Intrinsic::amdgcn_permlane32_swap: {
4946       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4947       OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =
4948           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4949       break;
4950     }
4951     case Intrinsic::amdgcn_ballot: {
4952       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4953       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4954       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4955       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4956       break;
4957     }
4958     case Intrinsic::amdgcn_inverse_ballot: {
4959       // This must be an SGPR, but accept a VGPR.
4960       Register MaskReg = MI.getOperand(2).getReg();
4961       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4962       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4963       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4964       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4965       break;
4966     }
4967     case Intrinsic::amdgcn_bitop3: {
4968       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4969       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4970       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4971       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4972       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4973       break;
4974     }
4975     case Intrinsic::amdgcn_s_quadmask:
4976     case Intrinsic::amdgcn_s_wqm: {
4977       Register MaskReg = MI.getOperand(2).getReg();
4978       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4979       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4980       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
4981       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4982       break;
4983     }
4984     case Intrinsic::amdgcn_wave_reduce_umin:
4985     case Intrinsic::amdgcn_wave_reduce_umax: {
4986       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4987       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4988       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4989       auto regBankID =
4990           isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4991       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4992       break;
4993     }
4994     case Intrinsic::amdgcn_s_bitreplicate:
4995       Register MaskReg = MI.getOperand(2).getReg();
4996       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4997       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4998       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4999     }
5000     break;
5001   }
5002   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
5003   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
5004   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
5005   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
5006   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
5007     auto IntrID = AMDGPU::getIntrinsicID(MI);
5008     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
5009     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
5010     // Non-images can have complications from operands that allow both SGPR
5011     // and VGPR. For now it's too complicated to figure out the final opcode
5012     // to derive the register bank from the MCInstrDesc.
5013     assert(RSrcIntrin->IsImage);
5014     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
5015   }
5016   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
5017     unsigned N = MI.getNumExplicitOperands() - 2;
5018     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
5019     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
5020     if (N == 3) {
5021       // Sequential form: all operands combined into VGPR256/VGPR512
5022       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
5023       if (Size > 256)
5024         Size = 512;
5025       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5026     } else {
5027       // NSA form
5028       for (unsigned I = 2; I < N; ++I) {
5029         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
5030         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5031       }
5032     }
5033     break;
5034   }
5035   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
5036   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
5037     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
5038     switch (IntrID) {
5039     case Intrinsic::amdgcn_s_getreg:
5040     case Intrinsic::amdgcn_s_memtime:
5041     case Intrinsic::amdgcn_s_memrealtime:
5042     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
5043     case Intrinsic::amdgcn_s_sendmsg_rtn: {
5044       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5045       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5046       break;
5047     }
5048     case Intrinsic::amdgcn_global_atomic_csub:
5049     case Intrinsic::amdgcn_global_atomic_fmin_num:
5050     case Intrinsic::amdgcn_global_atomic_fmax_num:
5051     case Intrinsic::amdgcn_flat_atomic_fmin_num:
5052     case Intrinsic::amdgcn_flat_atomic_fmax_num:
5053     case Intrinsic::amdgcn_atomic_cond_sub_u32:
5054     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
5055     case Intrinsic::amdgcn_global_load_tr_b64:
5056     case Intrinsic::amdgcn_global_load_tr_b128:
5057     case Intrinsic::amdgcn_ds_read_tr4_b64:
5058     case Intrinsic::amdgcn_ds_read_tr6_b96:
5059     case Intrinsic::amdgcn_ds_read_tr8_b64:
5060     case Intrinsic::amdgcn_ds_read_tr16_b64:
5061       return getDefaultMappingAllVGPR(MI);
5062     case Intrinsic::amdgcn_ds_ordered_add:
5063     case Intrinsic::amdgcn_ds_ordered_swap: {
5064       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5065       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5066       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5067                                  AMDGPU::SGPRRegBankID);
5068       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
5069       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5070       break;
5071     }
5072     case Intrinsic::amdgcn_ds_append:
5073     case Intrinsic::amdgcn_ds_consume: {
5074       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5075       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5076       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5077       break;
5078     }
5079     case Intrinsic::amdgcn_exp_compr:
5080       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5081       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5082       break;
5083     case Intrinsic::amdgcn_exp:
5084       // FIXME: Could we support packed types here?
5085       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5086       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5087       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5088       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5089       break;
5090     case Intrinsic::amdgcn_exp_row:
5091       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5092       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5093       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5094       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5095       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
5096       break;
5097     case Intrinsic::amdgcn_s_sendmsg:
5098     case Intrinsic::amdgcn_s_sendmsghalt: {
5099       // This must be an SGPR, but accept a VGPR.
5100       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5101                                    AMDGPU::SGPRRegBankID);
5102       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5103       break;
5104     }
5105     case Intrinsic::amdgcn_s_setreg: {
5106       // This must be an SGPR, but accept a VGPR.
5107       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5108                                    AMDGPU::SGPRRegBankID);
5109       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5110       break;
5111     }
5112     case Intrinsic::amdgcn_s_ttracedata: {
5113       // This must be an SGPR, but accept a VGPR.
5114       unsigned Bank =
5115           getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
5116       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5117       break;
5118     }
5119     case Intrinsic::amdgcn_end_cf: {
5120       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5121       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5122       break;
5123     }
5124     case Intrinsic::amdgcn_else: {
5125       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5126       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5127       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
5128       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
5129       break;
5130     }
5131     case Intrinsic::amdgcn_init_whole_wave:
5132     case Intrinsic::amdgcn_live_mask: {
5133       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5134       break;
5135     }
5136     case Intrinsic::amdgcn_wqm_demote:
5137     case Intrinsic::amdgcn_kill: {
5138       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5139       break;
5140     }
5141     case Intrinsic::amdgcn_raw_buffer_load:
5142     case Intrinsic::amdgcn_raw_ptr_buffer_load:
5143     case Intrinsic::amdgcn_raw_atomic_buffer_load:
5144     case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
5145     case Intrinsic::amdgcn_raw_tbuffer_load:
5146     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
5147       // FIXME: Should make intrinsic ID the last operand of the instruction,
5148       // then this would be the same as store
5149       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5150       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5151       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5152       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5153       break;
5154     }
5155     case Intrinsic::amdgcn_raw_buffer_load_lds:
5156     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
5157       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5158       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5159       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5160       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5161       break;
5162     }
5163     case Intrinsic::amdgcn_raw_buffer_store:
5164     case Intrinsic::amdgcn_raw_ptr_buffer_store:
5165     case Intrinsic::amdgcn_raw_buffer_store_format:
5166     case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5167     case Intrinsic::amdgcn_raw_tbuffer_store:
5168     case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5169       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5170       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5171       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5172       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5173       break;
5174     }
5175     case Intrinsic::amdgcn_struct_buffer_load:
5176     case Intrinsic::amdgcn_struct_ptr_buffer_load:
5177     case Intrinsic::amdgcn_struct_tbuffer_load:
5178     case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
5179     case Intrinsic::amdgcn_struct_atomic_buffer_load:
5180     case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
5181       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5182       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5183       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5184       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5185       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5186       break;
5187     }
5188     case Intrinsic::amdgcn_struct_buffer_load_lds:
5189     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
5190       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5191       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5192       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5193       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5194       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5195       break;
5196     }
5197     case Intrinsic::amdgcn_struct_buffer_store:
5198     case Intrinsic::amdgcn_struct_ptr_buffer_store:
5199     case Intrinsic::amdgcn_struct_tbuffer_store:
5200     case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5201       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5202       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5203       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5204       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5205       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5206       break;
5207     }
5208     case Intrinsic::amdgcn_init_exec_from_input: {
5209       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5210       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5211       break;
5212     }
5213     case Intrinsic::amdgcn_ds_gws_init:
5214     case Intrinsic::amdgcn_ds_gws_barrier:
5215     case Intrinsic::amdgcn_ds_gws_sema_br: {
5216       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5217 
5218       // This must be an SGPR, but accept a VGPR.
5219       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5220                                    AMDGPU::SGPRRegBankID);
5221       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5222       break;
5223     }
5224     case Intrinsic::amdgcn_ds_gws_sema_v:
5225     case Intrinsic::amdgcn_ds_gws_sema_p:
5226     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5227       // This must be an SGPR, but accept a VGPR.
5228       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5229                                    AMDGPU::SGPRRegBankID);
5230       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5231       break;
5232     }
5233     case Intrinsic::amdgcn_global_load_lds: {
5234       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5235       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5236       break;
5237     }
5238     case Intrinsic::amdgcn_lds_direct_load: {
5239       const int M0Idx = MI.getNumOperands() - 1;
5240       Register M0Reg = MI.getOperand(M0Idx).getReg();
5241       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
5242       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5243 
5244       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5245       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
5246         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5247 
5248       // Must be SGPR, but we must take whatever the original bank is and fix it
5249       // later.
5250       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5251       break;
5252     }
5253     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5254     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5255       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5256       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5257       break;
5258     case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
5259       OpdsMapping[0] =
5260           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
5261       OpdsMapping[1] =
5262           getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
5263       OpdsMapping[3] =
5264           getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
5265       OpdsMapping[4] =
5266           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
5267       OpdsMapping[5] =
5268           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5269       break;
5270     }
5271     case Intrinsic::amdgcn_s_sleep_var:
5272       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5273       break;
5274     case Intrinsic::amdgcn_s_barrier_join:
5275       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5276       break;
5277     case Intrinsic::amdgcn_s_barrier_init:
5278     case Intrinsic::amdgcn_s_barrier_signal_var:
5279       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5280       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5281       break;
5282     case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
5283       const unsigned ResultSize = 1;
5284       OpdsMapping[0] =
5285           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5286       break;
5287     }
5288     case Intrinsic::amdgcn_s_get_barrier_state:
5289     case Intrinsic::amdgcn_s_get_named_barrier_state: {
5290       OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5291       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5292       break;
5293     }
5294     case Intrinsic::amdgcn_pops_exiting_wave_id:
5295       return getDefaultMappingSOP(MI);
5296     case Intrinsic::amdgcn_s_prefetch_data: {
5297       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5298       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5299       break;
5300     }
5301     default:
5302       return getInvalidInstructionMapping();
5303     }
5304     break;
5305   }
5306   case AMDGPU::G_SELECT: {
5307     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5308     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5309                                     AMDGPU::SGPRRegBankID);
5310     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
5311                                     AMDGPU::SGPRRegBankID);
5312     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5313                     Op3Bank == AMDGPU::SGPRRegBankID;
5314 
5315     unsigned CondBankDefault = SGPRSrcs ?
5316       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5317     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5318                                      CondBankDefault);
5319     if (CondBank == AMDGPU::SGPRRegBankID)
5320       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5321     else if (CondBank == AMDGPU::VGPRRegBankID)
5322       CondBank = AMDGPU::VCCRegBankID;
5323 
5324     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5325       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5326 
5327     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
5328 
5329     // TODO: Should report 32-bit for scalar condition type.
5330     if (Size == 64) {
5331       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5332       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5333       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5334       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5335     } else {
5336       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
5337       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5338       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
5339       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
5340     }
5341 
5342     break;
5343   }
5344 
5345   case AMDGPU::G_SI_CALL: {
5346     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5347     // Lie and claim everything is legal, even though some need to be
5348     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5349     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5350 
5351     // Allow anything for implicit arguments
5352     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5353       if (MI.getOperand(I).isReg()) {
5354         Register Reg = MI.getOperand(I).getReg();
5355         auto OpBank = getRegBankID(Reg, MRI);
5356         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5357         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5358       }
5359     }
5360     break;
5361   }
5362   case AMDGPU::G_LOAD:
5363   case AMDGPU::G_ZEXTLOAD:
5364   case AMDGPU::G_SEXTLOAD:
5365     return getInstrMappingForLoad(MI);
5366 
5367   case AMDGPU::G_ATOMICRMW_XCHG:
5368   case AMDGPU::G_ATOMICRMW_ADD:
5369   case AMDGPU::G_ATOMICRMW_SUB:
5370   case AMDGPU::G_ATOMICRMW_AND:
5371   case AMDGPU::G_ATOMICRMW_OR:
5372   case AMDGPU::G_ATOMICRMW_XOR:
5373   case AMDGPU::G_ATOMICRMW_MAX:
5374   case AMDGPU::G_ATOMICRMW_MIN:
5375   case AMDGPU::G_ATOMICRMW_UMAX:
5376   case AMDGPU::G_ATOMICRMW_UMIN:
5377   case AMDGPU::G_ATOMICRMW_FADD:
5378   case AMDGPU::G_ATOMICRMW_FMIN:
5379   case AMDGPU::G_ATOMICRMW_FMAX:
5380   case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5381   case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5382   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
5383     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5384     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5385     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5386     break;
5387   }
5388   case AMDGPU::G_ATOMIC_CMPXCHG: {
5389     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5390     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5391     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5392     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5393     break;
5394   }
5395   case AMDGPU::G_BRCOND: {
5396     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
5397                                  AMDGPU::SGPRRegBankID);
5398     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5399     if (Bank != AMDGPU::SGPRRegBankID)
5400       Bank = AMDGPU::VCCRegBankID;
5401 
5402     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
5403     break;
5404   }
5405   case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
5406     return getDefaultMappingVOP(MI);
5407   case AMDGPU::G_PREFETCH:
5408     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5409     break;
5410   }
5411 
5412   return getInstructionMapping(/*ID*/1, /*Cost*/1,
5413                                getOperandsMapping(OpdsMapping),
5414                                MI.getNumOperands());
5415 }
5416