xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (revision 0ee037b861f94604907d95d0ff0ff87805b52428)
1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include <optional>
31 
32 #define DEBUG_TYPE "amdgpu-isel"
33 
34 using namespace llvm;
35 using namespace MIPatternMatch;
36 
37 #define GET_GLOBALISEL_IMPL
38 #define AMDGPUSubtarget GCNSubtarget
39 #include "AMDGPUGenGlobalISel.inc"
40 #undef GET_GLOBALISEL_IMPL
41 #undef AMDGPUSubtarget
42 
43 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45     const AMDGPUTargetMachine &TM)
46     : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47       STI(STI),
48 #define GET_GLOBALISEL_PREDICATES_INIT
49 #include "AMDGPUGenGlobalISel.inc"
50 #undef GET_GLOBALISEL_PREDICATES_INIT
51 #define GET_GLOBALISEL_TEMPORARIES_INIT
52 #include "AMDGPUGenGlobalISel.inc"
53 #undef GET_GLOBALISEL_TEMPORARIES_INIT
54 {
55 }
56 
57 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58 
59 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
60                                         CodeGenCoverage *CoverageInfo,
61                                         ProfileSummaryInfo *PSI,
62                                         BlockFrequencyInfo *BFI) {
63   MRI = &MF.getRegInfo();
64   Subtarget = &MF.getSubtarget<GCNSubtarget>();
65   Subtarget->checkSubtargetFeatures(MF.getFunction());
66   InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67 }
68 
69 // Return the wave level SGPR base address if this is a wave address.
70 static Register getWaveAddress(const MachineInstr *Def) {
71   return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72              ? Def->getOperand(1).getReg()
73              : Register();
74 }
75 
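// Return true if Reg should be treated as a wave-wide boolean (VCC bank /
// wave mask class) rather than a plain scalar boolean. Only virtual s1
// registers qualify; for registers with an assigned class, a G_TRUNC result
// never counts.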
76 bool AMDGPUInstructionSelector::isVCC(Register Reg,
77                                       const MachineRegisterInfo &MRI) const {
78   // The verifier is oblivious to s1 being a valid value for wavesize registers.
79   if (Reg.isPhysical())
80     return false;
81 
82   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83   const TargetRegisterClass *RC =
84       dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
85   if (RC) {
86     const LLT Ty = MRI.getType(Reg);
87     if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88       return false;
89     // G_TRUNC s1 result is never vcc.
90     return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91            RC->hasSuperClassEq(TRI.getBoolRC());
92   }
93 
94   const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
95   return RB->getID() == AMDGPU::VCCRegBankID;
96 }
97 
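// Rewrite a copy-like intrinsic into the pseudo NewOpc: drop the intrinsic
// ID, add an implicit EXEC use, and constrain the source and destination to
// a common register class.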
98 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99                                                         unsigned NewOpc) const {
100   MI.setDesc(TII.get(NewOpc));
101   MI.removeOperand(1); // Remove intrinsic ID.
102   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103 
104   MachineOperand &Dst = MI.getOperand(0);
105   MachineOperand &Src = MI.getOperand(1);
106 
107   // TODO: This should be legalized to s32 if needed
108   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109     return false;
110 
111   const TargetRegisterClass *DstRC
112     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113   const TargetRegisterClass *SrcRC
114     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115   if (!DstRC || DstRC != SrcRC)
116     return false;
117 
118   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120 }
121 
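// Select a generic COPY. A copy into a VCC-bank boolean from a non-VCC,
// non-SCC source is expanded: a constant becomes an S_MOV of an all-ones or
// zero wave mask, anything else is masked to bit 0 and compared against
// zero with V_CMP_NE. Other copies are just constrained to legal classes.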
122 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123   const DebugLoc &DL = I.getDebugLoc();
124   MachineBasicBlock *BB = I.getParent();
125   I.setDesc(TII.get(TargetOpcode::COPY));
126 
127   const MachineOperand &Src = I.getOperand(1);
128   MachineOperand &Dst = I.getOperand(0);
129   Register DstReg = Dst.getReg();
130   Register SrcReg = Src.getReg();
131 
132   if (isVCC(DstReg, *MRI)) {
133     if (SrcReg == AMDGPU::SCC) {
134       const TargetRegisterClass *RC
135         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136       if (!RC)
137         return true;
138       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139     }
140 
141     if (!isVCC(SrcReg, *MRI)) {
142       // TODO: Should probably leave the copy and let copyPhysReg expand it.
143       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144         return false;
145 
146       const TargetRegisterClass *SrcRC
147         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148 
149       std::optional<ValueAndVReg> ConstVal =
150           getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151       if (ConstVal) {
152         unsigned MovOpc =
153             STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154         BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155             .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156       } else {
157         Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158 
159         // We can't trust the high bits at this point, so clear them.
160 
161         // TODO: Skip masking high bits if def is known boolean.
162 
163         if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164           assert(Subtarget->useRealTrue16Insts());
165           const int64_t NoMods = 0;
166           BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167               .addImm(NoMods)
168               .addImm(1)
169               .addImm(NoMods)
170               .addReg(SrcReg)
171               .addImm(NoMods);
172           BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173               .addImm(NoMods)
174               .addImm(0)
175               .addImm(NoMods)
176               .addReg(MaskedReg)
177               .addImm(NoMods);
178         } else {
179           bool IsSGPR = TRI.isSGPRClass(SrcRC);
180           unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181           auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
182                          .addImm(1)
183                          .addReg(SrcReg);
184           if (IsSGPR)
185             And.setOperandDead(3); // Dead scc
186 
187           BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188               .addImm(0)
189               .addReg(MaskedReg);
190         }
191       }
192 
193       if (!MRI->getRegClassOrNull(SrcReg))
194         MRI->setRegClass(SrcReg, SrcRC);
195       I.eraseFromParent();
196       return true;
197     }
198 
199     const TargetRegisterClass *RC =
200       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
201     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202       return false;
203 
204     return true;
205   }
206 
207   for (const MachineOperand &MO : I.operands()) {
208     if (MO.getReg().isPhysical())
209       continue;
210 
211     const TargetRegisterClass *RC =
212             TRI.getConstrainedRegClassForOperand(MO, *MRI);
213     if (!RC)
214       continue;
215     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216   }
217   return true;
218 }
219 
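// Convert a wave-mask (VCC bank) boolean into an SCC-style scalar boolean:
// compare the mask against zero with S_CMP_LG and copy SCC into the 32-bit
// result.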
220 bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
221   const DebugLoc &DL = I.getDebugLoc();
222   MachineBasicBlock *BB = I.getParent();
223 
224   unsigned CmpOpc =
225       STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
226   MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
227                           .addReg(I.getOperand(1).getReg())
228                           .addImm(0);
229   if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
230     return false;
231 
232   Register DstReg = I.getOperand(0).getReg();
233   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
234 
235   I.eraseFromParent();
236   return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
237 }
238 
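// Convert an SCC-style scalar boolean into a wave mask: a constant folds to
// 0 or a copy of EXEC, otherwise the value is copied into SCC and selected
// between EXEC and 0 with S_CSELECT.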
239 bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
240   const DebugLoc &DL = I.getDebugLoc();
241   MachineBasicBlock *BB = I.getParent();
242 
243   Register DstReg = I.getOperand(0).getReg();
244   Register SrcReg = I.getOperand(1).getReg();
245   std::optional<ValueAndVReg> Arg =
246       getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
247 
248   if (Arg) {
249     const int64_t Value = Arg->Value.getZExtValue();
250     if (Value == 0) {
251       unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
252       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
253     } else {
254       assert(Value == 1);
255       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
256     }
257     I.eraseFromParent();
258     return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
259   }
260 
261   // RegBankLegalize ensures that SrcReg holds a boolean in a register (high bits are 0).
262   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
263 
264   unsigned SelectOpcode =
265       STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
266   MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
267                              .addReg(TRI.getExec())
268                              .addImm(0);
269 
270   I.eraseFromParent();
271   return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
272 }
273 
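// Lower a read-any-lane by reading the first active lane with
// V_READFIRSTLANE_B32, which is a valid choice of "any" lane.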
274 bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
275   Register DstReg = I.getOperand(0).getReg();
276   Register SrcReg = I.getOperand(1).getReg();
277 
278   const DebugLoc &DL = I.getDebugLoc();
279   MachineBasicBlock *BB = I.getParent();
280 
281   auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
282                  .addReg(SrcReg);
283 
284   I.eraseFromParent();
285   return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
286 }
287 
288 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
289   const Register DefReg = I.getOperand(0).getReg();
290   const LLT DefTy = MRI->getType(DefReg);
291 
292   // S1 G_PHIs should not be selected in instruction-select, instead:
293   // - divergent S1 G_PHI should go through lane mask merging algorithm
294   //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
295   // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
296   if (DefTy == LLT::scalar(1))
297     return false;
298 
299   // TODO: Verify this doesn't have insane operands (e.g. VGPR to SGPR copy)
300 
301   const RegClassOrRegBank &RegClassOrBank =
302     MRI->getRegClassOrRegBank(DefReg);
303 
304   const TargetRegisterClass *DefRC =
305       dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
306   if (!DefRC) {
307     if (!DefTy.isValid()) {
308       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
309       return false;
310     }
311 
312     const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
313     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
314     if (!DefRC) {
315       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
316       return false;
317     }
318   }
319 
320   // If the inputs have a register bank, assign the corresponding reg class.
321   // Note: registers don't need to have the same reg bank.
322   for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
323     const Register SrcReg = I.getOperand(i).getReg();
324 
325     const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
326     if (RB) {
327       const LLT SrcTy = MRI->getType(SrcReg);
328       const TargetRegisterClass *SrcRC =
329           TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331         return false;
332     }
333   }
334 
335   I.setDesc(TII.get(TargetOpcode::PHI));
336   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
337 }
338 
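// Return the 32-bit half of a 64-bit operand selected by SubIdx (sub0/sub1).
// Register operands are copied through a new SubRC vreg using the composed
// subregister index; immediates are split into their low/high 32 bits.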
339 MachineOperand
340 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
341                                            const TargetRegisterClass &SubRC,
342                                            unsigned SubIdx) const {
343 
344   MachineInstr *MI = MO.getParent();
345   MachineBasicBlock *BB = MO.getParent()->getParent();
346   Register DstReg = MRI->createVirtualRegister(&SubRC);
347 
348   if (MO.isReg()) {
349     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
350     Register Reg = MO.getReg();
351     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
352             .addReg(Reg, 0, ComposedSubIdx);
353 
354     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
355                                      MO.isKill(), MO.isDead(), MO.isUndef(),
356                                      MO.isEarlyClobber(), 0, MO.isDebug(),
357                                      MO.isInternalRead());
358   }
359 
360   assert(MO.isImm());
361 
362   APInt Imm(64, MO.getImm());
363 
364   switch (SubIdx) {
365   default:
366     llvm_unreachable("do not know how to split immediate with this sub index.");
367   case AMDGPU::sub0:
368     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
369   case AMDGPU::sub1:
370     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
371   }
372 }
373 
374 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
375   switch (Opc) {
376   case AMDGPU::G_AND:
377     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
378   case AMDGPU::G_OR:
379     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
380   case AMDGPU::G_XOR:
381     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
382   default:
383     llvm_unreachable("not a bit op");
384   }
385 }
386 
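// Select SGPR and lane-mask (VCC bank) AND/OR/XOR onto the S_*_B32 or
// S_*_B64 forms, adding the dead implicit def of SCC.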
387 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
388   Register DstReg = I.getOperand(0).getReg();
389   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
390 
391   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
392   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
393       DstRB->getID() != AMDGPU::VCCRegBankID)
394     return false;
395 
396   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
397                             STI.isWave64());
398   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
399 
400   // Dead implicit-def of scc
401   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
402                                          true, // isImp
403                                          false, // isKill
404                                          true)); // isDead
405   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
406 }
407 
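// Select scalar add/sub. 32-bit values map directly onto the SALU or VALU
// add/sub; 64-bit adds are split into a low add whose carry feeds a high
// add-with-carry, then recombined with a REG_SEQUENCE.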
408 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
409   MachineBasicBlock *BB = I.getParent();
410   MachineFunction *MF = BB->getParent();
411   Register DstReg = I.getOperand(0).getReg();
412   const DebugLoc &DL = I.getDebugLoc();
413   LLT Ty = MRI->getType(DstReg);
414   if (Ty.isVector())
415     return false;
416 
417   unsigned Size = Ty.getSizeInBits();
418   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
419   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
420   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
421 
422   if (Size == 32) {
423     if (IsSALU) {
424       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
425       MachineInstr *Add =
426         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
427         .add(I.getOperand(1))
428         .add(I.getOperand(2))
429         .setOperandDead(3); // Dead scc
430       I.eraseFromParent();
431       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
432     }
433 
434     if (STI.hasAddNoCarry()) {
435       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
436       I.setDesc(TII.get(Opc));
437       I.addOperand(*MF, MachineOperand::CreateImm(0));
438       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
439       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
440     }
441 
442     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
443 
444     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
445     MachineInstr *Add
446       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
447       .addDef(UnusedCarry, RegState::Dead)
448       .add(I.getOperand(1))
449       .add(I.getOperand(2))
450       .addImm(0);
451     I.eraseFromParent();
452     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
453   }
454 
455   assert(!Sub && "illegal sub should not reach here");
456 
457   const TargetRegisterClass &RC
458     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
459   const TargetRegisterClass &HalfRC
460     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
461 
462   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
463   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
464   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
465   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
466 
467   Register DstLo = MRI->createVirtualRegister(&HalfRC);
468   Register DstHi = MRI->createVirtualRegister(&HalfRC);
469 
470   if (IsSALU) {
471     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
472       .add(Lo1)
473       .add(Lo2);
474     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
475       .add(Hi1)
476       .add(Hi2)
477       .setOperandDead(3); // Dead scc
478   } else {
479     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
480     Register CarryReg = MRI->createVirtualRegister(CarryRC);
481     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
482       .addDef(CarryReg)
483       .add(Lo1)
484       .add(Lo2)
485       .addImm(0);
486     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
487       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
488       .add(Hi1)
489       .add(Hi2)
490       .addReg(CarryReg, RegState::Kill)
491       .addImm(0);
492 
493     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
494       return false;
495   }
496 
497   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
498     .addReg(DstLo)
499     .addImm(AMDGPU::sub0)
500     .addReg(DstHi)
501     .addImm(AMDGPU::sub1);
502 
503 
504   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
505     return false;
506 
507   I.eraseFromParent();
508   return true;
509 }
510 
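// Select add/sub with carry-out (and optional carry-in). If the carry-out
// is a VCC lane mask, use the VALU carry-out/carry forms; otherwise use the
// SALU forms, routing the carry through SCC with explicit copies.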
511 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
512   MachineInstr &I) const {
513   MachineBasicBlock *BB = I.getParent();
514   MachineFunction *MF = BB->getParent();
515   const DebugLoc &DL = I.getDebugLoc();
516   Register Dst0Reg = I.getOperand(0).getReg();
517   Register Dst1Reg = I.getOperand(1).getReg();
518   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
519                      I.getOpcode() == AMDGPU::G_UADDE;
520   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
521                           I.getOpcode() == AMDGPU::G_USUBE;
522 
523   if (isVCC(Dst1Reg, *MRI)) {
524     unsigned NoCarryOpc =
525         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
526     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
527     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
528     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
529     I.addOperand(*MF, MachineOperand::CreateImm(0));
530     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
531   }
532 
533   Register Src0Reg = I.getOperand(2).getReg();
534   Register Src1Reg = I.getOperand(3).getReg();
535 
536   if (HasCarryIn) {
537     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
538       .addReg(I.getOperand(4).getReg());
539   }
540 
541   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
542   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
543 
544   auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
545     .add(I.getOperand(2))
546     .add(I.getOperand(3));
547 
548   if (MRI->use_nodbg_empty(Dst1Reg)) {
549     CarryInst.setOperandDead(3); // Dead scc
550   } else {
551     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
552       .addReg(AMDGPU::SCC);
553     if (!MRI->getRegClassOrNull(Dst1Reg))
554       MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
555   }
556 
557   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
558       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
559       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
560     return false;
561 
562   if (HasCarryIn &&
563       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
564                                     AMDGPU::SReg_32RegClass, *MRI))
565     return false;
566 
567   I.eraseFromParent();
568   return true;
569 }
570 
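// Select the 64-bit multiply-add pseudos onto V_MAD_U64_U32 / V_MAD_I64_I32,
// using the gfx11 encodings when the subtarget has the MAD intra-forwarding
// bug.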
571 bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
572     MachineInstr &I) const {
573   MachineBasicBlock *BB = I.getParent();
574   MachineFunction *MF = BB->getParent();
575   const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
576 
577   unsigned Opc;
578   if (Subtarget->hasMADIntraFwdBug())
579     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
580                      : AMDGPU::V_MAD_I64_I32_gfx11_e64;
581   else
582     Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
583   I.setDesc(TII.get(Opc));
584   I.addOperand(*MF, MachineOperand::CreateImm(0));
585   I.addImplicitDefUseOperands(*MF);
586   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
587 }
588 
589 // TODO: We should probably legalize these to only using 32-bit results.
590 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
591   MachineBasicBlock *BB = I.getParent();
592   Register DstReg = I.getOperand(0).getReg();
593   Register SrcReg = I.getOperand(1).getReg();
594   LLT DstTy = MRI->getType(DstReg);
595   LLT SrcTy = MRI->getType(SrcReg);
596   const unsigned SrcSize = SrcTy.getSizeInBits();
597   unsigned DstSize = DstTy.getSizeInBits();
598 
599   // TODO: Should handle any multiple of 32 offset.
600   unsigned Offset = I.getOperand(2).getImm();
601   if (Offset % 32 != 0 || DstSize > 128)
602     return false;
603 
604   // 16-bit operations really use 32-bit registers.
605   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
606   if (DstSize == 16)
607     DstSize = 32;
608 
609   const TargetRegisterClass *DstRC =
610     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
611   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
612     return false;
613 
614   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
615   const TargetRegisterClass *SrcRC =
616       TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
617   if (!SrcRC)
618     return false;
619   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
620                                                          DstSize / 32);
621   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
622   if (!SrcRC)
623     return false;
624 
625   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
626                                     *SrcRC, I.getOperand(1));
627   const DebugLoc &DL = I.getDebugLoc();
628   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
629     .addReg(SrcReg, 0, SubReg);
630 
631   I.eraseFromParent();
632   return true;
633 }
634 
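// Select G_MERGE_VALUES by assembling the 32-bit-or-wider pieces into the
// destination with a REG_SEQUENCE; narrower sources fall back to the
// imported TableGen patterns.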
635 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
636   MachineBasicBlock *BB = MI.getParent();
637   Register DstReg = MI.getOperand(0).getReg();
638   LLT DstTy = MRI->getType(DstReg);
639   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
640 
641   const unsigned SrcSize = SrcTy.getSizeInBits();
642   if (SrcSize < 32)
643     return selectImpl(MI, *CoverageInfo);
644 
645   const DebugLoc &DL = MI.getDebugLoc();
646   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
647   const unsigned DstSize = DstTy.getSizeInBits();
648   const TargetRegisterClass *DstRC =
649       TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
650   if (!DstRC)
651     return false;
652 
653   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
654   MachineInstrBuilder MIB =
655     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
656   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
657     MachineOperand &Src = MI.getOperand(I + 1);
658     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
659     MIB.addImm(SubRegs[I]);
660 
661     const TargetRegisterClass *SrcRC
662       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
663     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
664       return false;
665   }
666 
667   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
668     return false;
669 
670   MI.eraseFromParent();
671   return true;
672 }
673 
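// Select G_UNMERGE_VALUES as one subregister COPY per result.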
674 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
675   MachineBasicBlock *BB = MI.getParent();
676   const int NumDst = MI.getNumOperands() - 1;
677 
678   MachineOperand &Src = MI.getOperand(NumDst);
679 
680   Register SrcReg = Src.getReg();
681   Register DstReg0 = MI.getOperand(0).getReg();
682   LLT DstTy = MRI->getType(DstReg0);
683   LLT SrcTy = MRI->getType(SrcReg);
684 
685   const unsigned DstSize = DstTy.getSizeInBits();
686   const unsigned SrcSize = SrcTy.getSizeInBits();
687   const DebugLoc &DL = MI.getDebugLoc();
688   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
689 
690   const TargetRegisterClass *SrcRC =
691       TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
692   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
693     return false;
694 
695   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
696   // source, and this relies on the fact that the same subregister indices are
697   // used for both.
698   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
699   for (int I = 0, E = NumDst; I != E; ++I) {
700     MachineOperand &Dst = MI.getOperand(I);
701     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
702       .addReg(SrcReg, 0, SubRegs[I]);
703 
704     // Make sure the subregister index is valid for the source register.
705     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
706     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
707       return false;
708 
709     const TargetRegisterClass *DstRC =
710       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
711     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
712       return false;
713   }
714 
715   MI.eraseFromParent();
716   return true;
717 }
718 
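// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Wide-element vectors are
// handled like G_MERGE_VALUES; v2s16 results are folded to a single move
// when both sources are constant, and otherwise packed with S_PACK_* on the
// SALU or AND/LSHL_OR on the VALU.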
719 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
720   assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
721          MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
722 
723   Register Src0 = MI.getOperand(1).getReg();
724   Register Src1 = MI.getOperand(2).getReg();
725   LLT SrcTy = MRI->getType(Src0);
726   const unsigned SrcSize = SrcTy.getSizeInBits();
727 
728   // A BUILD_VECTOR with sources of >= 32 bits is selected like G_MERGE_VALUES.
729   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
730     return selectG_MERGE_VALUES(MI);
731   }
732 
733   // Selection logic below is for V2S16 only.
734   // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
735   Register Dst = MI.getOperand(0).getReg();
736   if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
737       (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
738        SrcTy != LLT::scalar(32)))
739     return selectImpl(MI, *CoverageInfo);
740 
741   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
742   if (DstBank->getID() == AMDGPU::AGPRRegBankID)
743     return false;
744 
745   assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
746          DstBank->getID() == AMDGPU::VGPRRegBankID);
747   const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
748 
749   const DebugLoc &DL = MI.getDebugLoc();
750   MachineBasicBlock *BB = MI.getParent();
751 
752   // First, before trying TableGen patterns, check if both sources are
753   // constants. In those cases, we can trivially compute the final constant
754   // and emit a simple move.
755   auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
756   if (ConstSrc1) {
757     auto ConstSrc0 =
758         getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
759     if (ConstSrc0) {
760       const int64_t K0 = ConstSrc0->Value.getSExtValue();
761       const int64_t K1 = ConstSrc1->Value.getSExtValue();
762       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
763       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
764       uint32_t Imm = Lo16 | (Hi16 << 16);
765 
766       // VALU
767       if (IsVector) {
768         BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
769         MI.eraseFromParent();
770         return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
771       }
772 
773       // SALU
774       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
775       MI.eraseFromParent();
776       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
777     }
778   }
779 
780   // Now try TableGen patterns.
781   if (selectImpl(MI, *CoverageInfo))
782     return true;
783 
784   // TODO: This should probably be a combine somewhere
785   // (build_vector $src0, undef)  -> copy $src0
786   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
787   if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
788     MI.setDesc(TII.get(AMDGPU::COPY));
789     MI.removeOperand(2);
790     const auto &RC =
791         IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
792     return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
793            RBI.constrainGenericRegister(Src0, RC, *MRI);
794   }
795 
796   // TODO: Can be improved?
797   if (IsVector) {
798     Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
799     auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
800                    .addImm(0xFFFF)
801                    .addReg(Src0);
802     if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
803       return false;
804 
805     MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
806               .addReg(Src1)
807               .addImm(16)
808               .addReg(TmpReg);
809     if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
810       return false;
811 
812     MI.eraseFromParent();
813     return true;
814   }
815 
816   Register ShiftSrc0;
817   Register ShiftSrc1;
818 
819   // With multiple uses of the shift, this will duplicate the shift and
820   // increase register pressure.
821   //
822   // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
823   //  => (S_PACK_HH_B32_B16 $src0, $src1)
824   // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
825   //  => (S_PACK_HL_B32_B16 $src0, $src1)
826   // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
827   //  => (S_PACK_LH_B32_B16 $src0, $src1)
828   // (build_vector $src0, $src1)
829   //  => (S_PACK_LL_B32_B16 $src0, $src1)
830 
831   bool Shift0 = mi_match(
832       Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
833 
834   bool Shift1 = mi_match(
835       Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
836 
837   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
838   if (Shift0 && Shift1) {
839     Opc = AMDGPU::S_PACK_HH_B32_B16;
840     MI.getOperand(1).setReg(ShiftSrc0);
841     MI.getOperand(2).setReg(ShiftSrc1);
842   } else if (Shift1) {
843     Opc = AMDGPU::S_PACK_LH_B32_B16;
844     MI.getOperand(2).setReg(ShiftSrc1);
845   } else if (Shift0) {
846     auto ConstSrc1 =
847         getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
848     if (ConstSrc1 && ConstSrc1->Value == 0) {
849       // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
850       auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
851                      .addReg(ShiftSrc0)
852                      .addImm(16)
853                      .setOperandDead(3); // Dead scc
854 
855       MI.eraseFromParent();
856       return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
857     }
858     if (STI.hasSPackHL()) {
859       Opc = AMDGPU::S_PACK_HL_B32_B16;
860       MI.getOperand(1).setReg(ShiftSrc0);
861     }
862   }
863 
864   MI.setDesc(TII.get(Opc));
865   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
866 }
867 
868 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
869   const MachineOperand &MO = I.getOperand(0);
870 
871   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
872   // regbank check here is to know why getConstrainedRegClassForOperand failed.
873   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
874   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
875       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
876     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
877     return true;
878   }
879 
880   return false;
881 }
882 
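// Select a 32-bit-aligned G_INSERT (inserted value of at most 128 bits) as
// an INSERT_SUBREG.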
883 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
884   MachineBasicBlock *BB = I.getParent();
885 
886   Register DstReg = I.getOperand(0).getReg();
887   Register Src0Reg = I.getOperand(1).getReg();
888   Register Src1Reg = I.getOperand(2).getReg();
889   LLT Src1Ty = MRI->getType(Src1Reg);
890 
891   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
892   unsigned InsSize = Src1Ty.getSizeInBits();
893 
894   int64_t Offset = I.getOperand(3).getImm();
895 
896   // FIXME: These cases should have been illegal and unnecessary to check here.
897   if (Offset % 32 != 0 || InsSize % 32 != 0)
898     return false;
899 
900   // Currently not handled by getSubRegFromChannel.
901   if (InsSize > 128)
902     return false;
903 
904   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
905   if (SubReg == AMDGPU::NoSubRegister)
906     return false;
907 
908   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
909   const TargetRegisterClass *DstRC =
910       TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
911   if (!DstRC)
912     return false;
913 
914   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
915   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
916   const TargetRegisterClass *Src0RC =
917       TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
918   const TargetRegisterClass *Src1RC =
919       TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
920 
921   // Deal with weird cases where the class only partially supports the subreg
922   // index.
923   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
924   if (!Src0RC || !Src1RC)
925     return false;
926 
927   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
928       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
929       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
930     return false;
931 
932   const DebugLoc &DL = I.getDebugLoc();
933   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
934     .addReg(Src0Reg)
935     .addReg(Src1Reg)
936     .addImm(SubReg);
937 
938   I.eraseFromParent();
939   return true;
940 }
941 
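// Select 32-bit VALU bitfield extract onto V_BFE_I32 / V_BFE_U32; the
// scalar and 64-bit forms are expanded earlier in regbankselect.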
942 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
943   Register DstReg = MI.getOperand(0).getReg();
944   Register SrcReg = MI.getOperand(1).getReg();
945   Register OffsetReg = MI.getOperand(2).getReg();
946   Register WidthReg = MI.getOperand(3).getReg();
947 
948   assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
949          "scalar BFX instructions are expanded in regbankselect");
950   assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
951          "64-bit vector BFX instructions are expanded in regbankselect");
952 
953   const DebugLoc &DL = MI.getDebugLoc();
954   MachineBasicBlock *MBB = MI.getParent();
955 
956   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
957   unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
958   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
959                  .addReg(SrcReg)
960                  .addReg(OffsetReg)
961                  .addReg(WidthReg);
962   MI.eraseFromParent();
963   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
964 }
965 
966 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
967   if (STI.getLDSBankCount() != 16)
968     return selectImpl(MI, *CoverageInfo);
969 
970   Register Dst = MI.getOperand(0).getReg();
971   Register Src0 = MI.getOperand(2).getReg();
972   Register M0Val = MI.getOperand(6).getReg();
973   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
974       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
975       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
976     return false;
977 
978   // This requires 2 instructions. It is possible to write a pattern to support
979   // this, but the generated isel emitter doesn't correctly deal with multiple
980   // output instructions using the same physical register input. The copy to m0
981   // is incorrectly placed before the second instruction.
982   //
983   // TODO: Match source modifiers.
984 
985   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
986   const DebugLoc &DL = MI.getDebugLoc();
987   MachineBasicBlock *MBB = MI.getParent();
988 
989   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
990     .addReg(M0Val);
991   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
992     .addImm(2)
993     .addImm(MI.getOperand(4).getImm())  // $attr
994     .addImm(MI.getOperand(3).getImm()); // $attrchan
995 
996   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
997     .addImm(0)                          // $src0_modifiers
998     .addReg(Src0)                       // $src0
999     .addImm(MI.getOperand(4).getImm())  // $attr
1000     .addImm(MI.getOperand(3).getImm())  // $attrchan
1001     .addImm(0)                          // $src2_modifiers
1002     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
1003     .addImm(MI.getOperand(5).getImm())  // $high
1004     .addImm(0)                          // $clamp
1005     .addImm(0);                         // $omod
1006 
1007   MI.eraseFromParent();
1008   return true;
1009 }
1010 
1011 // Writelane is special in that it can use an SGPR and M0 (which would normally
1012 // count as using the constant bus twice), but in this case it is allowed since
1013 // the lane selector doesn't count as a use of the constant bus. However, it is
1014 // still required to abide by the one-SGPR rule. Fix this up if we might have
1015 // multiple SGPRs.
1016 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1017   // With a constant bus limit of at least 2, there's no issue.
1018   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1019     return selectImpl(MI, *CoverageInfo);
1020 
1021   MachineBasicBlock *MBB = MI.getParent();
1022   const DebugLoc &DL = MI.getDebugLoc();
1023   Register VDst = MI.getOperand(0).getReg();
1024   Register Val = MI.getOperand(2).getReg();
1025   Register LaneSelect = MI.getOperand(3).getReg();
1026   Register VDstIn = MI.getOperand(4).getReg();
1027 
1028   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1029 
1030   std::optional<ValueAndVReg> ConstSelect =
1031       getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1032   if (ConstSelect) {
1033     // The selector has to be an inline immediate, so we can use whatever for
1034     // the other operands.
1035     MIB.addReg(Val);
1036     MIB.addImm(ConstSelect->Value.getSExtValue() &
1037                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1038   } else {
1039     std::optional<ValueAndVReg> ConstVal =
1040         getIConstantVRegValWithLookThrough(Val, *MRI);
1041 
1042     // If the value written is an inline immediate, we can get away without a
1043     // copy to m0.
1044     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1045                                                  STI.hasInv2PiInlineImm())) {
1046       MIB.addImm(ConstVal->Value.getSExtValue());
1047       MIB.addReg(LaneSelect);
1048     } else {
1049       MIB.addReg(Val);
1050 
1051       // If the lane selector was originally in a VGPR and copied with
1052       // readfirstlane, there's a hazard when the VALU reads that same SGPR
1053       // back. Constrain to a different SGPR to help avoid needing a nop later.
1054       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1055 
1056       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1057         .addReg(LaneSelect);
1058       MIB.addReg(AMDGPU::M0);
1059     }
1060   }
1061 
1062   MIB.addReg(VDstIn);
1063 
1064   MI.eraseFromParent();
1065   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1066 }
1067 
1068 // We need to handle this here because tablegen doesn't support matching
1069 // instructions with multiple outputs.
1070 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1071   Register Dst0 = MI.getOperand(0).getReg();
1072   Register Dst1 = MI.getOperand(1).getReg();
1073 
1074   LLT Ty = MRI->getType(Dst0);
1075   unsigned Opc;
1076   if (Ty == LLT::scalar(32))
1077     Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1078   else if (Ty == LLT::scalar(64))
1079     Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1080   else
1081     return false;
1082 
1083   // TODO: Match source modifiers.
1084 
1085   const DebugLoc &DL = MI.getDebugLoc();
1086   MachineBasicBlock *MBB = MI.getParent();
1087 
1088   Register Numer = MI.getOperand(3).getReg();
1089   Register Denom = MI.getOperand(4).getReg();
1090   unsigned ChooseDenom = MI.getOperand(5).getImm();
1091 
1092   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1093 
1094   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1095     .addDef(Dst1)
1096     .addImm(0)     // $src0_modifiers
1097     .addUse(Src0)  // $src0
1098     .addImm(0)     // $src1_modifiers
1099     .addUse(Denom) // $src1
1100     .addImm(0)     // $src2_modifiers
1101     .addUse(Numer) // $src2
1102     .addImm(0)     // $clamp
1103     .addImm(0);    // $omod
1104 
1105   MI.eraseFromParent();
1106   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1107 }
1108 
1109 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1110   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1111   switch (IntrinsicID) {
1112   case Intrinsic::amdgcn_if_break: {
1113     MachineBasicBlock *BB = I.getParent();
1114 
1115     // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1116     // SelectionDAG uses for wave32 vs wave64.
1117     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1118       .add(I.getOperand(0))
1119       .add(I.getOperand(2))
1120       .add(I.getOperand(3));
1121 
1122     Register DstReg = I.getOperand(0).getReg();
1123     Register Src0Reg = I.getOperand(2).getReg();
1124     Register Src1Reg = I.getOperand(3).getReg();
1125 
1126     I.eraseFromParent();
1127 
1128     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1129       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1130 
1131     return true;
1132   }
1133   case Intrinsic::amdgcn_interp_p1_f16:
1134     return selectInterpP1F16(I);
1135   case Intrinsic::amdgcn_wqm:
1136     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1137   case Intrinsic::amdgcn_softwqm:
1138     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1139   case Intrinsic::amdgcn_strict_wwm:
1140   case Intrinsic::amdgcn_wwm:
1141     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1142   case Intrinsic::amdgcn_strict_wqm:
1143     return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1144   case Intrinsic::amdgcn_writelane:
1145     return selectWritelane(I);
1146   case Intrinsic::amdgcn_div_scale:
1147     return selectDivScale(I);
1148   case Intrinsic::amdgcn_icmp:
1149   case Intrinsic::amdgcn_fcmp:
1150     if (selectImpl(I, *CoverageInfo))
1151       return true;
1152     return selectIntrinsicCmp(I);
1153   case Intrinsic::amdgcn_ballot:
1154     return selectBallot(I);
1155   case Intrinsic::amdgcn_reloc_constant:
1156     return selectRelocConstant(I);
1157   case Intrinsic::amdgcn_groupstaticsize:
1158     return selectGroupStaticSize(I);
1159   case Intrinsic::returnaddress:
1160     return selectReturnAddress(I);
1161   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1162   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1163   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1164   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1165   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1166   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1167   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1168   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1169   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1170   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1171   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1172   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1173   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1174   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1175   case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1176   case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1177   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1178   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1179   case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1180   case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1181   case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1182   case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1183   case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1184   case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1185   case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1186   case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1187   case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1188   case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1189     return selectSMFMACIntrin(I);
1190   case Intrinsic::amdgcn_permlane16_swap:
1191   case Intrinsic::amdgcn_permlane32_swap:
1192     return selectPermlaneSwapIntrin(I, IntrinsicID);
1193   default:
1194     return selectImpl(I, *CoverageInfo);
1195   }
1196 }
1197 
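// Map a compare predicate and operand size (16/32/64) to the corresponding
// VALU V_CMP opcode, choosing the 16-bit variant (plain, t16 or fake16)
// based on the subtarget's true16 support. Returns -1 if no suitable opcode
// exists.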
1198 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1199                           const GCNSubtarget &ST) {
1200   if (Size != 16 && Size != 32 && Size != 64)
1201     return -1;
1202 
1203   if (Size == 16 && !ST.has16BitInsts())
1204     return -1;
1205 
1206   const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1207                           unsigned FakeS16Opc, unsigned S32Opc,
1208                           unsigned S64Opc) {
1209     if (Size == 16)
1210       // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
1211       return ST.hasTrue16BitInsts()
1212                  ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
1213                  : S16Opc;
1214     if (Size == 32)
1215       return S32Opc;
1216     return S64Opc;
1217   };
1218 
1219   switch (P) {
1220   default:
1221     llvm_unreachable("Unknown condition code!");
1222   case CmpInst::ICMP_NE:
1223     return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1224                   AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1225                   AMDGPU::V_CMP_NE_U64_e64);
1226   case CmpInst::ICMP_EQ:
1227     return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1228                   AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1229                   AMDGPU::V_CMP_EQ_U64_e64);
1230   case CmpInst::ICMP_SGT:
1231     return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1232                   AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1233                   AMDGPU::V_CMP_GT_I64_e64);
1234   case CmpInst::ICMP_SGE:
1235     return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1236                   AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1237                   AMDGPU::V_CMP_GE_I64_e64);
1238   case CmpInst::ICMP_SLT:
1239     return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1240                   AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1241                   AMDGPU::V_CMP_LT_I64_e64);
1242   case CmpInst::ICMP_SLE:
1243     return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1244                   AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1245                   AMDGPU::V_CMP_LE_I64_e64);
1246   case CmpInst::ICMP_UGT:
1247     return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1248                   AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1249                   AMDGPU::V_CMP_GT_U64_e64);
1250   case CmpInst::ICMP_UGE:
1251     return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1252                   AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1253                   AMDGPU::V_CMP_GE_U64_e64);
1254   case CmpInst::ICMP_ULT:
1255     return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1256                   AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1257                   AMDGPU::V_CMP_LT_U64_e64);
1258   case CmpInst::ICMP_ULE:
1259     return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1260                   AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1261                   AMDGPU::V_CMP_LE_U64_e64);
1262 
1263   case CmpInst::FCMP_OEQ:
1264     return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1265                   AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1266                   AMDGPU::V_CMP_EQ_F64_e64);
1267   case CmpInst::FCMP_OGT:
1268     return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1269                   AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1270                   AMDGPU::V_CMP_GT_F64_e64);
1271   case CmpInst::FCMP_OGE:
1272     return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1273                   AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1274                   AMDGPU::V_CMP_GE_F64_e64);
1275   case CmpInst::FCMP_OLT:
1276     return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1277                   AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1278                   AMDGPU::V_CMP_LT_F64_e64);
1279   case CmpInst::FCMP_OLE:
1280     return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1281                   AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1282                   AMDGPU::V_CMP_LE_F64_e64);
1283   case CmpInst::FCMP_ONE:
1284     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1285                   AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1286                   AMDGPU::V_CMP_NEQ_F64_e64);
1287   case CmpInst::FCMP_ORD:
1288     return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1289                   AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1290                   AMDGPU::V_CMP_O_F64_e64);
1291   case CmpInst::FCMP_UNO:
1292     return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1293                   AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1294                   AMDGPU::V_CMP_U_F64_e64);
1295   case CmpInst::FCMP_UEQ:
1296     return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1297                   AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1298                   AMDGPU::V_CMP_NLG_F64_e64);
1299   case CmpInst::FCMP_UGT:
1300     return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1301                   AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1302                   AMDGPU::V_CMP_NLE_F64_e64);
1303   case CmpInst::FCMP_UGE:
1304     return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1305                   AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1306                   AMDGPU::V_CMP_NLT_F64_e64);
1307   case CmpInst::FCMP_ULT:
1308     return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1309                   AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1310                   AMDGPU::V_CMP_NGE_F64_e64);
1311   case CmpInst::FCMP_ULE:
1312     return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1313                   AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1314                   AMDGPU::V_CMP_NGT_F64_e64);
1315   case CmpInst::FCMP_UNE:
1316     return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1317                   AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1318                   AMDGPU::V_CMP_NEQ_F64_e64);
1319   case CmpInst::FCMP_TRUE:
1320     return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1321                   AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1322                   AMDGPU::V_CMP_TRU_F64_e64);
1323   case CmpInst::FCMP_FALSE:
1324     return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1325                   AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1326                   AMDGPU::V_CMP_F_F64_e64);
1327   }
1328 }
1329 
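// Map a compare predicate and operand size to an SALU S_CMP opcode, or -1
// if no scalar form exists (e.g. 64-bit compares other than EQ/NE, or
// 16-bit compares without SALU float instructions).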
1330 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1331                                               unsigned Size) const {
1332   if (Size == 64) {
1333     if (!STI.hasScalarCompareEq64())
1334       return -1;
1335 
1336     switch (P) {
1337     case CmpInst::ICMP_NE:
1338       return AMDGPU::S_CMP_LG_U64;
1339     case CmpInst::ICMP_EQ:
1340       return AMDGPU::S_CMP_EQ_U64;
1341     default:
1342       return -1;
1343     }
1344   }
1345 
1346   if (Size == 32) {
1347     switch (P) {
1348     case CmpInst::ICMP_NE:
1349       return AMDGPU::S_CMP_LG_U32;
1350     case CmpInst::ICMP_EQ:
1351       return AMDGPU::S_CMP_EQ_U32;
1352     case CmpInst::ICMP_SGT:
1353       return AMDGPU::S_CMP_GT_I32;
1354     case CmpInst::ICMP_SGE:
1355       return AMDGPU::S_CMP_GE_I32;
1356     case CmpInst::ICMP_SLT:
1357       return AMDGPU::S_CMP_LT_I32;
1358     case CmpInst::ICMP_SLE:
1359       return AMDGPU::S_CMP_LE_I32;
1360     case CmpInst::ICMP_UGT:
1361       return AMDGPU::S_CMP_GT_U32;
1362     case CmpInst::ICMP_UGE:
1363       return AMDGPU::S_CMP_GE_U32;
1364     case CmpInst::ICMP_ULT:
1365       return AMDGPU::S_CMP_LT_U32;
1366     case CmpInst::ICMP_ULE:
1367       return AMDGPU::S_CMP_LE_U32;
1368     case CmpInst::FCMP_OEQ:
1369       return AMDGPU::S_CMP_EQ_F32;
1370     case CmpInst::FCMP_OGT:
1371       return AMDGPU::S_CMP_GT_F32;
1372     case CmpInst::FCMP_OGE:
1373       return AMDGPU::S_CMP_GE_F32;
1374     case CmpInst::FCMP_OLT:
1375       return AMDGPU::S_CMP_LT_F32;
1376     case CmpInst::FCMP_OLE:
1377       return AMDGPU::S_CMP_LE_F32;
1378     case CmpInst::FCMP_ONE:
1379       return AMDGPU::S_CMP_LG_F32;
1380     case CmpInst::FCMP_ORD:
1381       return AMDGPU::S_CMP_O_F32;
1382     case CmpInst::FCMP_UNO:
1383       return AMDGPU::S_CMP_U_F32;
1384     case CmpInst::FCMP_UEQ:
1385       return AMDGPU::S_CMP_NLG_F32;
1386     case CmpInst::FCMP_UGT:
1387       return AMDGPU::S_CMP_NLE_F32;
1388     case CmpInst::FCMP_UGE:
1389       return AMDGPU::S_CMP_NLT_F32;
1390     case CmpInst::FCMP_ULT:
1391       return AMDGPU::S_CMP_NGE_F32;
1392     case CmpInst::FCMP_ULE:
1393       return AMDGPU::S_CMP_NGT_F32;
1394     case CmpInst::FCMP_UNE:
1395       return AMDGPU::S_CMP_NEQ_F32;
1396     default:
1397       llvm_unreachable("Unknown condition code!");
1398     }
1399   }
1400 
1401   if (Size == 16) {
1402     if (!STI.hasSALUFloatInsts())
1403       return -1;
1404 
1405     switch (P) {
1406     case CmpInst::FCMP_OEQ:
1407       return AMDGPU::S_CMP_EQ_F16;
1408     case CmpInst::FCMP_OGT:
1409       return AMDGPU::S_CMP_GT_F16;
1410     case CmpInst::FCMP_OGE:
1411       return AMDGPU::S_CMP_GE_F16;
1412     case CmpInst::FCMP_OLT:
1413       return AMDGPU::S_CMP_LT_F16;
1414     case CmpInst::FCMP_OLE:
1415       return AMDGPU::S_CMP_LE_F16;
1416     case CmpInst::FCMP_ONE:
1417       return AMDGPU::S_CMP_LG_F16;
1418     case CmpInst::FCMP_ORD:
1419       return AMDGPU::S_CMP_O_F16;
1420     case CmpInst::FCMP_UNO:
1421       return AMDGPU::S_CMP_U_F16;
1422     case CmpInst::FCMP_UEQ:
1423       return AMDGPU::S_CMP_NLG_F16;
1424     case CmpInst::FCMP_UGT:
1425       return AMDGPU::S_CMP_NLE_F16;
1426     case CmpInst::FCMP_UGE:
1427       return AMDGPU::S_CMP_NLT_F16;
1428     case CmpInst::FCMP_ULT:
1429       return AMDGPU::S_CMP_NGE_F16;
1430     case CmpInst::FCMP_ULE:
1431       return AMDGPU::S_CMP_NGT_F16;
1432     case CmpInst::FCMP_UNE:
1433       return AMDGPU::S_CMP_NEQ_F16;
1434     default:
1435       llvm_unreachable("Unknown condition code!");
1436     }
1437   }
1438 
1439   return -1;
1440 }
1441 
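// Select G_ICMP / G_FCMP. When the result is not a VCC boolean, emit an SALU
// S_CMP* that writes SCC and copy SCC into the 32-bit result; otherwise emit a
// VALU V_CMP* into a wave-mask register (a G_FCMP with a VCC result is
// rejected by this path).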
1442 bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1443 
1444   MachineBasicBlock *BB = I.getParent();
1445   const DebugLoc &DL = I.getDebugLoc();
1446 
1447   Register SrcReg = I.getOperand(2).getReg();
1448   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1449 
1450   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1451 
1452   Register CCReg = I.getOperand(0).getReg();
1453   if (!isVCC(CCReg, *MRI)) {
1454     int Opcode = getS_CMPOpcode(Pred, Size);
1455     if (Opcode == -1)
1456       return false;
1457     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1458             .add(I.getOperand(2))
1459             .add(I.getOperand(3));
1460     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1461       .addReg(AMDGPU::SCC);
1462     bool Ret =
1463         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1464         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1465     I.eraseFromParent();
1466     return Ret;
1467   }
1468 
1469   if (I.getOpcode() == AMDGPU::G_FCMP)
1470     return false;
1471 
1472   int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1473   if (Opcode == -1)
1474     return false;
1475 
1476   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1477             I.getOperand(0).getReg())
1478             .add(I.getOperand(2))
1479             .add(I.getOperand(3));
1480   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1481                                *TRI.getBoolRC(), *MRI);
1482   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1483   I.eraseFromParent();
1484   return Ret;
1485 }
1486 
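// Select the lane-mask compare intrinsics (amdgcn.icmp / amdgcn.fcmp), which
// produce a full wave-mask result. An invalid predicate immediate becomes an
// IMPLICIT_DEF; valid predicates are lowered to a VALU V_CMP* with source
// modifiers folded where possible.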
1487 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1488   Register Dst = I.getOperand(0).getReg();
1489   if (isVCC(Dst, *MRI))
1490     return false;
1491 
1492   LLT DstTy = MRI->getType(Dst);
1493   if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1494     return false;
1495 
1496   MachineBasicBlock *BB = I.getParent();
1497   const DebugLoc &DL = I.getDebugLoc();
1498   Register SrcReg = I.getOperand(2).getReg();
1499   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1500 
1501   // i1 inputs are not supported in GlobalISel.
1502   if (Size == 1)
1503     return false;
1504 
1505   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1506   if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1507     BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1508     I.eraseFromParent();
1509     return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1510   }
1511 
1512   const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1513   if (Opcode == -1)
1514     return false;
1515 
1516   MachineInstrBuilder SelectedMI;
1517   MachineOperand &LHS = I.getOperand(2);
1518   MachineOperand &RHS = I.getOperand(3);
1519   auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1520   auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1521   Register Src0Reg =
1522       copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1523   Register Src1Reg =
1524       copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1525   SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1526   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1527     SelectedMI.addImm(Src0Mods);
1528   SelectedMI.addReg(Src0Reg);
1529   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1530     SelectedMI.addImm(Src1Mods);
1531   SelectedMI.addReg(Src1Reg);
1532   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1533     SelectedMI.addImm(0); // clamp
1534   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1535     SelectedMI.addImm(0); // op_sel
1536 
1537   RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1538   if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1539     return false;
1540 
1541   I.eraseFromParent();
1542   return true;
1543 }
1544 
1545 // Ballot has to zero the bits in the input lane mask that are zero in the
1546 // current exec, which is done as an AND with exec. For inputs produced by an
1547 // instruction that already implicitly uses the same exec (for example, a
1548 // compare in the same basic block, or an SCC-to-VCC copy), a plain copy is used.
1549 static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1550                                     MachineBasicBlock *MBB) {
1551   MachineInstr *MI = MRI.getVRegDef(Reg);
1552   if (MI->getParent() != MBB)
1553     return false;
1554 
1555   // Lane mask generated by an SCC-to-VCC copy.
1556   if (MI->getOpcode() == AMDGPU::COPY) {
1557     auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1558     auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1559     if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1560         SrcRB->getID() == AMDGPU::SGPRRegBankID)
1561       return true;
1562   }
1563 
1564   // Lane mask generated by a compare that uses the same exec.
1565   if (isa<GAnyCmp>(MI))
1566     return true;
1567 
1568   Register LHS, RHS;
1569   // Look through AND.
1570   if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1571     return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1572            isLaneMaskFromSameBlock(RHS, MRI, MBB);
1573 
1574   return false;
1575 }
1576 
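// Select the ballot intrinsic. A constant input folds to S_MOV 0 or a copy of
// exec; a lane mask already produced under the same exec in this block is
// copied directly; anything else is ANDed with exec. An i64 ballot on wave32
// computes a 32-bit ballot and zero-extends it via REG_SEQUENCE.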
1577 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1578   MachineBasicBlock *BB = I.getParent();
1579   const DebugLoc &DL = I.getDebugLoc();
1580   Register DstReg = I.getOperand(0).getReg();
1581   Register SrcReg = I.getOperand(2).getReg();
1582   const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1583   const unsigned WaveSize = STI.getWavefrontSize();
1584 
1585   // In the common case, the return type matches the wave size.
1586   // However we also support emitting i64 ballots in wave32 mode.
1587   if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1588     return false;
1589 
1590   std::optional<ValueAndVReg> Arg =
1591       getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1592 
1593   Register Dst = DstReg;
1594   // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1595   if (BallotSize != WaveSize) {
1596     Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1597   }
1598 
1599   if (Arg) {
1600     const int64_t Value = Arg->Value.getZExtValue();
1601     if (Value == 0) {
1602       // Dst = S_MOV 0
1603       unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1604       BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1605     } else {
1606       // Dst = COPY EXEC
1607       assert(Value == 1);
1608       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1609     }
1610     if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1611       return false;
1612   } else {
1613     if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1614       // Dst = COPY SrcReg
1615       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1616       if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1617         return false;
1618     } else {
1619       // Dst = S_AND SrcReg, EXEC
1620       unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1621       auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1622                      .addReg(SrcReg)
1623                      .addReg(TRI.getExec())
1624                      .setOperandDead(3); // Dead scc
1625       if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1626         return false;
1627     }
1628   }
1629 
1630   // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1631   if (BallotSize != WaveSize) {
1632     Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1633     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1634     BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1635         .addReg(Dst)
1636         .addImm(AMDGPU::sub0)
1637         .addReg(HiReg)
1638         .addImm(AMDGPU::sub1);
1639   }
1640 
1641   I.eraseFromParent();
1642   return true;
1643 }
1644 
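// Materialize a 32-bit relocatable constant: create (or reuse) an external i32
// global named by the attached metadata string, then emit S_MOV_B32 or
// V_MOV_B32 of its abs32_lo address depending on the destination bank.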
1645 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1646   Register DstReg = I.getOperand(0).getReg();
1647   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1648   const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1649   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1650     return false;
1651 
1652   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1653 
1654   Module *M = MF->getFunction().getParent();
1655   const MDNode *Metadata = I.getOperand(2).getMetadata();
1656   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1657   auto *RelocSymbol = cast<GlobalVariable>(
1658       M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1659 
1660   MachineBasicBlock *BB = I.getParent();
1661   BuildMI(*BB, &I, I.getDebugLoc(),
1662           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1663     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1664 
1665   I.eraseFromParent();
1666   return true;
1667 }
1668 
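// Lower the groupstaticsize intrinsic. On AMDHSA/AMDPAL the LDS size is
// already known here and becomes an immediate; otherwise an abs32_lo reference
// to the intrinsic declaration is emitted so the value can be filled in later.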
1669 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1670   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1671 
1672   Register DstReg = I.getOperand(0).getReg();
1673   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1674   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1675     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1676 
1677   MachineBasicBlock *MBB = I.getParent();
1678   const DebugLoc &DL = I.getDebugLoc();
1679 
1680   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1681 
1682   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1683     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1684     MIB.addImm(MFI->getLDSSize());
1685   } else {
1686     Module *M = MF->getFunction().getParent();
1687     const GlobalValue *GV =
1688         Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1689     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1690   }
1691 
1692   I.eraseFromParent();
1693   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1694 }
1695 
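// Select llvm.returnaddress. Non-zero depths and entry functions (which have
// no caller) simply return 0; otherwise the return address register pair is
// marked live-in and copied into the result.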
1696 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1697   MachineBasicBlock *MBB = I.getParent();
1698   MachineFunction &MF = *MBB->getParent();
1699   const DebugLoc &DL = I.getDebugLoc();
1700 
1701   MachineOperand &Dst = I.getOperand(0);
1702   Register DstReg = Dst.getReg();
1703   unsigned Depth = I.getOperand(2).getImm();
1704 
1705   const TargetRegisterClass *RC
1706     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1707   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1708       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1709     return false;
1710 
1711   // Check for kernel and shader functions
1712   if (Depth != 0 ||
1713       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1714     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1715       .addImm(0);
1716     I.eraseFromParent();
1717     return true;
1718   }
1719 
1720   MachineFrameInfo &MFI = MF.getFrameInfo();
1721   // There is a call to @llvm.returnaddress in this function
1722   MFI.setReturnAddressIsTaken(true);
1723 
1724   // Get the return address reg and mark it as an implicit live-in
1725   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1726   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1727                                              AMDGPU::SReg_64RegClass, DL);
1728   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1729     .addReg(LiveIn);
1730   I.eraseFromParent();
1731   return true;
1732 }
1733 
1734 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1735   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1736   // SelectionDAG uses for wave32 vs wave64.
1737   MachineBasicBlock *BB = MI.getParent();
1738   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1739       .add(MI.getOperand(1));
1740 
1741   Register Reg = MI.getOperand(1).getReg();
1742   MI.eraseFromParent();
1743 
1744   if (!MRI->getRegClassOrNull(Reg))
1745     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1746   return true;
1747 }
1748 
1749 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1750   MachineInstr &MI, Intrinsic::ID IntrID) const {
1751   MachineBasicBlock *MBB = MI.getParent();
1752   MachineFunction *MF = MBB->getParent();
1753   const DebugLoc &DL = MI.getDebugLoc();
1754 
1755   unsigned IndexOperand = MI.getOperand(7).getImm();
1756   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1757   bool WaveDone = MI.getOperand(9).getImm() != 0;
1758 
1759   if (WaveDone && !WaveRelease)
1760     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1761 
1762   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1763   IndexOperand &= ~0x3f;
1764   unsigned CountDw = 0;
1765 
1766   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1767     CountDw = (IndexOperand >> 24) & 0xf;
1768     IndexOperand &= ~(0xf << 24);
1769 
1770     if (CountDw < 1 || CountDw > 4) {
1771       report_fatal_error(
1772         "ds_ordered_count: dword count must be between 1 and 4");
1773     }
1774   }
1775 
1776   if (IndexOperand)
1777     report_fatal_error("ds_ordered_count: bad index operand");
1778 
1779   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1780   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1781 
1782   unsigned Offset0 = OrderedCountIndex << 2;
1783   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1784 
1785   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1786     Offset1 |= (CountDw - 1) << 6;
1787 
1788   if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1789     Offset1 |= ShaderType << 2;
1790 
1791   unsigned Offset = Offset0 | (Offset1 << 8);
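  // For illustration: an amdgcn.ds.ordered.add (Instruction = 0) with index 1,
  // wave_release set and wave_done clear, on a pre-GFX10 target with
  // ShaderType 0, gives Offset0 = 1 << 2 = 0x4 and Offset1 = 0x1, so
  // Offset = 0x4 | (0x1 << 8) = 0x104.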
1792 
1793   Register M0Val = MI.getOperand(2).getReg();
1794   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1795     .addReg(M0Val);
1796 
1797   Register DstReg = MI.getOperand(0).getReg();
1798   Register ValReg = MI.getOperand(3).getReg();
1799   MachineInstrBuilder DS =
1800     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1801       .addReg(ValReg)
1802       .addImm(Offset)
1803       .cloneMemRefs(MI);
1804 
1805   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1806     return false;
1807 
1808   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1809   MI.eraseFromParent();
1810   return Ret;
1811 }
1812 
1813 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1814   switch (IntrID) {
1815   case Intrinsic::amdgcn_ds_gws_init:
1816     return AMDGPU::DS_GWS_INIT;
1817   case Intrinsic::amdgcn_ds_gws_barrier:
1818     return AMDGPU::DS_GWS_BARRIER;
1819   case Intrinsic::amdgcn_ds_gws_sema_v:
1820     return AMDGPU::DS_GWS_SEMA_V;
1821   case Intrinsic::amdgcn_ds_gws_sema_br:
1822     return AMDGPU::DS_GWS_SEMA_BR;
1823   case Intrinsic::amdgcn_ds_gws_sema_p:
1824     return AMDGPU::DS_GWS_SEMA_P;
1825   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1826     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1827   default:
1828     llvm_unreachable("not a gws intrinsic");
1829   }
1830 }
1831 
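// Select the ds_gws_* intrinsics. The offset operand is split into a uniform
// base, which is shifted into bits [21:16] of m0, and a constant immediate on
// the instruction itself; a fully constant offset uses m0 = 0 instead.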
1832 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1833                                                      Intrinsic::ID IID) const {
1834   if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1835                         !STI.hasGWSSemaReleaseAll()))
1836     return false;
1837 
1838   // intrinsic ID, vsrc, offset
1839   const bool HasVSrc = MI.getNumOperands() == 3;
1840   assert(HasVSrc || MI.getNumOperands() == 2);
1841 
1842   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1843   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1844   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1845     return false;
1846 
1847   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1848   unsigned ImmOffset;
1849 
1850   MachineBasicBlock *MBB = MI.getParent();
1851   const DebugLoc &DL = MI.getDebugLoc();
1852 
1853   MachineInstr *Readfirstlane = nullptr;
1854 
1855   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1856   // incoming offset, in case there's an add of a constant. We'll have to put it
1857   // back later.
1858   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1859     Readfirstlane = OffsetDef;
1860     BaseOffset = OffsetDef->getOperand(1).getReg();
1861     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1862   }
1863 
1864   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1865     // If we have a constant offset, try to use the 0 in m0 as the base.
1866     // TODO: Look into changing the default m0 initialization value. If the
1867     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1868     // the immediate offset.
1869 
1870     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1871     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1872       .addImm(0);
1873   } else {
1874     std::tie(BaseOffset, ImmOffset) =
1875         AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1876 
1877     if (Readfirstlane) {
1878       // We have the constant offset now, so put the readfirstlane back on the
1879       // variable component.
1880       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1881         return false;
1882 
1883       Readfirstlane->getOperand(1).setReg(BaseOffset);
1884       BaseOffset = Readfirstlane->getOperand(0).getReg();
1885     } else {
1886       if (!RBI.constrainGenericRegister(BaseOffset,
1887                                         AMDGPU::SReg_32RegClass, *MRI))
1888         return false;
1889     }
1890 
1891     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1892     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1893       .addReg(BaseOffset)
1894       .addImm(16)
1895       .setOperandDead(3); // Dead scc
1896 
1897     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1898       .addReg(M0Base);
1899   }
1900 
1901   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1902   // offset field) % 64. Some versions of the programming guide omit the m0
1903   // part, or claim it's from offset 0.
1904   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1905 
1906   if (HasVSrc) {
1907     Register VSrc = MI.getOperand(1).getReg();
1908     MIB.addReg(VSrc);
1909 
1910     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1911       return false;
1912   }
1913 
1914   MIB.addImm(ImmOffset)
1915      .cloneMemRefs(MI);
1916 
1917   TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1918 
1919   MI.eraseFromParent();
1920   return true;
1921 }
1922 
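// Select ds_append / ds_consume. The pointer base is copied into m0, and a
// constant offset is folded into the offset field when isDSOffsetLegal allows
// it; the gds bit is set for region (GDS) address-space pointers.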
1923 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1924                                                       bool IsAppend) const {
1925   Register PtrBase = MI.getOperand(2).getReg();
1926   LLT PtrTy = MRI->getType(PtrBase);
1927   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1928 
1929   unsigned Offset;
1930   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1931 
1932   // TODO: Should this try to look through readfirstlane like GWS?
1933   if (!isDSOffsetLegal(PtrBase, Offset)) {
1934     PtrBase = MI.getOperand(2).getReg();
1935     Offset = 0;
1936   }
1937 
1938   MachineBasicBlock *MBB = MI.getParent();
1939   const DebugLoc &DL = MI.getDebugLoc();
1940   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1941 
1942   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1943     .addReg(PtrBase);
1944   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1945     return false;
1946 
1947   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1948     .addImm(Offset)
1949     .addImm(IsGDS ? -1 : 0)
1950     .cloneMemRefs(MI);
1951   MI.eraseFromParent();
1952   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1953 }
1954 
1955 bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1956   MachineFunction *MF = MI.getParent()->getParent();
1957   SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1958 
1959   MFInfo->setInitWholeWave();
1960   return selectImpl(MI, *CoverageInfo);
1961 }
1962 
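// Select the s_barrier family. When optimizing and the workgroup is known to
// fit in a single wave, s_barrier / s_barrier_wait degenerate to WAVE_BARRIER
// and s_barrier_signal is dropped entirely; on subtargets with split barriers,
// s_barrier expands to S_BARRIER_SIGNAL_IMM followed by S_BARRIER_WAIT.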
1963 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1964   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1965   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1966     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1967     if (WGSize <= STI.getWavefrontSize()) {
1968       // If the workgroup fits in a wave, remove s_barrier_signal and lower
1969       // s_barrier/s_barrier_wait to wave_barrier.
1970       if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1971           IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1972         MachineBasicBlock *MBB = MI.getParent();
1973         const DebugLoc &DL = MI.getDebugLoc();
1974         BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1975       }
1976       MI.eraseFromParent();
1977       return true;
1978     }
1979   }
1980 
1981   if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
1982     // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1983     MachineBasicBlock *MBB = MI.getParent();
1984     const DebugLoc &DL = MI.getDebugLoc();
1985     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1986         .addImm(AMDGPU::Barrier::WORKGROUP);
1987     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1988         .addImm(AMDGPU::Barrier::WORKGROUP);
1989     MI.eraseFromParent();
1990     return true;
1991   }
1992 
1993   return selectImpl(MI, *CoverageInfo);
1994 }
1995 
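// Decode the texfailctrl immediate: bit 0 is TFE and bit 1 is LWE. IsTexFail
// is set if any bit was set; returns false if unknown bits remain.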
1996 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1997                          bool &IsTexFail) {
1998   if (TexFailCtrl)
1999     IsTexFail = true;
2000 
2001   TFE = (TexFailCtrl & 0x1) != 0;
2002   TexFailCtrl &= ~(uint64_t)0x1;
2003   LWE = (TexFailCtrl & 0x2) != 0;
2004   TexFailCtrl &= ~(uint64_t)0x2;
2005 
2006   return TexFailCtrl == 0;
2007 }
2008 
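// Select an image intrinsic into a MIMG instruction, computing the data and
// address dword counts and picking an encoding (GFX6/8/90a/10/11/12, NSA or
// packed addresses) that supports them.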
2009 bool AMDGPUInstructionSelector::selectImageIntrinsic(
2010   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2011   MachineBasicBlock *MBB = MI.getParent();
2012   const DebugLoc &DL = MI.getDebugLoc();
2013 
2014   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2015     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
2016 
2017   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2018   unsigned IntrOpcode = Intr->BaseOpcode;
2019   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2020   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2021   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2022 
2023   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2024 
2025   Register VDataIn, VDataOut;
2026   LLT VDataTy;
2027   int NumVDataDwords = -1;
2028   bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2029                MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2030 
2031   bool Unorm;
2032   if (!BaseOpcode->Sampler)
2033     Unorm = true;
2034   else
2035     Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2036 
2037   bool TFE;
2038   bool LWE;
2039   bool IsTexFail = false;
2040   if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2041                     TFE, LWE, IsTexFail))
2042     return false;
2043 
2044   const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2045   const bool IsA16 = (Flags & 1) != 0;
2046   const bool IsG16 = (Flags & 2) != 0;
2047 
2048   // A16 implies 16-bit gradients if the subtarget doesn't support G16.
2049   if (IsA16 && !STI.hasG16() && !IsG16)
2050     return false;
2051 
2052   unsigned DMask = 0;
2053   unsigned DMaskLanes = 0;
2054 
2055   if (BaseOpcode->Atomic) {
2056     VDataOut = MI.getOperand(0).getReg();
2057     VDataIn = MI.getOperand(2).getReg();
2058     LLT Ty = MRI->getType(VDataIn);
2059 
2060     // Be careful to allow atomic swap on 16-bit element vectors.
2061     const bool Is64Bit = BaseOpcode->AtomicX2 ?
2062       Ty.getSizeInBits() == 128 :
2063       Ty.getSizeInBits() == 64;
2064 
2065     if (BaseOpcode->AtomicX2) {
2066       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2067 
2068       DMask = Is64Bit ? 0xf : 0x3;
2069       NumVDataDwords = Is64Bit ? 4 : 2;
2070     } else {
2071       DMask = Is64Bit ? 0x3 : 0x1;
2072       NumVDataDwords = Is64Bit ? 2 : 1;
2073     }
2074   } else {
2075     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2076     DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2077 
2078     if (BaseOpcode->Store) {
2079       VDataIn = MI.getOperand(1).getReg();
2080       VDataTy = MRI->getType(VDataIn);
2081       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2082     } else if (BaseOpcode->NoReturn) {
2083       NumVDataDwords = 0;
2084     } else {
2085       VDataOut = MI.getOperand(0).getReg();
2086       VDataTy = MRI->getType(VDataOut);
2087       NumVDataDwords = DMaskLanes;
2088 
2089       if (IsD16 && !STI.hasUnpackedD16VMem())
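      // With packed D16, two 16-bit components share a dword, so e.g. a
      // four-lane dmask needs only (4 + 1) / 2 = 2 data dwords.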
2090         NumVDataDwords = (DMaskLanes + 1) / 2;
2091     }
2092   }
2093 
2094   // Set G16 opcode
2095   if (Subtarget->hasG16() && IsG16) {
2096     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2097         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2098     assert(G16MappingInfo);
2099     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2100   }
2101 
2102   // TODO: Check this in verifier.
2103   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2104 
2105   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2106   if (BaseOpcode->Atomic)
2107     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2108   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2109                AMDGPU::CPol::VOLATILE))
2110     return false;
2111 
2112   int NumVAddrRegs = 0;
2113   int NumVAddrDwords = 0;
2114   for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2115     // Skip the $noregs and 0s inserted during legalization.
2116     MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2117     if (!AddrOp.isReg())
2118       continue; // XXX - Break?
2119 
2120     Register Addr = AddrOp.getReg();
2121     if (!Addr)
2122       break;
2123 
2124     ++NumVAddrRegs;
2125     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2126   }
2127 
2128   // The legalizer preprocessed the intrinsic arguments. If we aren't using
2129   // NSA, these should have been packed into a single value in the first
2130   // address register
2131   const bool UseNSA =
2132       NumVAddrRegs != 1 &&
2133       (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2134                                    : NumVAddrDwords == NumVAddrRegs);
2135   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2136     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2137     return false;
2138   }
2139 
2140   if (IsTexFail)
2141     ++NumVDataDwords;
2142 
2143   int Opcode = -1;
2144   if (IsGFX12Plus) {
2145     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2146                                    NumVDataDwords, NumVAddrDwords);
2147   } else if (IsGFX11Plus) {
2148     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2149                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
2150                                           : AMDGPU::MIMGEncGfx11Default,
2151                                    NumVDataDwords, NumVAddrDwords);
2152   } else if (IsGFX10Plus) {
2153     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2154                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
2155                                           : AMDGPU::MIMGEncGfx10Default,
2156                                    NumVDataDwords, NumVAddrDwords);
2157   } else {
2158     if (Subtarget->hasGFX90AInsts()) {
2159       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2160                                      NumVDataDwords, NumVAddrDwords);
2161       if (Opcode == -1) {
2162         LLVM_DEBUG(
2163             dbgs()
2164             << "requested image instruction is not supported on this GPU\n");
2165         return false;
2166       }
2167     }
2168     if (Opcode == -1 &&
2169         STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2170       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2171                                      NumVDataDwords, NumVAddrDwords);
2172     if (Opcode == -1)
2173       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2174                                      NumVDataDwords, NumVAddrDwords);
2175   }
2176   if (Opcode == -1)
2177     return false;
2178 
2179   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2180     .cloneMemRefs(MI);
2181 
2182   if (VDataOut) {
2183     if (BaseOpcode->AtomicX2) {
2184       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2185 
2186       Register TmpReg = MRI->createVirtualRegister(
2187         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2188       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2189 
2190       MIB.addDef(TmpReg);
2191       if (!MRI->use_empty(VDataOut)) {
2192         BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2193             .addReg(TmpReg, RegState::Kill, SubReg);
2194       }
2195 
2196     } else {
2197       MIB.addDef(VDataOut); // vdata output
2198     }
2199   }
2200 
2201   if (VDataIn)
2202     MIB.addReg(VDataIn); // vdata input
2203 
2204   for (int I = 0; I != NumVAddrRegs; ++I) {
2205     MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2206     if (SrcOp.isReg()) {
2207       assert(SrcOp.getReg() != 0);
2208       MIB.addReg(SrcOp.getReg());
2209     }
2210   }
2211 
2212   MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2213   if (BaseOpcode->Sampler)
2214     MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2215 
2216   MIB.addImm(DMask); // dmask
2217 
2218   if (IsGFX10Plus)
2219     MIB.addImm(DimInfo->Encoding);
2220   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2221     MIB.addImm(Unorm);
2222 
2223   MIB.addImm(CPol);
2224   MIB.addImm(IsA16 &&  // a16 or r128
2225              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2226   if (IsGFX10Plus)
2227     MIB.addImm(IsA16 ? -1 : 0);
2228 
2229   if (!Subtarget->hasGFX90AInsts()) {
2230     MIB.addImm(TFE); // tfe
2231   } else if (TFE) {
2232     LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2233     return false;
2234   }
2235 
2236   if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2237     MIB.addImm(LWE); // lwe
2238   if (!IsGFX10Plus)
2239     MIB.addImm(DimInfo->DA ? -1 : 0);
2240   if (BaseOpcode->HasD16)
2241     MIB.addImm(IsD16 ? -1 : 0);
2242 
2243   MI.eraseFromParent();
2244   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2245   TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2246   return true;
2247 }
2248 
2249 // We need to handle this here because tablegen doesn't support matching
2250 // instructions with multiple outputs.
2251 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2252     MachineInstr &MI) const {
2253   Register Dst0 = MI.getOperand(0).getReg();
2254   Register Dst1 = MI.getOperand(1).getReg();
2255 
2256   const DebugLoc &DL = MI.getDebugLoc();
2257   MachineBasicBlock *MBB = MI.getParent();
2258 
2259   Register Addr = MI.getOperand(3).getReg();
2260   Register Data0 = MI.getOperand(4).getReg();
2261   Register Data1 = MI.getOperand(5).getReg();
2262   unsigned Offset = MI.getOperand(6).getImm();
2263 
2264   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2265                  .addDef(Dst1)
2266                  .addUse(Addr)
2267                  .addUse(Data0)
2268                  .addUse(Data1)
2269                  .addImm(Offset)
2270                  .cloneMemRefs(MI);
2271 
2272   MI.eraseFromParent();
2273   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2274 }
2275 
2276 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2277     MachineInstr &I) const {
2278   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2279   switch (IntrinsicID) {
2280   case Intrinsic::amdgcn_end_cf:
2281     return selectEndCfIntrinsic(I);
2282   case Intrinsic::amdgcn_ds_ordered_add:
2283   case Intrinsic::amdgcn_ds_ordered_swap:
2284     return selectDSOrderedIntrinsic(I, IntrinsicID);
2285   case Intrinsic::amdgcn_ds_gws_init:
2286   case Intrinsic::amdgcn_ds_gws_barrier:
2287   case Intrinsic::amdgcn_ds_gws_sema_v:
2288   case Intrinsic::amdgcn_ds_gws_sema_br:
2289   case Intrinsic::amdgcn_ds_gws_sema_p:
2290   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2291     return selectDSGWSIntrinsic(I, IntrinsicID);
2292   case Intrinsic::amdgcn_ds_append:
2293     return selectDSAppendConsume(I, true);
2294   case Intrinsic::amdgcn_ds_consume:
2295     return selectDSAppendConsume(I, false);
2296   case Intrinsic::amdgcn_init_whole_wave:
2297     return selectInitWholeWave(I);
2298   case Intrinsic::amdgcn_s_barrier:
2299   case Intrinsic::amdgcn_s_barrier_signal:
2300   case Intrinsic::amdgcn_s_barrier_wait:
2301     return selectSBarrier(I);
2302   case Intrinsic::amdgcn_raw_buffer_load_lds:
2303   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2304   case Intrinsic::amdgcn_struct_buffer_load_lds:
2305   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2306     return selectBufferLoadLds(I);
2307   case Intrinsic::amdgcn_global_load_lds:
2308     return selectGlobalLoadLds(I);
2309   case Intrinsic::amdgcn_exp_compr:
2310     if (!STI.hasCompressedExport()) {
2311       Function &F = I.getMF()->getFunction();
2312       DiagnosticInfoUnsupported NoFpRet(
2313           F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2314       F.getContext().diagnose(NoFpRet);
2315       return false;
2316     }
2317     break;
2318   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2319     return selectDSBvhStackIntrinsic(I);
2320   case Intrinsic::amdgcn_s_barrier_init:
2321   case Intrinsic::amdgcn_s_barrier_signal_var:
2322     return selectNamedBarrierInit(I, IntrinsicID);
2323   case Intrinsic::amdgcn_s_barrier_join:
2324   case Intrinsic::amdgcn_s_get_named_barrier_state:
2325     return selectNamedBarrierInst(I, IntrinsicID);
2326   case Intrinsic::amdgcn_s_get_barrier_state:
2327     return selectSGetBarrierState(I, IntrinsicID);
2328   case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2329     return selectSBarrierSignalIsfirst(I, IntrinsicID);
2330   }
2331   return selectImpl(I, *CoverageInfo);
2332 }
2333 
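// Select G_SELECT. A scalar condition is copied into SCC to feed
// S_CSELECT_B32/B64; a VCC condition becomes V_CNDMASK_B32_e64. Wide VGPR
// selects are expected to have been split in RegBankSelect.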
2334 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2335   if (selectImpl(I, *CoverageInfo))
2336     return true;
2337 
2338   MachineBasicBlock *BB = I.getParent();
2339   const DebugLoc &DL = I.getDebugLoc();
2340 
2341   Register DstReg = I.getOperand(0).getReg();
2342   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2343   assert(Size <= 32 || Size == 64);
2344   const MachineOperand &CCOp = I.getOperand(1);
2345   Register CCReg = CCOp.getReg();
2346   if (!isVCC(CCReg, *MRI)) {
2347     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2348                                          AMDGPU::S_CSELECT_B32;
2349     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2350             .addReg(CCReg);
2351 
2352     // The generic constrainSelectedInstRegOperands doesn't work for the SCC
2353     // register bank, because it does not cover the register class we use to
2354     // represent it, so we need to manually set the register class here.
2355     if (!MRI->getRegClassOrNull(CCReg))
2356         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2357     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2358             .add(I.getOperand(2))
2359             .add(I.getOperand(3));
2360 
2361     bool Ret = false;
2362     Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2363     Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2364     I.eraseFromParent();
2365     return Ret;
2366   }
2367 
2368   // Wide VGPR select should have been split in RegBankSelect.
2369   if (Size > 32)
2370     return false;
2371 
2372   MachineInstr *Select =
2373       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2374               .addImm(0)
2375               .add(I.getOperand(3))
2376               .addImm(0)
2377               .add(I.getOperand(2))
2378               .add(I.getOperand(1));
2379 
2380   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2381   I.eraseFromParent();
2382   return Ret;
2383 }
2384 
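// Select G_TRUNC. Most truncates become a COPY with a subregister index; the
// v2s32 -> v2s16 case packs the two low halves into a single register, using
// SDWA when available and shift/and/or otherwise.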
2385 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2386   Register DstReg = I.getOperand(0).getReg();
2387   Register SrcReg = I.getOperand(1).getReg();
2388   const LLT DstTy = MRI->getType(DstReg);
2389   const LLT SrcTy = MRI->getType(SrcReg);
2390   const LLT S1 = LLT::scalar(1);
2391 
2392   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2393   const RegisterBank *DstRB;
2394   if (DstTy == S1) {
2395     // This is a special case. We don't treat s1 for legalization artifacts as
2396     // vcc booleans.
2397     DstRB = SrcRB;
2398   } else {
2399     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2400     if (SrcRB != DstRB)
2401       return false;
2402   }
2403 
2404   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2405 
2406   unsigned DstSize = DstTy.getSizeInBits();
2407   unsigned SrcSize = SrcTy.getSizeInBits();
2408 
2409   const TargetRegisterClass *SrcRC =
2410       TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2411   const TargetRegisterClass *DstRC =
2412       TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2413   if (!SrcRC || !DstRC)
2414     return false;
2415 
2416   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2417       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2418     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2419     return false;
2420   }
2421 
2422   if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2423     assert(STI.useRealTrue16Insts());
2424     const DebugLoc &DL = I.getDebugLoc();
2425     MachineBasicBlock *MBB = I.getParent();
2426     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2427         .addReg(SrcReg, 0, AMDGPU::lo16);
2428     I.eraseFromParent();
2429     return true;
2430   }
2431 
2432   if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2433     MachineBasicBlock *MBB = I.getParent();
2434     const DebugLoc &DL = I.getDebugLoc();
2435 
2436     Register LoReg = MRI->createVirtualRegister(DstRC);
2437     Register HiReg = MRI->createVirtualRegister(DstRC);
2438     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2439       .addReg(SrcReg, 0, AMDGPU::sub0);
2440     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2441       .addReg(SrcReg, 0, AMDGPU::sub1);
2442 
2443     if (IsVALU && STI.hasSDWA()) {
2444       // Write the low 16-bits of the high element into the high 16-bits of the
2445       // low element.
2446       MachineInstr *MovSDWA =
2447         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2448         .addImm(0)                             // $src0_modifiers
2449         .addReg(HiReg)                         // $src0
2450         .addImm(0)                             // $clamp
2451         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2452         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2453         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2454         .addReg(LoReg, RegState::Implicit);
2455       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2456     } else {
2457       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2458       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2459       Register ImmReg = MRI->createVirtualRegister(DstRC);
2460       if (IsVALU) {
2461         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2462           .addImm(16)
2463           .addReg(HiReg);
2464       } else {
2465         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2466           .addReg(HiReg)
2467           .addImm(16)
2468           .setOperandDead(3); // Dead scc
2469       }
2470 
2471       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2472       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2473       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2474 
2475       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2476         .addImm(0xffff);
2477       auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2478         .addReg(LoReg)
2479         .addReg(ImmReg);
2480       auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2481         .addReg(TmpReg0)
2482         .addReg(TmpReg1);
2483 
2484       if (!IsVALU) {
2485         And.setOperandDead(3); // Dead scc
2486         Or.setOperandDead(3); // Dead scc
2487       }
2488     }
2489 
2490     I.eraseFromParent();
2491     return true;
2492   }
2493 
2494   if (!DstTy.isScalar())
2495     return false;
2496 
2497   if (SrcSize > 32) {
2498     unsigned SubRegIdx =
2499         DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2500     if (SubRegIdx == AMDGPU::NoSubRegister)
2501       return false;
2502 
2503     // Deal with weird cases where the class only partially supports the subreg
2504     // index.
2505     const TargetRegisterClass *SrcWithSubRC
2506       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2507     if (!SrcWithSubRC)
2508       return false;
2509 
2510     if (SrcWithSubRC != SrcRC) {
2511       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2512         return false;
2513     }
2514 
2515     I.getOperand(1).setSubReg(SubRegIdx);
2516   }
2517 
2518   I.setDesc(TII.get(TargetOpcode::COPY));
2519   return true;
2520 }
2521 
2522 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
2523 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2524   Mask = maskTrailingOnes<unsigned>(Size);
2525   int SignedMask = static_cast<int>(Mask);
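  // [-16, 64] is the integer inline-constant range shared by SALU and VALU
  // encodings, so such a mask does not cost an extra literal dword.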
2526   return SignedMask >= -16 && SignedMask <= 64;
2527 }
2528 
2529 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2530 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2531   Register Reg, const MachineRegisterInfo &MRI,
2532   const TargetRegisterInfo &TRI) const {
2533   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2534   if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2535     return RB;
2536 
2537   // Ignore the type, since we don't use vcc in artifacts.
2538   if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2539     return &RBI.getRegBankFromRegClass(*RC, LLT());
2540   return nullptr;
2541 }
2542 
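// Select G_SEXT, G_ZEXT, G_ANYEXT and G_SEXT_INREG. VGPR sources use V_BFE (or
// a V_AND when the mask is an inline constant); SGPR sources use
// S_SEXT_I32_I8/I16, S_BFE, or S_AND, with 64-bit results assembled by a
// REG_SEQUENCE of the low half and a computed or undef high half.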
2543 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2544   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2545   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2546   const DebugLoc &DL = I.getDebugLoc();
2547   MachineBasicBlock &MBB = *I.getParent();
2548   const Register DstReg = I.getOperand(0).getReg();
2549   const Register SrcReg = I.getOperand(1).getReg();
2550 
2551   const LLT DstTy = MRI->getType(DstReg);
2552   const LLT SrcTy = MRI->getType(SrcReg);
2553   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2554     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2555   const unsigned DstSize = DstTy.getSizeInBits();
2556   if (!DstTy.isScalar())
2557     return false;
2558 
2559   // Artifact casts should never use vcc.
2560   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2561 
2562   // FIXME: This should probably be illegal and split earlier.
2563   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2564     if (DstSize <= 32)
2565       return selectCOPY(I);
2566 
2567     const TargetRegisterClass *SrcRC =
2568         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2569     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2570     const TargetRegisterClass *DstRC =
2571         TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2572 
2573     Register UndefReg = MRI->createVirtualRegister(SrcRC);
2574     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2575     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2576       .addReg(SrcReg)
2577       .addImm(AMDGPU::sub0)
2578       .addReg(UndefReg)
2579       .addImm(AMDGPU::sub1);
2580     I.eraseFromParent();
2581 
2582     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2583            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2584   }
2585 
2586   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2587     // 64-bit should have been split up in RegBankSelect
2588 
2589     // Try to use an and with a mask if it will save code size.
2590     unsigned Mask;
2591     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2592       MachineInstr *ExtI =
2593       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2594         .addImm(Mask)
2595         .addReg(SrcReg);
2596       I.eraseFromParent();
2597       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2598     }
2599 
2600     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2601     MachineInstr *ExtI =
2602       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2603       .addReg(SrcReg)
2604       .addImm(0) // Offset
2605       .addImm(SrcSize); // Width
2606     I.eraseFromParent();
2607     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2608   }
2609 
2610   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2611     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2612       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2613     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2614       return false;
2615 
2616     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2617       const unsigned SextOpc = SrcSize == 8 ?
2618         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2619       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2620         .addReg(SrcReg);
2621       I.eraseFromParent();
2622       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2623     }
2624 
2625     // Using a single 32-bit SALU to calculate the high half is smaller than
2626     // S_BFE with a literal constant operand.
2627     if (DstSize > 32 && SrcSize == 32) {
2628       Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2629       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2630       if (Signed) {
2631         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2632           .addReg(SrcReg, 0, SubReg)
2633           .addImm(31)
2634           .setOperandDead(3); // Dead scc
2635       } else {
2636         BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2637           .addImm(0);
2638       }
2639       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2640         .addReg(SrcReg, 0, SubReg)
2641         .addImm(AMDGPU::sub0)
2642         .addReg(HiReg)
2643         .addImm(AMDGPU::sub1);
2644       I.eraseFromParent();
2645       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2646                                           *MRI);
2647     }
2648 
2649     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2650     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2651 
2652     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
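    // e.g. a zero-offset field of SrcSize = 16 bits is encoded below as
    // 16 << 16 = 0x100000.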
2653     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2654       // We need a 64-bit register source, but the high bits don't matter.
2655       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2656       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2657       unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2658 
2659       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2660       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2661         .addReg(SrcReg, 0, SubReg)
2662         .addImm(AMDGPU::sub0)
2663         .addReg(UndefReg)
2664         .addImm(AMDGPU::sub1);
2665 
2666       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2667         .addReg(ExtReg)
2668         .addImm(SrcSize << 16);
2669 
2670       I.eraseFromParent();
2671       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2672     }
2673 
2674     unsigned Mask;
2675     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2676       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2677         .addReg(SrcReg)
2678         .addImm(Mask)
2679         .setOperandDead(3); // Dead scc
2680     } else {
2681       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2682         .addReg(SrcReg)
2683         .addImm(SrcSize << 16);
2684     }
2685 
2686     I.eraseFromParent();
2687     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2688   }
2689 
2690   return false;
2691 }
2692 
2693 static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2694   return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2695 }
2696 
2697 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2698   Register BitcastSrc;
2699   if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2700     Reg = BitcastSrc;
2701   return Reg;
2702 }
2703 
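// Return true if In extracts the high 16-bit half of a 32-bit value, either as
// a trunc of a logical shift right by 16 or via a <2 x s16> shuffle selecting
// the high element; Out receives the matched source register.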
2704 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2705                            Register &Out) {
2706   Register Trunc;
2707   if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2708     return false;
2709 
2710   Register LShlSrc;
2711   Register Cst;
2712   if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2713     Cst = stripCopy(Cst, MRI);
2714     if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2715       Out = stripBitCast(LShlSrc, MRI);
2716       return true;
2717     }
2718   }
2719 
2720   MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2721   if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2722     return false;
2723 
2724   assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2725          LLT::fixed_vector(2, 16));
2726 
2727   ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2728   assert(Mask.size() == 2);
2729 
2730   if (Mask[0] == 1 && Mask[1] <= 1) {
2731     Out = Shuffle->getOperand(0).getReg();
2732     return true;
2733   }
2734 
2735   return false;
2736 }
2737 
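// Select a scalar f16 -> f32 G_FPEXT whose source is the high half of a 32-bit
// value as S_CVT_HI_F32_F16. This requires SALU float instructions and an SGPR
// destination bank; everything else falls through.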
2738 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2739   if (!Subtarget->hasSALUFloatInsts())
2740     return false;
2741 
2742   Register Dst = I.getOperand(0).getReg();
2743   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2744   if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2745     return false;
2746 
2747   Register Src = I.getOperand(1).getReg();
2748 
2749   if (MRI->getType(Dst) == LLT::scalar(32) &&
2750       MRI->getType(Src) == LLT::scalar(16)) {
2751     if (isExtractHiElt(*MRI, Src, Src)) {
2752       MachineBasicBlock *BB = I.getParent();
2753       BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2754           .addUse(Src);
2755       I.eraseFromParent();
2756       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2757     }
2758   }
2759 
2760   return false;
2761 }
2762 
2763 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2764   // Only manually handle the f64 SGPR case.
2765   //
2766   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2767   // the bit ops theoretically have a second result due to the implicit def of
2768   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2769   // that is easy by disabling the check. The result works, but uses a
2770   // nonsensical sreg32orlds_and_sreg_1 regclass.
2771   //
2772   // The DAG emitter is more problematic, and incorrectly adds both results of
2773   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2774 
2775   Register Dst = MI.getOperand(0).getReg();
2776   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2777   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2778       MRI->getType(Dst) != LLT::scalar(64))
2779     return false;
2780 
2781   Register Src = MI.getOperand(1).getReg();
2782   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2783   if (Fabs)
2784     Src = Fabs->getOperand(1).getReg();
2785 
2786   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2787       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2788     return false;
2789 
2790   MachineBasicBlock *BB = MI.getParent();
2791   const DebugLoc &DL = MI.getDebugLoc();
2792   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2793   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2794   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2795   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2796 
2797   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2798     .addReg(Src, 0, AMDGPU::sub0);
2799   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2800     .addReg(Src, 0, AMDGPU::sub1);
2801   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2802     .addImm(0x80000000);
2803 
2804   // Set or toggle sign bit.
2805   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2806   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2807     .addReg(HiReg)
2808     .addReg(ConstReg)
2809     .setOperandDead(3); // Dead scc
2810   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2811     .addReg(LoReg)
2812     .addImm(AMDGPU::sub0)
2813     .addReg(OpReg)
2814     .addImm(AMDGPU::sub1);
2815   MI.eraseFromParent();
2816   return true;
2817 }
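
// Sketch of the expansion above for a 64-bit SGPR fneg: the low 32 bits pass
// through unchanged and only the high half is modified, roughly
//   lo  = COPY src.sub0
//   hi' = S_XOR_B32 src.sub1, 0x80000000   ; S_OR_B32 when an fabs was folded in
//   dst = REG_SEQUENCE lo, sub0, hi', sub1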
2818 
2819 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2820 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2821   Register Dst = MI.getOperand(0).getReg();
2822   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2823   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2824       MRI->getType(Dst) != LLT::scalar(64))
2825     return false;
2826 
2827   Register Src = MI.getOperand(1).getReg();
2828   MachineBasicBlock *BB = MI.getParent();
2829   const DebugLoc &DL = MI.getDebugLoc();
2830   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2831   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2832   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2833   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2834 
2835   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2836       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2837     return false;
2838 
2839   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2840     .addReg(Src, 0, AMDGPU::sub0);
2841   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2842     .addReg(Src, 0, AMDGPU::sub1);
2843   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2844     .addImm(0x7fffffff);
2845 
2846   // Clear sign bit.
2847   // TODO: Should this use S_BITSET0_*?
2848   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2849     .addReg(HiReg)
2850     .addReg(ConstReg)
2851     .setOperandDead(3); // Dead scc
2852   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2853     .addReg(LoReg)
2854     .addImm(AMDGPU::sub0)
2855     .addReg(OpReg)
2856     .addImm(AMDGPU::sub1);
2857 
2858   MI.eraseFromParent();
2859   return true;
2860 }
2861 
2862 static bool isConstant(const MachineInstr &MI) {
2863   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2864 }
2865 
2866 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2867     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2868 
2869   unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2870   const MachineInstr *PtrMI =
2871       MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2872 
2873   assert(PtrMI);
2874 
2875   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2876     return;
2877 
2878   GEPInfo GEPInfo;
2879 
2880   for (unsigned i = 1; i != 3; ++i) {
2881     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2882     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2883     assert(OpDef);
2884     if (i == 2 && isConstant(*OpDef)) {
2885       // TODO: Could handle constant base + variable offset, but a combine
2886       // probably should have commuted it.
2887       assert(GEPInfo.Imm == 0);
2888       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2889       continue;
2890     }
2891     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2892     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2893       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2894     else
2895       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2896   }
2897 
2898   AddrInfo.push_back(GEPInfo);
2899   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2900 }
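
// For example, given a load whose pointer is
//   %p = G_PTR_ADD %base, G_CONSTANT 16
// this records one GEPInfo with Imm = 16 and %base placed in SgprParts or
// VgprParts depending on its register bank, then recurses to collect any
// further G_PTR_ADDs feeding the base.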
2901 
2902 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2903   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2904 }
2905 
2906 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2907   if (!MI.hasOneMemOperand())
2908     return false;
2909 
2910   const MachineMemOperand *MMO = *MI.memoperands_begin();
2911   const Value *Ptr = MMO->getValue();
2912 
2913   // UndefValue means this is a load of a kernel input.  These are uniform.
2914   // Sometimes LDS instructions have constant pointers.
2915   // If Ptr is null, then that means this mem operand contains a
2916   // PseudoSourceValue like GOT.
2917   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2918       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2919     return true;
2920 
2921   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2922     return true;
2923 
2924   if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2925     return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2926            AMDGPU::SGPRRegBankID;
2927 
2928   const Instruction *I = dyn_cast<Instruction>(Ptr);
2929   return I && I->getMetadata("amdgpu.uniform");
2930 }
2931 
2932 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2933   for (const GEPInfo &GEPInfo : AddrInfo) {
2934     if (!GEPInfo.VgprParts.empty())
2935       return true;
2936   }
2937   return false;
2938 }
2939 
2940 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2941   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2942   unsigned AS = PtrTy.getAddressSpace();
2943   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2944       STI.ldsRequiresM0Init()) {
2945     MachineBasicBlock *BB = I.getParent();
2946 
2947     // If DS instructions require M0 initialization, insert it before selecting.
2948     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2949       .addImm(-1);
2950   }
2951 }
2952 
2953 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2954   MachineInstr &I) const {
2955   initM0(I);
2956   return selectImpl(I, *CoverageInfo);
2957 }
2958 
2959 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2960   if (Reg.isPhysical())
2961     return false;
2962 
2963   MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2964   const unsigned Opcode = MI.getOpcode();
2965 
2966   if (Opcode == AMDGPU::COPY)
2967     return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2968 
2969   if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2970       Opcode == AMDGPU::G_XOR)
2971     return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2972            isVCmpResult(MI.getOperand(2).getReg(), MRI);
2973 
2974   if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2975     return GI->is(Intrinsic::amdgcn_class);
2976 
2977   return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2978 }
2979 
2980 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2981   MachineBasicBlock *BB = I.getParent();
2982   MachineOperand &CondOp = I.getOperand(0);
2983   Register CondReg = CondOp.getReg();
2984   const DebugLoc &DL = I.getDebugLoc();
2985 
2986   unsigned BrOpcode;
2987   Register CondPhysReg;
2988   const TargetRegisterClass *ConstrainRC;
2989 
2990   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2991   // whether the branch is uniform when selecting the instruction. In
2992   // GlobalISel, we should push that decision into RegBankSelect. For now, assume
2993   // that RegBankSelect knows what it is doing if the branch condition is SCC,
2994   // even though it currently does not.
2995   if (!isVCC(CondReg, *MRI)) {
2996     if (MRI->getType(CondReg) != LLT::scalar(32))
2997       return false;
2998 
2999     CondPhysReg = AMDGPU::SCC;
3000     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3001     ConstrainRC = &AMDGPU::SReg_32RegClass;
3002   } else {
3003     // FIXME: Should scc->vcc copies be ANDed with exec?
3004 
3005     // Unless the value of CondReg is the result of a V_CMP* instruction, we need
3006     // to insert an AND with exec.
3007     if (!isVCmpResult(CondReg, *MRI)) {
3008       const bool Is64 = STI.isWave64();
3009       const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3010       const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3011 
3012       Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3013       BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3014           .addReg(CondReg)
3015           .addReg(Exec)
3016           .setOperandDead(3); // Dead scc
3017       CondReg = TmpReg;
3018     }
3019 
3020     CondPhysReg = TRI.getVCC();
3021     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3022     ConstrainRC = TRI.getBoolRC();
3023   }
3024 
3025   if (!MRI->getRegClassOrNull(CondReg))
3026     MRI->setRegClass(CondReg, ConstrainRC);
3027 
3028   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3029     .addReg(CondReg);
3030   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3031     .addMBB(I.getOperand(1).getMBB());
3032 
3033   I.eraseFromParent();
3034   return true;
3035 }
3036 
3037 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3038   MachineInstr &I) const {
3039   Register DstReg = I.getOperand(0).getReg();
3040   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3041   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3042   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3043   if (IsVGPR)
3044     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3045 
3046   return RBI.constrainGenericRegister(
3047     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3048 }
3049 
3050 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3051   Register DstReg = I.getOperand(0).getReg();
3052   Register SrcReg = I.getOperand(1).getReg();
3053   Register MaskReg = I.getOperand(2).getReg();
3054   LLT Ty = MRI->getType(DstReg);
3055   LLT MaskTy = MRI->getType(MaskReg);
3056   MachineBasicBlock *BB = I.getParent();
3057   const DebugLoc &DL = I.getDebugLoc();
3058 
3059   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3060   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3061   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3062   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3063   if (DstRB != SrcRB) // Should only happen for hand-written MIR.
3064     return false;
3065 
3066   // Try to avoid emitting a bit operation when we only need to touch half of
3067   // the 64-bit pointer.
3068   APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
3069   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3070   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3071 
3072   const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3073   const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3074 
3075   if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3076       !CanCopyLow32 && !CanCopyHi32) {
3077     auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3078       .addReg(SrcReg)
3079       .addReg(MaskReg)
3080       .setOperandDead(3); // Dead scc
3081     I.eraseFromParent();
3082     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3083   }
3084 
3085   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3086   const TargetRegisterClass &RegRC
3087     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3088 
3089   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3090   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3091   const TargetRegisterClass *MaskRC =
3092       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3093 
3094   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3095       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3096       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3097     return false;
3098 
3099   if (Ty.getSizeInBits() == 32) {
3100     assert(MaskTy.getSizeInBits() == 32 &&
3101            "ptrmask should have been narrowed during legalize");
3102 
3103     auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3104       .addReg(SrcReg)
3105       .addReg(MaskReg);
3106 
3107     if (!IsVGPR)
3108       NewOp.setOperandDead(3); // Dead scc
3109     I.eraseFromParent();
3110     return true;
3111   }
3112 
3113   Register HiReg = MRI->createVirtualRegister(&RegRC);
3114   Register LoReg = MRI->createVirtualRegister(&RegRC);
3115 
3116   // Extract the subregisters from the source pointer.
3117   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3118     .addReg(SrcReg, 0, AMDGPU::sub0);
3119   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3120     .addReg(SrcReg, 0, AMDGPU::sub1);
3121 
3122   Register MaskedLo, MaskedHi;
3123 
3124   if (CanCopyLow32) {
3125     // If all the bits in the low half are 1, we only need a copy for it.
3126     MaskedLo = LoReg;
3127   } else {
3128     // Extract the mask subregister and apply the and.
3129     Register MaskLo = MRI->createVirtualRegister(&RegRC);
3130     MaskedLo = MRI->createVirtualRegister(&RegRC);
3131 
3132     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3133       .addReg(MaskReg, 0, AMDGPU::sub0);
3134     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3135       .addReg(LoReg)
3136       .addReg(MaskLo);
3137   }
3138 
3139   if (CanCopyHi32) {
3140     // If all the bits in the high half are 1, we only need a copy for it.
3141     MaskedHi = HiReg;
3142   } else {
3143     Register MaskHi = MRI->createVirtualRegister(&RegRC);
3144     MaskedHi = MRI->createVirtualRegister(&RegRC);
3145 
3146     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3147       .addReg(MaskReg, 0, AMDGPU::sub1);
3148     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3149       .addReg(HiReg)
3150       .addReg(MaskHi);
3151   }
3152 
3153   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3154     .addReg(MaskedLo)
3155     .addImm(AMDGPU::sub0)
3156     .addReg(MaskedHi)
3157     .addImm(AMDGPU::sub1);
3158   I.eraseFromParent();
3159   return true;
3160 }
3161 
3162 /// Return the register to use for the index value, and the subregister to use
3163 /// for the indirectly accessed register.
3164 static std::pair<Register, unsigned>
3165 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3166                         const TargetRegisterClass *SuperRC, Register IdxReg,
3167                         unsigned EltSize, GISelKnownBits &KnownBits) {
3168   Register IdxBaseReg;
3169   int Offset;
3170 
3171   std::tie(IdxBaseReg, Offset) =
3172       AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3173   if (IdxBaseReg == AMDGPU::NoRegister) {
3174     // This will happen if the index is a known constant. This should ordinarily
3175     // be legalized out, but handle it as a register just in case.
3176     assert(Offset == 0);
3177     IdxBaseReg = IdxReg;
3178   }
3179 
3180   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3181 
3182   // Skip out of bounds offsets, or else we would end up using an undefined
3183   // register.
3184   if (static_cast<unsigned>(Offset) >= SubRegs.size())
3185     return std::pair(IdxReg, SubRegs[0]);
3186   return std::pair(IdxBaseReg, SubRegs[Offset]);
3187 }
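
// For example, with a 256-bit super register class, EltSize = 4 and an index
// known to be %base + 3, this should return {%base, sub3}: the constant part
// of the index is folded into the subregister and only the variable part is
// left for M0 / the index register.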
3188 
3189 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3190   MachineInstr &MI) const {
3191   Register DstReg = MI.getOperand(0).getReg();
3192   Register SrcReg = MI.getOperand(1).getReg();
3193   Register IdxReg = MI.getOperand(2).getReg();
3194 
3195   LLT DstTy = MRI->getType(DstReg);
3196   LLT SrcTy = MRI->getType(SrcReg);
3197 
3198   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3199   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3200   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3201 
3202   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3203   // into a waterfall loop.
3204   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3205     return false;
3206 
3207   const TargetRegisterClass *SrcRC =
3208       TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3209   const TargetRegisterClass *DstRC =
3210       TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3211   if (!SrcRC || !DstRC)
3212     return false;
3213   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3214       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3215       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3216     return false;
3217 
3218   MachineBasicBlock *BB = MI.getParent();
3219   const DebugLoc &DL = MI.getDebugLoc();
3220   const bool Is64 = DstTy.getSizeInBits() == 64;
3221 
3222   unsigned SubReg;
3223   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3224       *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3225 
3226   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3227     if (DstTy.getSizeInBits() != 32 && !Is64)
3228       return false;
3229 
3230     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3231       .addReg(IdxReg);
3232 
3233     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3234     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3235       .addReg(SrcReg, 0, SubReg)
3236       .addReg(SrcReg, RegState::Implicit);
3237     MI.eraseFromParent();
3238     return true;
3239   }
3240 
3241   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3242     return false;
3243 
3244   if (!STI.useVGPRIndexMode()) {
3245     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3246       .addReg(IdxReg);
3247     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3248       .addReg(SrcReg, 0, SubReg)
3249       .addReg(SrcReg, RegState::Implicit);
3250     MI.eraseFromParent();
3251     return true;
3252   }
3253 
3254   const MCInstrDesc &GPRIDXDesc =
3255       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3256   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3257       .addReg(SrcReg)
3258       .addReg(IdxReg)
3259       .addImm(SubReg);
3260 
3261   MI.eraseFromParent();
3262   return true;
3263 }
3264 
3265 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3266 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3267   MachineInstr &MI) const {
3268   Register DstReg = MI.getOperand(0).getReg();
3269   Register VecReg = MI.getOperand(1).getReg();
3270   Register ValReg = MI.getOperand(2).getReg();
3271   Register IdxReg = MI.getOperand(3).getReg();
3272 
3273   LLT VecTy = MRI->getType(DstReg);
3274   LLT ValTy = MRI->getType(ValReg);
3275   unsigned VecSize = VecTy.getSizeInBits();
3276   unsigned ValSize = ValTy.getSizeInBits();
3277 
3278   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3279   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3280   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3281 
3282   assert(VecTy.getElementType() == ValTy);
3283 
3284   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3285   // into a waterfall loop.
3286   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3287     return false;
3288 
3289   const TargetRegisterClass *VecRC =
3290       TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3291   const TargetRegisterClass *ValRC =
3292       TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3293 
3294   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3295       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3296       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3297       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3298     return false;
3299 
3300   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3301     return false;
3302 
3303   unsigned SubReg;
3304   std::tie(IdxReg, SubReg) =
3305       computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3306 
3307   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3308                          STI.useVGPRIndexMode();
3309 
3310   MachineBasicBlock *BB = MI.getParent();
3311   const DebugLoc &DL = MI.getDebugLoc();
3312 
3313   if (!IndexMode) {
3314     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3315       .addReg(IdxReg);
3316 
3317     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3318         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3319     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3320         .addReg(VecReg)
3321         .addReg(ValReg)
3322         .addImm(SubReg);
3323     MI.eraseFromParent();
3324     return true;
3325   }
3326 
3327   const MCInstrDesc &GPRIDXDesc =
3328       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3329   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3330       .addReg(VecReg)
3331       .addReg(ValReg)
3332       .addReg(IdxReg)
3333       .addImm(SubReg);
3334 
3335   MI.eraseFromParent();
3336   return true;
3337 }
3338 
3339 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3340   assert(!AMDGPU::isGFX12Plus(STI));
3341   unsigned Opc;
3342   unsigned Size = MI.getOperand(3).getImm();
3343 
3344   // The struct intrinsic variants add one additional operand over raw.
3345   const bool HasVIndex = MI.getNumOperands() == 9;
3346   Register VIndex;
3347   int OpOffset = 0;
3348   if (HasVIndex) {
3349     VIndex = MI.getOperand(4).getReg();
3350     OpOffset = 1;
3351   }
3352 
3353   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3354   std::optional<ValueAndVReg> MaybeVOffset =
3355       getIConstantVRegValWithLookThrough(VOffset, *MRI);
3356   const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3357 
3358   switch (Size) {
3359   default:
3360     return false;
3361   case 1:
3362     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3363                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3364                     : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3365                                  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3366     break;
3367   case 2:
3368     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3369                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3370                     : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3371                                  : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3372     break;
3373   case 4:
3374     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3375                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3376                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3377                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3378     break;
3379   case 12:
3380     if (!Subtarget->hasLDSLoadB96_B128())
3381       return false;
3382 
3383     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3384                                  : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3385                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3386                                  : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3387     break;
3388   case 16:
3389     if (!Subtarget->hasLDSLoadB96_B128())
3390       return false;
3391 
3392     Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3393                                  : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3394                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3395                                  : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3396     break;
3397   }
3398 
3399   MachineBasicBlock *MBB = MI.getParent();
3400   const DebugLoc &DL = MI.getDebugLoc();
3401   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3402     .add(MI.getOperand(2));
3403 
3404   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3405 
3406   if (HasVIndex && HasVOffset) {
3407     Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3408     BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3409       .addReg(VIndex)
3410       .addImm(AMDGPU::sub0)
3411       .addReg(VOffset)
3412       .addImm(AMDGPU::sub1);
3413 
3414     MIB.addReg(IdxReg);
3415   } else if (HasVIndex) {
3416     MIB.addReg(VIndex);
3417   } else if (HasVOffset) {
3418     MIB.addReg(VOffset);
3419   }
3420 
3421   MIB.add(MI.getOperand(1));            // rsrc
3422   MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3423   MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3424   bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3425   unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3426   MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3427                                 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3428   MIB.addImm(
3429       Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3430           ? 1
3431           : 0); // swz
3432 
3433   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3434   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3435   LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3436   MachinePointerInfo StorePtrI = LoadPtrI;
3437   StorePtrI.V = nullptr;
3438   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3439 
3440   auto F = LoadMMO->getFlags() &
3441            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3442   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3443                                      Size, LoadMMO->getBaseAlign());
3444 
3445   MachineMemOperand *StoreMMO =
3446       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3447                                sizeof(int32_t), LoadMMO->getBaseAlign());
3448 
3449   MIB.setMemRefs({LoadMMO, StoreMMO});
3450 
3451   MI.eraseFromParent();
3452   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3453 }
3454 
3455 /// Match a zero extend from a 32-bit value to 64-bits.
3456 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3457   Register ZExtSrc;
3458   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3459     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3460 
3461   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3462   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3463   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3464     return Register();
3465 
3466   assert(Def->getNumOperands() == 3 &&
3467          MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3468   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3469     return Def->getOperand(1).getReg();
3470   }
3471 
3472   return Register();
3473 }
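
// For example, both of these shapes should yield %x (MIR-like shorthand):
//   %z:_(s64) = G_ZEXT %x:_(s32)
//   %z:_(s64) = G_MERGE_VALUES %x:_(s32), 0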
3474 
3475 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3476   unsigned Opc;
3477   unsigned Size = MI.getOperand(3).getImm();
3478 
3479   switch (Size) {
3480   default:
3481     return false;
3482   case 1:
3483     Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3484     break;
3485   case 2:
3486     Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3487     break;
3488   case 4:
3489     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3490     break;
3491   case 12:
3492     if (!Subtarget->hasLDSLoadB96_B128())
3493       return false;
3494     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3495     break;
3496   case 16:
3497     if (!Subtarget->hasLDSLoadB96_B128())
3498       return false;
3499     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3500     break;
3501   }
3502 
3503   MachineBasicBlock *MBB = MI.getParent();
3504   const DebugLoc &DL = MI.getDebugLoc();
3505   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3506     .add(MI.getOperand(2));
3507 
3508   Register Addr = MI.getOperand(1).getReg();
3509   Register VOffset;
3510   // Try to split SAddr and VOffset. Global and LDS pointers share the same
3511   // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3512   if (!isSGPR(Addr)) {
3513     auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3514     if (isSGPR(AddrDef->Reg)) {
3515       Addr = AddrDef->Reg;
3516     } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3517       Register SAddr =
3518           getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3519       if (isSGPR(SAddr)) {
3520         Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3521         if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3522           Addr = SAddr;
3523           VOffset = Off;
3524         }
3525       }
3526     }
3527   }
3528 
3529   if (isSGPR(Addr)) {
3530     Opc = AMDGPU::getGlobalSaddrOp(Opc);
3531     if (!VOffset) {
3532       VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3533       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3534         .addImm(0);
3535     }
3536   }
3537 
3538   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3539     .addReg(Addr);
3540 
3541   if (isSGPR(Addr))
3542     MIB.addReg(VOffset);
3543 
3544   MIB.add(MI.getOperand(4))  // offset
3545      .add(MI.getOperand(5)); // cpol
3546 
3547   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3548   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3549   LoadPtrI.Offset = MI.getOperand(4).getImm();
3550   MachinePointerInfo StorePtrI = LoadPtrI;
3551   LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3552   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3553   auto F = LoadMMO->getFlags() &
3554            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3555   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3556                                      Size, LoadMMO->getBaseAlign());
3557   MachineMemOperand *StoreMMO =
3558       MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3559                                sizeof(int32_t), Align(4));
3560 
3561   MIB.setMemRefs({LoadMMO, StoreMMO});
3562 
3563   MI.eraseFromParent();
3564   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3565 }
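
// Sketch of the SAddr/VOffset split performed above: for an address of the
// form
//   %addr = G_PTR_ADD %sgpr_base, (G_ZEXT %voff:_(s32))
// the SADDR form of the opcode is used with SAddr = %sgpr_base and
// VOffset = %voff; if the whole address is an SGPR, a zero VGPR offset is
// materialized instead.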
3566 
3567 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3568   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3569   MI.removeOperand(1);
3570   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3571   return true;
3572 }
3573 
3574 // FIXME: This should be removed so the patterns can do the selection. We just
3575 // need the AGPR/VGPR combination versions.
3576 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3577   unsigned Opc;
3578   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3579   case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3580     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3581     break;
3582   case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3583     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3584     break;
3585   case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3586     Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3587     break;
3588   case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3589     Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3590     break;
3591   case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3592     Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3593     break;
3594   case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3595     Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3596     break;
3597   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3598     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3599     break;
3600   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3601     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3602     break;
3603   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3604     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3605     break;
3606   case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3607     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3608     break;
3609   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3610     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3611     break;
3612   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3613     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3614     break;
3615   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3616     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3617     break;
3618   case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3619     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3620     break;
3621   case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3622     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3623     break;
3624   case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3625     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3626     break;
3627   case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3628     Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3629     break;
3630   case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3631     Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3632     break;
3633   case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3634     Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3635     break;
3636   case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3637     Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3638     break;
3639   case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3640     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3641     break;
3642   case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3643     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3644     break;
3645   case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3646     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3647     break;
3648   case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3649     Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3650     break;
3651   case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3652     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3653     break;
3654   case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3655     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3656     break;
3657   case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3658     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3659     break;
3660   case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3661     Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3662     break;
3663   default:
3664     llvm_unreachable("unhandled smfmac intrinsic");
3665   }
3666 
3667   auto VDst_In = MI.getOperand(4);
3668 
3669   MI.setDesc(TII.get(Opc));
3670   MI.removeOperand(4); // VDst_In
3671   MI.removeOperand(1); // Intrinsic ID
3672   MI.addOperand(VDst_In); // Re-add VDst_In to the end
3673   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3674   return true;
3675 }
3676 
3677 bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3678     MachineInstr &MI, Intrinsic::ID IntrID) const {
3679   if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3680       !Subtarget->hasPermlane16Swap())
3681     return false;
3682   if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3683       !Subtarget->hasPermlane32Swap())
3684     return false;
3685 
3686   unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3687                         ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3688                         : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3689 
3690   MI.removeOperand(2);
3691   MI.setDesc(TII.get(Opcode));
3692   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3693 
3694   MachineOperand &FI = MI.getOperand(4);
3695   FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3696 
3697   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3698 }
3699 
3700 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3701   Register DstReg = MI.getOperand(0).getReg();
3702   Register SrcReg = MI.getOperand(1).getReg();
3703   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3704   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3705   MachineBasicBlock *MBB = MI.getParent();
3706   const DebugLoc &DL = MI.getDebugLoc();
3707 
3708   if (IsVALU) {
3709     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3710       .addImm(Subtarget->getWavefrontSizeLog2())
3711       .addReg(SrcReg);
3712   } else {
3713     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3714       .addReg(SrcReg)
3715       .addImm(Subtarget->getWavefrontSizeLog2())
3716       .setOperandDead(3); // Dead scc
3717   }
3718 
3719   const TargetRegisterClass &RC =
3720       IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3721   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3722     return false;
3723 
3724   MI.eraseFromParent();
3725   return true;
3726 }
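
// For example, on a wave64 subtarget getWavefrontSizeLog2() is 6, so the
// destination is the source shifted right by 6 (a divide by the wave size),
// done with V_LSHRREV_B32 on the VALU path and S_LSHR_B32 on the SALU path.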
3727 
3728 // Match a BITOP3 operation and return the number of matched instructions plus
3729 // the truth table.
3730 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3731                                               SmallVectorImpl<Register> &Src,
3732                                               const MachineRegisterInfo &MRI) {
3733   unsigned NumOpcodes = 0;
3734   uint8_t LHSBits, RHSBits;
3735 
3736   auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3737     // Define truth table given Src0, Src1, Src2 bits permutations:
3738     //                          0     0     0
3739     //                          0     0     1
3740     //                          0     1     0
3741     //                          0     1     1
3742     //                          1     0     0
3743     //                          1     0     1
3744     //                          1     1     0
3745     //                          1     1     1
3746     const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3747 
3748     if (mi_match(Op, MRI, m_AllOnesInt())) {
3749       Bits = 0xff;
3750       return true;
3751     }
3752     if (mi_match(Op, MRI, m_ZeroInt())) {
3753       Bits = 0;
3754       return true;
3755     }
3756 
3757     for (unsigned I = 0; I < Src.size(); ++I) {
3758       // Try to find existing reused operand
3759       if (Src[I] == Op) {
3760         Bits = SrcBits[I];
3761         return true;
3762       }
3763       // Try to replace parent operator
3764       if (Src[I] == R) {
3765         Bits = SrcBits[I];
3766         Src[I] = Op;
3767         return true;
3768       }
3769     }
3770 
3771     if (Src.size() == 3) {
3772       // No room left for operands. Try one last time; there can be a 'not' of
3773       // one of our source operands, in which case we can compute the bits
3774       // without growing the Src vector.
3775       Register LHS;
3776       if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3777         LHS = getSrcRegIgnoringCopies(LHS, MRI);
3778         for (unsigned I = 0; I < Src.size(); ++I) {
3779           if (Src[I] == LHS) {
3780             Bits = ~SrcBits[I];
3781             return true;
3782           }
3783         }
3784       }
3785 
3786       return false;
3787     }
3788 
3789     Bits = SrcBits[Src.size()];
3790     Src.push_back(Op);
3791     return true;
3792   };
3793 
3794   MachineInstr *MI = MRI.getVRegDef(R);
3795   switch (MI->getOpcode()) {
3796   case TargetOpcode::G_AND:
3797   case TargetOpcode::G_OR:
3798   case TargetOpcode::G_XOR: {
3799     Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3800     Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3801 
3802     SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3803     if (!getOperandBits(LHS, LHSBits) ||
3804         !getOperandBits(RHS, RHSBits)) {
3805       Src = Backup;
3806       return std::make_pair(0, 0);
3807     }
3808 
3809     // Recursion is naturally limited by the size of the operand vector.
3810     auto Op = BitOp3_Op(LHS, Src, MRI);
3811     if (Op.first) {
3812       NumOpcodes += Op.first;
3813       LHSBits = Op.second;
3814     }
3815 
3816     Op = BitOp3_Op(RHS, Src, MRI);
3817     if (Op.first) {
3818       NumOpcodes += Op.first;
3819       RHSBits = Op.second;
3820     }
3821     break;
3822   }
3823   default:
3824     return std::make_pair(0, 0);
3825   }
3826 
3827   uint8_t TTbl;
3828   switch (MI->getOpcode()) {
3829   case TargetOpcode::G_AND:
3830     TTbl = LHSBits & RHSBits;
3831     break;
3832   case TargetOpcode::G_OR:
3833     TTbl = LHSBits | RHSBits;
3834     break;
3835   case TargetOpcode::G_XOR:
3836     TTbl = LHSBits ^ RHSBits;
3837     break;
3838   default:
3839     break;
3840   }
3841 
3842   return std::make_pair(NumOpcodes + 1, TTbl);
3843 }
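
// Illustrative note on the truth-table encoding: each matched source slot
// contributes a fixed 8-bit mask (Src[0] -> 0xf0, Src[1] -> 0xcc,
// Src[2] -> 0xaa), and the matched bitwise ops are applied to the masks
// themselves. For example, an expression that ANDs all three sources produces
// TTbl = 0xf0 & 0xcc & 0xaa = 0x80, i.e. the output bit is 1 only in the row
// where all three inputs are 1.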
3844 
3845 bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3846   if (!Subtarget->hasBitOp3Insts())
3847     return false;
3848 
3849   Register DstReg = MI.getOperand(0).getReg();
3850   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3851   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3852   if (!IsVALU)
3853     return false;
3854 
3855   SmallVector<Register, 3> Src;
3856   uint8_t TTbl;
3857   unsigned NumOpcodes;
3858 
3859   std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3860 
3861   // The Src.empty() case can happen if the operands are all zeros or all ones.
3862   // Normally this would have been optimized out before reaching this point.
3863   if (NumOpcodes < 2 || Src.empty())
3864     return false;
3865 
3866   const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3867   if (NumOpcodes == 2 && IsB32) {
3868     // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster, but it
3869     // makes the asm more readable. This cannot be modeled with AddedComplexity
3870     // because the selector does not know how many operations we matched.
3871     if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3872         mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3873         mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3874       return false;
3875   } else if (NumOpcodes < 4) {
3876     // For the uniform case the threshold should be higher to account for moves
3877     // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3878     // be in SGPRs, with a readfirstlane afterwards.
3879     return false;
3880   }
3881 
3882   unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3883   unsigned CBL = STI.getConstantBusLimit(Opc);
3884   MachineBasicBlock *MBB = MI.getParent();
3885   const DebugLoc &DL = MI.getDebugLoc();
3886 
3887   for (unsigned I = 0; I < Src.size(); ++I) {
3888     const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3889     if (RB->getID() != AMDGPU::SGPRRegBankID)
3890       continue;
3891     if (CBL > 0) {
3892       --CBL;
3893       continue;
3894     }
3895     Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3896     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3897         .addReg(Src[I]);
3898     Src[I] = NewReg;
3899   }
3900 
3901   // The last operand can be ignored, turning a ternary operation into a binary
3902   // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3903   // 'c' with 'a' here without changing the answer. In some pathological cases it
3904   // should even be possible to end up with an operation that has only a single
3905   // operand, if the optimizer does not catch it.
3906   while (Src.size() < 3)
3907     Src.push_back(Src[0]);
3908 
3909   auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3910   if (!IsB32)
3911     MIB.addImm(0); // src_mod0
3912   MIB.addReg(Src[0]);
3913   if (!IsB32)
3914     MIB.addImm(0); // src_mod1
3915   MIB.addReg(Src[1]);
3916   if (!IsB32)
3917     MIB.addImm(0); // src_mod2
3918   MIB.addReg(Src[2])
3919      .addImm(TTbl);
3920   if (!IsB32)
3921     MIB.addImm(0); // op_sel
3922 
3923   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3924   MI.eraseFromParent();
3925 
3926   return true;
3927 }
3928 
3929 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3930   Register SrcReg = MI.getOperand(0).getReg();
3931   if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3932     return false;
3933 
3934   MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3935   Register SP =
3936       Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3937   Register WaveAddr = getWaveAddress(DefMI);
3938   MachineBasicBlock *MBB = MI.getParent();
3939   const DebugLoc &DL = MI.getDebugLoc();
3940 
3941   if (!WaveAddr) {
3942     WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3943     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3944       .addReg(SrcReg)
3945       .addImm(Subtarget->getWavefrontSizeLog2())
3946       .setOperandDead(3); // Dead scc
3947   }
3948 
3949   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3950     .addReg(WaveAddr);
3951 
3952   MI.eraseFromParent();
3953   return true;
3954 }
3955 
3956 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3957 
3958   if (!I.isPreISelOpcode()) {
3959     if (I.isCopy())
3960       return selectCOPY(I);
3961     return true;
3962   }
3963 
3964   switch (I.getOpcode()) {
3965   case TargetOpcode::G_AND:
3966   case TargetOpcode::G_OR:
3967   case TargetOpcode::G_XOR:
3968     if (selectBITOP3(I))
3969       return true;
3970     if (selectImpl(I, *CoverageInfo))
3971       return true;
3972     return selectG_AND_OR_XOR(I);
3973   case TargetOpcode::G_ADD:
3974   case TargetOpcode::G_SUB:
3975   case TargetOpcode::G_PTR_ADD:
3976     if (selectImpl(I, *CoverageInfo))
3977       return true;
3978     return selectG_ADD_SUB(I);
3979   case TargetOpcode::G_UADDO:
3980   case TargetOpcode::G_USUBO:
3981   case TargetOpcode::G_UADDE:
3982   case TargetOpcode::G_USUBE:
3983     return selectG_UADDO_USUBO_UADDE_USUBE(I);
3984   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3985   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3986     return selectG_AMDGPU_MAD_64_32(I);
3987   case TargetOpcode::G_INTTOPTR:
3988   case TargetOpcode::G_BITCAST:
3989   case TargetOpcode::G_PTRTOINT:
3990   case TargetOpcode::G_FREEZE:
3991     return selectCOPY(I);
3992   case TargetOpcode::G_FNEG:
3993     if (selectImpl(I, *CoverageInfo))
3994       return true;
3995     return selectG_FNEG(I);
3996   case TargetOpcode::G_FABS:
3997     if (selectImpl(I, *CoverageInfo))
3998       return true;
3999     return selectG_FABS(I);
4000   case TargetOpcode::G_EXTRACT:
4001     return selectG_EXTRACT(I);
4002   case TargetOpcode::G_MERGE_VALUES:
4003   case TargetOpcode::G_CONCAT_VECTORS:
4004     return selectG_MERGE_VALUES(I);
4005   case TargetOpcode::G_UNMERGE_VALUES:
4006     return selectG_UNMERGE_VALUES(I);
4007   case TargetOpcode::G_BUILD_VECTOR:
4008   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4009     return selectG_BUILD_VECTOR(I);
4010   case TargetOpcode::G_IMPLICIT_DEF:
4011     return selectG_IMPLICIT_DEF(I);
4012   case TargetOpcode::G_INSERT:
4013     return selectG_INSERT(I);
4014   case TargetOpcode::G_INTRINSIC:
4015   case TargetOpcode::G_INTRINSIC_CONVERGENT:
4016     return selectG_INTRINSIC(I);
4017   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4018   case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4019     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4020   case TargetOpcode::G_ICMP:
4021   case TargetOpcode::G_FCMP:
4022     if (selectG_ICMP_or_FCMP(I))
4023       return true;
4024     return selectImpl(I, *CoverageInfo);
4025   case TargetOpcode::G_LOAD:
4026   case TargetOpcode::G_ZEXTLOAD:
4027   case TargetOpcode::G_SEXTLOAD:
4028   case TargetOpcode::G_STORE:
4029   case TargetOpcode::G_ATOMIC_CMPXCHG:
4030   case TargetOpcode::G_ATOMICRMW_XCHG:
4031   case TargetOpcode::G_ATOMICRMW_ADD:
4032   case TargetOpcode::G_ATOMICRMW_SUB:
4033   case TargetOpcode::G_ATOMICRMW_AND:
4034   case TargetOpcode::G_ATOMICRMW_OR:
4035   case TargetOpcode::G_ATOMICRMW_XOR:
4036   case TargetOpcode::G_ATOMICRMW_MIN:
4037   case TargetOpcode::G_ATOMICRMW_MAX:
4038   case TargetOpcode::G_ATOMICRMW_UMIN:
4039   case TargetOpcode::G_ATOMICRMW_UMAX:
4040   case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4041   case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4042   case TargetOpcode::G_ATOMICRMW_FADD:
4043   case TargetOpcode::G_ATOMICRMW_FMIN:
4044   case TargetOpcode::G_ATOMICRMW_FMAX:
4045     return selectG_LOAD_STORE_ATOMICRMW(I);
4046   case TargetOpcode::G_SELECT:
4047     return selectG_SELECT(I);
4048   case TargetOpcode::G_TRUNC:
4049     return selectG_TRUNC(I);
4050   case TargetOpcode::G_SEXT:
4051   case TargetOpcode::G_ZEXT:
4052   case TargetOpcode::G_ANYEXT:
4053   case TargetOpcode::G_SEXT_INREG:
4054     // This is a workaround. For extension from type i1, `selectImpl()` uses
4055     // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4056     // type i1 can only be held in an SGPR class.
4057     if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4058         selectImpl(I, *CoverageInfo))
4059       return true;
4060     return selectG_SZA_EXT(I);
4061   case TargetOpcode::G_FPEXT:
4062     if (selectG_FPEXT(I))
4063       return true;
4064     return selectImpl(I, *CoverageInfo);
4065   case TargetOpcode::G_BRCOND:
4066     return selectG_BRCOND(I);
4067   case TargetOpcode::G_GLOBAL_VALUE:
4068     return selectG_GLOBAL_VALUE(I);
4069   case TargetOpcode::G_PTRMASK:
4070     return selectG_PTRMASK(I);
4071   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4072     return selectG_EXTRACT_VECTOR_ELT(I);
4073   case TargetOpcode::G_INSERT_VECTOR_ELT:
4074     return selectG_INSERT_VECTOR_ELT(I);
4075   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4076   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4077   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4078   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4079   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4080     const AMDGPU::ImageDimIntrinsicInfo *Intr =
4081         AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4082     assert(Intr && "not an image intrinsic with image pseudo");
4083     return selectImageIntrinsic(I, Intr);
4084   }
4085   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4086     return selectBVHIntrinsic(I);
4087   case AMDGPU::G_SBFX:
4088   case AMDGPU::G_UBFX:
4089     return selectG_SBFX_UBFX(I);
4090   case AMDGPU::G_SI_CALL:
4091     I.setDesc(TII.get(AMDGPU::SI_CALL));
4092     return true;
4093   case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4094     return selectWaveAddress(I);
4095   case AMDGPU::G_STACKRESTORE:
4096     return selectStackRestore(I);
4097   case AMDGPU::G_PHI:
4098     return selectPHI(I);
4099   case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4100     return selectCOPY_SCC_VCC(I);
4101   case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4102     return selectCOPY_VCC_SCC(I);
4103   case AMDGPU::G_AMDGPU_READANYLANE:
4104     return selectReadAnyLane(I);
4105   case TargetOpcode::G_CONSTANT:
4106   case TargetOpcode::G_FCONSTANT:
4107   default:
4108     return selectImpl(I, *CoverageInfo);
4109   }
4110   return false;
4111 }
4112 
4113 InstructionSelector::ComplexRendererFns
4114 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4115   return {{
4116       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4117   }};
4118 
4119 }
4120 
4121 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4122     Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4123   unsigned Mods = 0;
4124   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4125 
4126   if (MI->getOpcode() == AMDGPU::G_FNEG) {
4127     Src = MI->getOperand(1).getReg();
4128     Mods |= SISrcMods::NEG;
4129     MI = getDefIgnoringCopies(Src, *MRI);
4130   } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4131     // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4132     // denormal mode, but we're implicitly canonicalizing in a source operand.
4133     const ConstantFP *LHS =
4134         getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4135     if (LHS && LHS->isZero()) {
4136       Mods |= SISrcMods::NEG;
4137       Src = MI->getOperand(2).getReg();
4138     }
4139   }
4140 
4141   if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4142     Src = MI->getOperand(1).getReg();
4143     Mods |= SISrcMods::ABS;
4144   }
4145 
4146   if (OpSel)
4147     Mods |= SISrcMods::OP_SEL_0;
4148 
4149   return std::pair(Src, Mods);
4150 }
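
// For example, for a source defined as %s = G_FNEG (G_FABS %x) this returns
// {%x, SISrcMods::NEG | SISrcMods::ABS} (when AllowAbs is true), so the
// negate/abs can be expressed as source modifiers on the using instruction
// instead of being selected as separate instructions.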
4151 
4152 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4153     Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4154     bool ForceVGPR) const {
4155   if ((Mods != 0 || ForceVGPR) &&
4156       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4157 
4158     // If we looked through copies to find source modifiers on an SGPR operand,
4159     // we now have an SGPR register source. To avoid potentially violating the
4160     // constant bus restriction, we need to insert a copy to a VGPR.
4161     Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4162     BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4163             TII.get(AMDGPU::COPY), VGPRSrc)
4164         .addReg(Src);
4165     Src = VGPRSrc;
4166   }
4167 
4168   return Src;
4169 }
4170 
4171 ///
4172 /// This will select either an SGPR or VGPR operand and will save us from
4173 /// having to write an extra tablegen pattern.
4174 InstructionSelector::ComplexRendererFns
4175 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4176   return {{
4177       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4178   }};
4179 }
4180 
4181 InstructionSelector::ComplexRendererFns
4182 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4183   Register Src;
4184   unsigned Mods;
4185   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4186 
4187   return {{
4188       [=](MachineInstrBuilder &MIB) {
4189         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4190       },
4191       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4192       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
4193       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
4194   }};
4195 }
4196 
4197 InstructionSelector::ComplexRendererFns
4198 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4199   Register Src;
4200   unsigned Mods;
4201   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4202                                            /*IsCanonicalizing=*/true,
4203                                            /*AllowAbs=*/false);
4204 
4205   return {{
4206       [=](MachineInstrBuilder &MIB) {
4207         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4208       },
4209       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4210       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
4211       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
4212   }};
4213 }
4214 
4215 InstructionSelector::ComplexRendererFns
4216 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4217   return {{
4218       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4219       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4220       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
4221   }};
4222 }
4223 
4224 InstructionSelector::ComplexRendererFns
4225 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4226   Register Src;
4227   unsigned Mods;
4228   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4229 
4230   return {{
4231       [=](MachineInstrBuilder &MIB) {
4232         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4233       },
4234       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4235   }};
4236 }
4237 
4238 InstructionSelector::ComplexRendererFns
4239 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4240     MachineOperand &Root) const {
4241   Register Src;
4242   unsigned Mods;
4243   std::tie(Src, Mods) =
4244       selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4245 
4246   return {{
4247       [=](MachineInstrBuilder &MIB) {
4248         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4249       },
4250       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4251   }};
4252 }
4253 
4254 InstructionSelector::ComplexRendererFns
4255 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4256   Register Src;
4257   unsigned Mods;
4258   std::tie(Src, Mods) =
4259       selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4260                          /*AllowAbs=*/false);
4261 
4262   return {{
4263       [=](MachineInstrBuilder &MIB) {
4264         MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4265       },
4266       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4267   }};
4268 }
4269 
4270 InstructionSelector::ComplexRendererFns
4271 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4272   Register Reg = Root.getReg();
4273   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4274   if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4275     return {};
4276   return {{
4277       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4278   }};
4279 }
4280 
4281 std::pair<Register, unsigned>
4282 AMDGPUInstructionSelector::selectVOP3PModsImpl(
4283   Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4284   unsigned Mods = 0;
4285   MachineInstr *MI = MRI.getVRegDef(Src);
4286 
4287   if (MI->getOpcode() == AMDGPU::G_FNEG &&
4288       // It's possible to see an f32 fneg here, but unlikely.
4289       // TODO: Treat f32 fneg as only high bit.
4290       MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
4291     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4292     Src = MI->getOperand(1).getReg();
4293     MI = MRI.getVRegDef(Src);
4294   }
4295 
4296   // TODO: Handle G_FSUB 0 as fneg
4297 
4298   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4299   (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4300 
4301   // Packed instructions do not have abs modifiers.
4302   Mods |= SISrcMods::OP_SEL_1;
4303 
4304   return std::pair(Src, Mods);
4305 }
4306 
4307 InstructionSelector::ComplexRendererFns
4308 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4309   MachineRegisterInfo &MRI
4310     = Root.getParent()->getParent()->getParent()->getRegInfo();
4311 
4312   Register Src;
4313   unsigned Mods;
4314   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4315 
4316   return {{
4317       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4318       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
4319   }};
4320 }
4321 
4322 InstructionSelector::ComplexRendererFns
4323 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4324   MachineRegisterInfo &MRI
4325     = Root.getParent()->getParent()->getParent()->getRegInfo();
4326 
4327   Register Src;
4328   unsigned Mods;
4329   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
4330 
4331   return {{
4332       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4333       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
4334   }};
4335 }
4336 
4337 InstructionSelector::ComplexRendererFns
4338 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4339   // A literal i1 value set in the intrinsic represents SrcMods for the next operand.
4340   // The value is in the Imm operand as an i1 sign-extended to int64_t.
4341   // 1 (i.e. -1 when sign-extended) promotes packed values to signed, 0 treats them as unsigned.
4342   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4343          "expected i1 value");
4344   unsigned Mods = SISrcMods::OP_SEL_1;
4345   if (Root.getImm() == -1)
4346     Mods ^= SISrcMods::NEG;
4347   return {{
4348       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4349   }};
4350 }
4351 
4352 InstructionSelector::ComplexRendererFns
4353 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4354     MachineOperand &Root) const {
4355   assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4356          "expected i1 value");
4357   unsigned Mods = SISrcMods::OP_SEL_1;
4358   if (Root.getImm() != 0)
4359     Mods |= SISrcMods::OP_SEL_0;
4360 
4361   return {{
4362       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4363   }};
4364 }
4365 
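     // Pack the 2, 4, or 8 32-bit values in Elts into a single VReg_64/128/256
     // tuple with a REG_SEQUENCE, assigning element i to sub-register channel i.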
4366 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4367                                  MachineInstr *InsertPt,
4368                                  MachineRegisterInfo &MRI) {
4369   const TargetRegisterClass *DstRegClass;
4370   switch (Elts.size()) {
4371   case 8:
4372     DstRegClass = &AMDGPU::VReg_256RegClass;
4373     break;
4374   case 4:
4375     DstRegClass = &AMDGPU::VReg_128RegClass;
4376     break;
4377   case 2:
4378     DstRegClass = &AMDGPU::VReg_64RegClass;
4379     break;
4380   default:
4381     llvm_unreachable("unhandled Reg sequence size");
4382   }
4383 
4384   MachineIRBuilder B(*InsertPt);
4385   auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4386                  .addDef(MRI.createVirtualRegister(DstRegClass));
4387   for (unsigned i = 0; i < Elts.size(); ++i) {
4388     MIB.addReg(Elts[i]);
4389     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4390   }
4391   return MIB->getOperand(0).getReg();
4392 }
4393 
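     // Elts holds the source operands of per-element ModOpcode (G_FNEG or G_FABS)
     // instructions. Fold the modifier into Mods (NEG for fneg, NEG_HI for abs;
     // if every fneg source is itself an fabs, fold that as well) and rebuild Src
     // as a REG_SEQUENCE of the stripped sources.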
4394 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4395                                  SmallVectorImpl<Register> &Elts, Register &Src,
4396                                  MachineInstr *InsertPt,
4397                                  MachineRegisterInfo &MRI) {
4398   if (ModOpcode == TargetOpcode::G_FNEG) {
4399     Mods |= SISrcMods::NEG;
4400     // Check if all elements also have an abs modifier.
4401     SmallVector<Register, 8> NegAbsElts;
4402     for (auto El : Elts) {
4403       Register FabsSrc;
4404       if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4405         break;
4406       NegAbsElts.push_back(FabsSrc);
4407     }
4408     if (Elts.size() != NegAbsElts.size()) {
4409       // Neg
4410       Src = buildRegSequence(Elts, InsertPt, MRI);
4411     } else {
4412       // Neg and Abs
4413       Mods |= SISrcMods::NEG_HI;
4414       Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4415     }
4416   } else {
4417     assert(ModOpcode == TargetOpcode::G_FABS);
4418     // Abs
4419     Mods |= SISrcMods::NEG_HI;
4420     Src = buildRegSequence(Elts, InsertPt, MRI);
4421   }
4422 }
4423 
4424 InstructionSelector::ComplexRendererFns
4425 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4426   Register Src = Root.getReg();
4427   unsigned Mods = SISrcMods::OP_SEL_1;
4428   SmallVector<Register, 8> EltsF32;
4429 
4430   if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4431     assert(BV->getNumSources() > 0);
4432     // Based on the first element, decide which mod we match: neg or abs.
4433     MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4434     unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4435                              ? AMDGPU::G_FNEG
4436                              : AMDGPU::G_FABS;
4437     for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4438       ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4439       if (ElF32->getOpcode() != ModOpcode)
4440         break;
4441       EltsF32.push_back(ElF32->getOperand(1).getReg());
4442     }
4443 
4444     // All elements had ModOpcode modifier
4445     if (BV->getNumSources() == EltsF32.size()) {
4446       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4447                            *MRI);
4448     }
4449   }
4450 
4451   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4452            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4453 }
4454 
4455 InstructionSelector::ComplexRendererFns
4456 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4457   Register Src = Root.getReg();
4458   unsigned Mods = SISrcMods::OP_SEL_1;
4459   SmallVector<Register, 8> EltsV2F16;
4460 
4461   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4462     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4463       Register FNegSrc;
4464       if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4465         break;
4466       EltsV2F16.push_back(FNegSrc);
4467     }
4468 
4469     // All elements had the fneg modifier.
4470     if (CV->getNumSources() == EltsV2F16.size()) {
4471       Mods |= SISrcMods::NEG;
4472       Mods |= SISrcMods::NEG_HI;
4473       Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4474     }
4475   }
4476 
4477   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4478            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4479 }
4480 
4481 InstructionSelector::ComplexRendererFns
4482 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4483   Register Src = Root.getReg();
4484   unsigned Mods = SISrcMods::OP_SEL_1;
4485   SmallVector<Register, 8> EltsV2F16;
4486 
4487   if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4488     assert(CV->getNumSources() > 0);
4489     MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4490     // Based on the first element, decide which mod we match: neg or abs.
4491     unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4492                              ? AMDGPU::G_FNEG
4493                              : AMDGPU::G_FABS;
4494 
4495     for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4496       ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4497       if (ElV2F16->getOpcode() != ModOpcode)
4498         break;
4499       EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4500     }
4501 
4502     // All elements had ModOpcode modifier
4503     if (CV->getNumSources() == EltsV2F16.size()) {
4504       MachineIRBuilder B(*Root.getParent());
4505       selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4506                            *MRI);
4507     }
4508   }
4509 
4510   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4511            [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4512 }
4513 
4514 InstructionSelector::ComplexRendererFns
4515 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4516   std::optional<FPValueAndVReg> FPValReg;
4517   if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4518     if (TII.isInlineConstant(FPValReg->Value)) {
4519       return {{[=](MachineInstrBuilder &MIB) {
4520         MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4521       }}};
4522     }
4523     // Non-inlineable splat floats should not fall through to the integer
4524     // immediate checks.
4525     return {};
4526   }
4527 
4528   APInt ICst;
4529   if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4530     if (TII.isInlineConstant(ICst)) {
4531       return {
4532           {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4533     }
4534   }
4535 
4536   return {};
4537 }
4538 
4539 InstructionSelector::ComplexRendererFns
4540 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4541   Register Src =
4542       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4543   unsigned Key = 0;
4544 
4545   Register ShiftSrc;
4546   std::optional<ValueAndVReg> ShiftAmt;
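       // A logical shift right by a multiple of 8 selects that byte of the 32-bit
       // source, so the shift can be folded into index_key (e.g. src >> 16 gives
       // key 2).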
4547   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4548       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4549       ShiftAmt->Value.getZExtValue() % 8 == 0) {
4550     Key = ShiftAmt->Value.getZExtValue() / 8;
4551     Src = ShiftSrc;
4552   }
4553 
4554   return {{
4555       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4556       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4557   }};
4558 }
4559 
4560 InstructionSelector::ComplexRendererFns
4561 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4562 
4563   Register Src =
4564       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4565   unsigned Key = 0;
4566 
4567   Register ShiftSrc;
4568   std::optional<ValueAndVReg> ShiftAmt;
4569   if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4570       MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4571       ShiftAmt->Value.getZExtValue() == 16) {
4572     Src = ShiftSrc;
4573     Key = 1;
4574   }
4575 
4576   return {{
4577       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4578       [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4579   }};
4580 }
4581 
4582 InstructionSelector::ComplexRendererFns
4583 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4584   Register Src;
4585   unsigned Mods;
4586   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4587 
4588   // FIXME: Handle op_sel
4589   return {{
4590       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4591       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4592   }};
4593 }
4594 
4595 InstructionSelector::ComplexRendererFns
4596 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4597   Register Src;
4598   unsigned Mods;
4599   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4600                                            /*IsCanonicalizing=*/true,
4601                                            /*AllowAbs=*/false,
4602                                            /*OpSel=*/false);
4603 
4604   return {{
4605       [=](MachineInstrBuilder &MIB) {
4606         MIB.addReg(
4607             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4608       },
4609       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4610   }};
4611 }
4612 
4613 InstructionSelector::ComplexRendererFns
4614 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4615   Register Src;
4616   unsigned Mods;
4617   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4618                                            /*IsCanonicalizing=*/true,
4619                                            /*AllowAbs=*/false,
4620                                            /*OpSel=*/true);
4621 
4622   return {{
4623       [=](MachineInstrBuilder &MIB) {
4624         MIB.addReg(
4625             copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4626       },
4627       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4628   }};
4629 }
4630 
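     // Match an SMRD address as an SGPR base plus, depending on which of SOffset
     // and Offset are requested, a 32-bit SGPR offset and/or an encoded immediate
     // offset.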
4631 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4632                                                  Register &Base,
4633                                                  Register *SOffset,
4634                                                  int64_t *Offset) const {
4635   MachineInstr *MI = Root.getParent();
4636   MachineBasicBlock *MBB = MI->getParent();
4637 
4638   // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
4639   // then we can select all ptr + 32-bit offsets.
4640   SmallVector<GEPInfo, 4> AddrInfo;
4641   getAddrModeInfo(*MI, *MRI, AddrInfo);
4642 
4643   if (AddrInfo.empty())
4644     return false;
4645 
4646   const GEPInfo &GEPI = AddrInfo[0];
4647   std::optional<int64_t> EncodedImm;
4648 
4649   if (SOffset && Offset) {
4650     EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4651                                               /*HasSOffset=*/true);
4652     if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4653         AddrInfo.size() > 1) {
4654       const GEPInfo &GEPI2 = AddrInfo[1];
4655       if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4656         if (Register OffsetReg =
4657                 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4658           Base = GEPI2.SgprParts[0];
4659           *SOffset = OffsetReg;
4660           *Offset = *EncodedImm;
4661           if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4662             return true;
4663 
4664           // For unbuffered smem loads, it is illegal for the Immediate Offset
4665           // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4666           // is negative. Handle the case where the Immediate Offset + SOffset
4667           // is negative.
4668           auto SKnown = KB->getKnownBits(*SOffset);
4669           if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4670             return false;
4671 
4672           return true;
4673         }
4674       }
4675     }
4676     return false;
4677   }
4678 
4679   EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4680                                             /*HasSOffset=*/false);
4681   if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4682     Base = GEPI.SgprParts[0];
4683     *Offset = *EncodedImm;
4684     return true;
4685   }
4686 
4687   // SGPR offset is unsigned.
4688   if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4689       GEPI.Imm != 0) {
4690     // If we make it this far we have a load with a 32-bit immediate offset.
4691     // It is OK to select this using an SGPR offset, because we have already
4692     // failed trying to select this load into one of the _IMM variants since
4693     // the _IMM patterns are considered before the _SGPR patterns.
4694     Base = GEPI.SgprParts[0];
4695     *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4696     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4697         .addImm(GEPI.Imm);
4698     return true;
4699   }
4700 
4701   if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4702     if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4703       Base = GEPI.SgprParts[0];
4704       *SOffset = OffsetReg;
4705       return true;
4706     }
4707   }
4708 
4709   return false;
4710 }
4711 
4712 InstructionSelector::ComplexRendererFns
4713 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4714   Register Base;
4715   int64_t Offset;
4716   if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4717     return std::nullopt;
4718 
4719   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4720            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4721 }
4722 
4723 InstructionSelector::ComplexRendererFns
4724 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4725   SmallVector<GEPInfo, 4> AddrInfo;
4726   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4727 
4728   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4729     return std::nullopt;
4730 
4731   const GEPInfo &GEPInfo = AddrInfo[0];
4732   Register PtrReg = GEPInfo.SgprParts[0];
4733   std::optional<int64_t> EncodedImm =
4734       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4735   if (!EncodedImm)
4736     return std::nullopt;
4737 
4738   return {{
4739     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4740     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4741   }};
4742 }
4743 
4744 InstructionSelector::ComplexRendererFns
4745 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4746   Register Base, SOffset;
4747   if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4748     return std::nullopt;
4749 
4750   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4751            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4752 }
4753 
4754 InstructionSelector::ComplexRendererFns
4755 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4756   Register Base, SOffset;
4757   int64_t Offset;
4758   if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4759     return std::nullopt;
4760 
4761   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4762            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4763            [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4764 }
4765 
4766 std::pair<Register, int>
4767 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4768                                                 uint64_t FlatVariant) const {
4769   MachineInstr *MI = Root.getParent();
4770 
4771   auto Default = std::pair(Root.getReg(), 0);
4772 
4773   if (!STI.hasFlatInstOffsets())
4774     return Default;
4775 
4776   Register PtrBase;
4777   int64_t ConstOffset;
4778   std::tie(PtrBase, ConstOffset) =
4779       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4780 
4781   if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4782                            !isFlatScratchBaseLegal(Root.getReg())))
4783     return Default;
4784 
4785   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4786   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4787     return Default;
4788 
4789   return std::pair(PtrBase, ConstOffset);
4790 }
4791 
4792 InstructionSelector::ComplexRendererFns
4793 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4794   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4795 
4796   return {{
4797       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4798       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4799     }};
4800 }
4801 
4802 InstructionSelector::ComplexRendererFns
4803 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4804   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4805 
4806   return {{
4807       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4808       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4809   }};
4810 }
4811 
4812 InstructionSelector::ComplexRendererFns
4813 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4814   auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4815 
4816   return {{
4817       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4818       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4819     }};
4820 }
4821 
4822 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4823 InstructionSelector::ComplexRendererFns
4824 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4825   Register Addr = Root.getReg();
4826   Register PtrBase;
4827   int64_t ConstOffset;
4828   int64_t ImmOffset = 0;
4829 
4830   // Match the immediate offset first, which canonically is moved as low as
4831   // possible.
4832   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4833 
4834   if (ConstOffset != 0) {
4835     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4836                               SIInstrFlags::FlatGlobal)) {
4837       Addr = PtrBase;
4838       ImmOffset = ConstOffset;
4839     } else {
4840       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4841       if (isSGPR(PtrBaseDef->Reg)) {
4842         if (ConstOffset > 0) {
4843           // Offset is too large.
4844           //
4845           // saddr + large_offset -> saddr +
4846           //                         (voffset = large_offset & ~MaxOffset) +
4847           //                         (large_offset & MaxOffset);
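               // For example (illustrative only; MaxOffset depends on the subtarget's
               // FLAT offset width), with MaxOffset = 0xFFF and ConstOffset = 0x12345:
               //   voffset = 0x12345 & ~0xFFF = 0x12000, imm = 0x12345 & 0xFFF = 0x345.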
4848           int64_t SplitImmOffset, RemainderOffset;
4849           std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4850               ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4851 
4852           if (isUInt<32>(RemainderOffset)) {
4853             MachineInstr *MI = Root.getParent();
4854             MachineBasicBlock *MBB = MI->getParent();
4855             Register HighBits =
4856                 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4857 
4858             BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4859                     HighBits)
4860                 .addImm(RemainderOffset);
4861 
4862             return {{
4863                 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4864                 [=](MachineInstrBuilder &MIB) {
4865                   MIB.addReg(HighBits);
4866                 }, // voffset
4867                 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4868             }};
4869           }
4870         }
4871 
4872         // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4873         // is 1, we would need to perform 1 or 2 extra moves for each half of
4874         // the constant, and it is better to do a scalar add and then issue a
4875         // single VALU instruction to materialize zero. Otherwise it takes fewer
4876         // instructions to perform VALU adds with immediates or inline literals.
4877         unsigned NumLiterals =
4878             !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
4879             !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
4880         if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4881           return std::nullopt;
4882       }
4883     }
4884   }
4885 
4886   // Match the variable offset.
4887   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4888   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4889     // Look through the SGPR->VGPR copy.
4890     Register SAddr =
4891         getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4892 
4893     if (isSGPR(SAddr)) {
4894       Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4895 
4896       // It's possible voffset is an SGPR here, but the copy to VGPR will be
4897       // inserted later.
4898       if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4899         return {{[=](MachineInstrBuilder &MIB) { // saddr
4900                    MIB.addReg(SAddr);
4901                  },
4902                  [=](MachineInstrBuilder &MIB) { // voffset
4903                    MIB.addReg(VOffset);
4904                  },
4905                  [=](MachineInstrBuilder &MIB) { // offset
4906                    MIB.addImm(ImmOffset);
4907                  }}};
4908       }
4909     }
4910   }
4911 
4912   // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4913   // drop this.
4914   if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4915       AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4916     return std::nullopt;
4917 
4918   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4919   // moves required to copy a 64-bit SGPR to VGPR.
4920   MachineInstr *MI = Root.getParent();
4921   MachineBasicBlock *MBB = MI->getParent();
4922   Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4923 
4924   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4925       .addImm(0);
4926 
4927   return {{
4928       [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4929       [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
4930       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }     // offset
4931   }};
4932 }
4933 
4934 InstructionSelector::ComplexRendererFns
4935 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4936   Register Addr = Root.getReg();
4937   Register PtrBase;
4938   int64_t ConstOffset;
4939   int64_t ImmOffset = 0;
4940 
4941   // Match the immediate offset first, which canonically is moved as low as
4942   // possible.
4943   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4944 
4945   if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4946       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4947                             SIInstrFlags::FlatScratch)) {
4948     Addr = PtrBase;
4949     ImmOffset = ConstOffset;
4950   }
4951 
4952   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4953   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4954     int FI = AddrDef->MI->getOperand(1).getIndex();
4955     return {{
4956         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4957         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4958     }};
4959   }
4960 
4961   Register SAddr = AddrDef->Reg;
4962 
4963   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4964     Register LHS = AddrDef->MI->getOperand(1).getReg();
4965     Register RHS = AddrDef->MI->getOperand(2).getReg();
4966     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4967     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4968 
4969     if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4970         isSGPR(RHSDef->Reg)) {
4971       int FI = LHSDef->MI->getOperand(1).getIndex();
4972       MachineInstr &I = *Root.getParent();
4973       MachineBasicBlock *BB = I.getParent();
4974       const DebugLoc &DL = I.getDebugLoc();
4975       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4976 
4977       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4978           .addFrameIndex(FI)
4979           .addReg(RHSDef->Reg)
4980           .setOperandDead(3); // Dead scc
4981     }
4982   }
4983 
4984   if (!isSGPR(SAddr))
4985     return std::nullopt;
4986 
4987   return {{
4988       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4989       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4990   }};
4991 }
4992 
4993 // Check whether the flat scratch SVS swizzle bug affects this access.
4994 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4995     Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4996   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4997     return false;
4998 
4999   // The bug affects the swizzling of SVS accesses if there is any carry out
5000   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5001   // voffset to (soffset + inst_offset).
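       // For example, if the maximum possible VAddr has low two bits 3 and the
       // maximum possible (SAddr + ImmOffset) has low two bits 1, the worst-case
       // check 3 + 1 >= 4 triggers and we conservatively report the bug as possible.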
5002   auto VKnown = KB->getKnownBits(VAddr);
5003   auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
5004                                KnownBits::makeConstant(APInt(32, ImmOffset)));
5005   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5006   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5007   return (VMax & 3) + (SMax & 3) >= 4;
5008 }
5009 
5010 InstructionSelector::ComplexRendererFns
5011 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5012   Register Addr = Root.getReg();
5013   Register PtrBase;
5014   int64_t ConstOffset;
5015   int64_t ImmOffset = 0;
5016 
5017   // Match the immediate offset first, which canonically is moved as low as
5018   // possible.
5019   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5020 
5021   Register OrigAddr = Addr;
5022   if (ConstOffset != 0 &&
5023       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
5024     Addr = PtrBase;
5025     ImmOffset = ConstOffset;
5026   }
5027 
5028   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5029   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5030     return std::nullopt;
5031 
5032   Register RHS = AddrDef->MI->getOperand(2).getReg();
5033   if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5034     return std::nullopt;
5035 
5036   Register LHS = AddrDef->MI->getOperand(1).getReg();
5037   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5038 
5039   if (OrigAddr != Addr) {
5040     if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5041       return std::nullopt;
5042   } else {
5043     if (!isFlatScratchBaseLegalSV(OrigAddr))
5044       return std::nullopt;
5045   }
5046 
5047   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5048     return std::nullopt;
5049 
5050   if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5051     int FI = LHSDef->MI->getOperand(1).getIndex();
5052     return {{
5053         [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5054         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5055         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5056     }};
5057   }
5058 
5059   if (!isSGPR(LHS))
5060     return std::nullopt;
5061 
5062   return {{
5063       [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5064       [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5065       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5066   }};
5067 }
5068 
5069 InstructionSelector::ComplexRendererFns
5070 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5071   MachineInstr *MI = Root.getParent();
5072   MachineBasicBlock *MBB = MI->getParent();
5073   MachineFunction *MF = MBB->getParent();
5074   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5075 
5076   int64_t Offset = 0;
5077   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5078       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5079     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5080 
5081     // TODO: Should this be inside the render function? The iterator seems to
5082     // move.
5083     const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5084     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5085             HighBits)
5086         .addImm(Offset & ~MaxOffset);
5087 
5088     return {{[=](MachineInstrBuilder &MIB) { // rsrc
5089                MIB.addReg(Info->getScratchRSrcReg());
5090              },
5091              [=](MachineInstrBuilder &MIB) { // vaddr
5092                MIB.addReg(HighBits);
5093              },
5094              [=](MachineInstrBuilder &MIB) { // soffset
5095                // Use constant zero for soffset and rely on eliminateFrameIndex
5096                // to choose the appropriate frame register if need be.
5097                MIB.addImm(0);
5098              },
5099              [=](MachineInstrBuilder &MIB) { // offset
5100                MIB.addImm(Offset & MaxOffset);
5101              }}};
5102   }
5103 
5104   assert(Offset == 0 || Offset == -1);
5105 
5106   // Try to fold a frame index directly into the MUBUF vaddr field, and any
5107   // offsets.
5108   std::optional<int> FI;
5109   Register VAddr = Root.getReg();
5110 
5111   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5112   Register PtrBase;
5113   int64_t ConstOffset;
5114   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5115   if (ConstOffset != 0) {
5116     if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5117         (!STI.privateMemoryResourceIsRangeChecked() ||
5118          KB->signBitIsZero(PtrBase))) {
5119       const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5120       if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5121         FI = PtrBaseDef->getOperand(1).getIndex();
5122       else
5123         VAddr = PtrBase;
5124       Offset = ConstOffset;
5125     }
5126   } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5127     FI = RootDef->getOperand(1).getIndex();
5128   }
5129 
5130   return {{[=](MachineInstrBuilder &MIB) { // rsrc
5131              MIB.addReg(Info->getScratchRSrcReg());
5132            },
5133            [=](MachineInstrBuilder &MIB) { // vaddr
5134              if (FI)
5135                MIB.addFrameIndex(*FI);
5136              else
5137                MIB.addReg(VAddr);
5138            },
5139            [=](MachineInstrBuilder &MIB) { // soffset
5140              // Use constant zero for soffset and rely on eliminateFrameIndex
5141              // to choose the appropriate frame register if need be.
5142              MIB.addImm(0);
5143            },
5144            [=](MachineInstrBuilder &MIB) { // offset
5145              MIB.addImm(Offset);
5146            }}};
5147 }
5148 
5149 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5150                                                 int64_t Offset) const {
5151   if (!isUInt<16>(Offset))
5152     return false;
5153 
5154   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5155     return true;
5156 
5157   // On Southern Islands, instructions with a negative base value and an offset
5158   // don't seem to work.
5159   return KB->signBitIsZero(Base);
5160 }
5161 
5162 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5163                                                  int64_t Offset1,
5164                                                  unsigned Size) const {
5165   if (Offset0 % Size != 0 || Offset1 % Size != 0)
5166     return false;
5167   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5168     return false;
5169 
5170   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5171     return true;
5172 
5173   // On Southern Islands, instructions with a negative base value and an offset
5174   // don't seem to work.
5175   return KB->signBitIsZero(Base);
5176 }
5177 
5178 // Return whether the operation has the NoUnsignedWrap property.
5179 static bool isNoUnsignedWrap(MachineInstr *Addr) {
5180   return Addr->getOpcode() == TargetOpcode::G_OR ||
5181          (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5182           Addr->getFlag(MachineInstr::NoUWrap));
5183 }
5184 
5185 // Check that the base address of a flat scratch load/store in the form `base +
5186 // offset` is legal to be put in an SGPR/VGPR (i.e. is unsigned, per the hardware
5187 // requirement). We always treat the first operand as the base address here.
5188 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5189   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5190 
5191   if (isNoUnsignedWrap(AddrMI))
5192     return true;
5193 
5194   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5195   // values.
5196   if (STI.hasSignedScratchOffsets())
5197     return true;
5198 
5199   Register LHS = AddrMI->getOperand(1).getReg();
5200   Register RHS = AddrMI->getOperand(2).getReg();
5201 
5202   if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5203     std::optional<ValueAndVReg> RhsValReg =
5204         getIConstantVRegValWithLookThrough(RHS, *MRI);
5205     // If the immediate offset is negative and within a certain range, the base
5206     // address cannot also be negative. If the base were also negative, the sum
5207     // would be either negative or much larger than the valid range of scratch
5208     // memory a thread can access.
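         // For example (illustrative values): with an offset of -16, a base of
         // 0x80000000 would give an unsigned sum of 0x7FFFFFF0, roughly 2 GiB,
         // which is far beyond any scratch a thread can access, so a legal access
         // implies the base was non-negative.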
5209     if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5210         RhsValReg->Value.getSExtValue() > -0x40000000)
5211       return true;
5212   }
5213 
5214   return KB->signBitIsZero(LHS);
5215 }
5216 
5217 // Check that the address values in SGPR/VGPR are legal for flat scratch in the
5218 // form: SGPR + VGPR.
5219 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5220   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5221 
5222   if (isNoUnsignedWrap(AddrMI))
5223     return true;
5224 
5225   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5226   // values.
5227   if (STI.hasSignedScratchOffsets())
5228     return true;
5229 
5230   Register LHS = AddrMI->getOperand(1).getReg();
5231   Register RHS = AddrMI->getOperand(2).getReg();
5232   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5233 }
5234 
5235 // Check that the address values in SGPR/VGPR are legal for flat scratch in the
5236 // form: SGPR + VGPR + Imm.
5237 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5238     Register Addr) const {
5239   // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5240   // values.
5241   if (STI.hasSignedScratchOffsets())
5242     return true;
5243 
5244   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5245   Register Base = AddrMI->getOperand(1).getReg();
5246   std::optional<DefinitionAndSourceRegister> BaseDef =
5247       getDefSrcRegIgnoringCopies(Base, *MRI);
5248   std::optional<ValueAndVReg> RHSOffset =
5249       getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5250   assert(RHSOffset);
5251 
5252   // If the immediate offset is negative and within a certain range, the base
5253   // address cannot also be negative. If the base were also negative, the sum
5254   // would be either negative or much larger than the valid range of scratch
5255   // memory a thread can access.
5256   if (isNoUnsignedWrap(BaseDef->MI) &&
5257       (isNoUnsignedWrap(AddrMI) ||
5258        (RHSOffset->Value.getSExtValue() < 0 &&
5259         RHSOffset->Value.getSExtValue() > -0x40000000)))
5260     return true;
5261 
5262   Register LHS = BaseDef->MI->getOperand(1).getReg();
5263   Register RHS = BaseDef->MI->getOperand(2).getReg();
5264   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5265 }
5266 
5267 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5268                                                     unsigned ShAmtBits) const {
5269   assert(MI.getOpcode() == TargetOpcode::G_AND);
5270 
5271   std::optional<APInt> RHS =
5272       getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5273   if (!RHS)
5274     return false;
5275 
5276   if (RHS->countr_one() >= ShAmtBits)
5277     return true;
5278 
5279   const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
5280   return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5281 }
5282 
5283 InstructionSelector::ComplexRendererFns
5284 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5285     MachineOperand &Root) const {
5286   Register Reg = Root.getReg();
5287   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5288 
5289   std::optional<DefinitionAndSourceRegister> Def =
5290     getDefSrcRegIgnoringCopies(Reg, *MRI);
5291   assert(Def && "this shouldn't be an optional result");
5292   Reg = Def->Reg;
5293 
5294   if (Register WaveBase = getWaveAddress(Def->MI)) {
5295     return {{
5296         [=](MachineInstrBuilder &MIB) { // rsrc
5297           MIB.addReg(Info->getScratchRSrcReg());
5298         },
5299         [=](MachineInstrBuilder &MIB) { // soffset
5300           MIB.addReg(WaveBase);
5301         },
5302         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5303     }};
5304   }
5305 
5306   int64_t Offset = 0;
5307 
5308   // FIXME: Copy check is a hack
5309   Register BasePtr;
5310   if (mi_match(Reg, *MRI,
5311                m_GPtrAdd(m_Reg(BasePtr),
5312                          m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
5313     if (!TII.isLegalMUBUFImmOffset(Offset))
5314       return {};
5315     MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5316     Register WaveBase = getWaveAddress(BasePtrDef);
5317     if (!WaveBase)
5318       return {};
5319 
5320     return {{
5321         [=](MachineInstrBuilder &MIB) { // rsrc
5322           MIB.addReg(Info->getScratchRSrcReg());
5323         },
5324         [=](MachineInstrBuilder &MIB) { // soffset
5325           MIB.addReg(WaveBase);
5326         },
5327         [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5328     }};
5329   }
5330 
5331   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5332       !TII.isLegalMUBUFImmOffset(Offset))
5333     return {};
5334 
5335   return {{
5336       [=](MachineInstrBuilder &MIB) { // rsrc
5337         MIB.addReg(Info->getScratchRSrcReg());
5338       },
5339       [=](MachineInstrBuilder &MIB) { // soffset
5340         MIB.addImm(0);
5341       },
5342       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5343   }};
5344 }
5345 
5346 std::pair<Register, unsigned>
5347 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5348   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5349   int64_t ConstAddr = 0;
5350 
5351   Register PtrBase;
5352   int64_t Offset;
5353   std::tie(PtrBase, Offset) =
5354     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5355 
5356   if (Offset) {
5357     if (isDSOffsetLegal(PtrBase, Offset)) {
5358       // (add n0, c0)
5359       return std::pair(PtrBase, Offset);
5360     }
5361   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5362     // TODO
5363 
5364 
5365   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5366     // TODO
5367 
5368   }
5369 
5370   return std::pair(Root.getReg(), 0);
5371 }
5372 
5373 InstructionSelector::ComplexRendererFns
5374 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5375   Register Reg;
5376   unsigned Offset;
5377   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5378   return {{
5379       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5380       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5381     }};
5382 }
5383 
5384 InstructionSelector::ComplexRendererFns
5385 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5386   return selectDSReadWrite2(Root, 4);
5387 }
5388 
5389 InstructionSelector::ComplexRendererFns
5390 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5391   return selectDSReadWrite2(Root, 8);
5392 }
5393 
5394 InstructionSelector::ComplexRendererFns
5395 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5396                                               unsigned Size) const {
5397   Register Reg;
5398   unsigned Offset;
5399   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5400   return {{
5401       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5402       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5403       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5404     }};
5405 }
5406 
5407 std::pair<Register, unsigned>
5408 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5409                                                   unsigned Size) const {
5410   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5411   int64_t ConstAddr = 0;
5412 
5413   Register PtrBase;
5414   int64_t Offset;
5415   std::tie(PtrBase, Offset) =
5416     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5417 
5418   if (Offset) {
5419     int64_t OffsetValue0 = Offset;
5420     int64_t OffsetValue1 = Offset + Size;
5421     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5422       // (add n0, c0)
5423       return std::pair(PtrBase, OffsetValue0 / Size);
5424     }
5425   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5426     // TODO
5427 
5428   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5429     // TODO
5430 
5431   }
5432 
5433   return std::pair(Root.getReg(), 0);
5434 }
5435 
5436 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right-hand side, return
5437 /// the base value with the constant offset. There may be intervening copies
5438 /// between \p Root and the identified constant. Returns {\p Root, 0} if this
5439 /// does not match the pattern.
5440 std::pair<Register, int64_t>
5441 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5442   Register Root, const MachineRegisterInfo &MRI) const {
5443   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5444   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5445     return {Root, 0};
5446 
5447   MachineOperand &RHS = RootI->getOperand(2);
5448   std::optional<ValueAndVReg> MaybeOffset =
5449       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5450   if (!MaybeOffset)
5451     return {Root, 0};
5452   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5453 }
5454 
5455 static void addZeroImm(MachineInstrBuilder &MIB) {
5456   MIB.addImm(0);
5457 }
5458 
5459 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5460 /// BasePtr is not valid, a null base pointer will be used.
5461 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5462                           uint32_t FormatLo, uint32_t FormatHi,
5463                           Register BasePtr) {
5464   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5465   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5466   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5467   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5468 
5469   B.buildInstr(AMDGPU::S_MOV_B32)
5470     .addDef(RSrc2)
5471     .addImm(FormatLo);
5472   B.buildInstr(AMDGPU::S_MOV_B32)
5473     .addDef(RSrc3)
5474     .addImm(FormatHi);
5475 
5476   // Build the half of the register that holds the constants before building the
5477   // full 128-bit register. If we are building multiple resource descriptors,
5478   // this will allow CSEing of the 2-component register.
5479   B.buildInstr(AMDGPU::REG_SEQUENCE)
5480     .addDef(RSrcHi)
5481     .addReg(RSrc2)
5482     .addImm(AMDGPU::sub0)
5483     .addReg(RSrc3)
5484     .addImm(AMDGPU::sub1);
5485 
5486   Register RSrcLo = BasePtr;
5487   if (!BasePtr) {
5488     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5489     B.buildInstr(AMDGPU::S_MOV_B64)
5490       .addDef(RSrcLo)
5491       .addImm(0);
5492   }
5493 
5494   B.buildInstr(AMDGPU::REG_SEQUENCE)
5495     .addDef(RSrc)
5496     .addReg(RSrcLo)
5497     .addImm(AMDGPU::sub0_sub1)
5498     .addReg(RSrcHi)
5499     .addImm(AMDGPU::sub2_sub3);
5500 
5501   return RSrc;
5502 }
5503 
5504 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5505                                 const SIInstrInfo &TII, Register BasePtr) {
5506   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5507 
5508   // FIXME: Why are half the "default" bits ignored based on the addressing
5509   // mode?
5510   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5511 }
5512 
5513 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5514                                const SIInstrInfo &TII, Register BasePtr) {
5515   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5516 
5517   // FIXME: Why are half the "default" bits ignored based on the addressing
5518   // mode?
5519   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5520 }
5521 
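     // Decompose Src into a base (N0), an optional 32-bit constant offset (Offset),
     // and, if the base is itself a G_PTR_ADD, its two addends (N2 and N3).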
5522 AMDGPUInstructionSelector::MUBUFAddressData
5523 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5524   MUBUFAddressData Data;
5525   Data.N0 = Src;
5526 
5527   Register PtrBase;
5528   int64_t Offset;
5529 
5530   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5531   if (isUInt<32>(Offset)) {
5532     Data.N0 = PtrBase;
5533     Data.Offset = Offset;
5534   }
5535 
5536   if (MachineInstr *InputAdd
5537       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5538     Data.N2 = InputAdd->getOperand(1).getReg();
5539     Data.N3 = InputAdd->getOperand(2).getReg();
5540 
5541     // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
5542     // FIXME: We don't know that this value was defined by operand 0.
5543     //
5544     // TODO: Remove this when we have copy folding optimizations after
5545     // RegBankSelect.
5546     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5547     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5548   }
5549 
5550   return Data;
5551 }
5552 
5553 /// Return whether the addr64 MUBUF mode should be used for the given address.
5554 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5555   // (ptr_add N2, N3) -> addr64, or
5556   // (ptr_add (ptr_add N2, N3), C1) -> addr64
5557   if (Addr.N2)
5558     return true;
5559 
5560   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5561   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5562 }
5563 
5564 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
5565 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5566 /// component.
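     /// For example (assuming a 12-bit unsigned MUBUF offset field), an offset of
     /// 0x5000 does not fit, so the whole value is moved into a newly created
     /// \p SOffset register and \p ImmOffset is cleared to 0.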
5567 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5568   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5569   if (TII.isLegalMUBUFImmOffset(ImmOffset))
5570     return;
5571 
5572   // Illegal offset, store it in soffset.
5573   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5574   B.buildInstr(AMDGPU::S_MOV_B32)
5575     .addDef(SOffset)
5576     .addImm(ImmOffset);
5577   ImmOffset = 0;
5578 }
5579 
5580 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5581   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5582   Register &SOffset, int64_t &Offset) const {
5583   // FIXME: Predicates should stop this from reaching here.
5584   // The addr64 bit was removed for Volcanic Islands.
5585   if (!STI.hasAddr64() || STI.useFlatForGlobal())
5586     return false;
5587 
5588   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5589   if (!shouldUseAddr64(AddrData))
5590     return false;
5591 
5592   Register N0 = AddrData.N0;
5593   Register N2 = AddrData.N2;
5594   Register N3 = AddrData.N3;
5595   Offset = AddrData.Offset;
5596 
5597   // Base pointer for the SRD.
5598   Register SRDPtr;
5599 
5600   if (N2) {
5601     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5602       assert(N3);
5603       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5604         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5605         // addr64, and construct the default resource from a 0 address.
5606         VAddr = N0;
5607       } else {
5608         SRDPtr = N3;
5609         VAddr = N2;
5610       }
5611     } else {
5612       // N2 is not divergent.
5613       SRDPtr = N2;
5614       VAddr = N3;
5615     }
5616   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5617     // Use the default null pointer in the resource
5618     VAddr = N0;
5619   } else {
5620     // N0 -> offset, or
5621     // (N0 + C1) -> offset
5622     SRDPtr = N0;
5623   }
5624 
5625   MachineIRBuilder B(*Root.getParent());
5626   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5627   splitIllegalMUBUFOffset(B, SOffset, Offset);
5628   return true;
5629 }
5630 
5631 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5632   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5633   int64_t &Offset) const {
5634 
5635   // FIXME: Pattern should not reach here.
5636   if (STI.useFlatForGlobal())
5637     return false;
5638 
5639   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5640   if (shouldUseAddr64(AddrData))
5641     return false;
5642 
5643   // N0 -> offset, or
5644   // (N0 + C1) -> offset
5645   Register SRDPtr = AddrData.N0;
5646   Offset = AddrData.Offset;
5647 
5648   // TODO: Look through extensions for 32-bit soffset.
5649   MachineIRBuilder B(*Root.getParent());
5650 
5651   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5652   splitIllegalMUBUFOffset(B, SOffset, Offset);
5653   return true;
5654 }
5655 
5656 InstructionSelector::ComplexRendererFns
5657 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5658   Register VAddr;
5659   Register RSrcReg;
5660   Register SOffset;
5661   int64_t Offset = 0;
5662 
5663   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5664     return {};
5665 
5666   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5667   // pattern.
5668   return {{
5669       [=](MachineInstrBuilder &MIB) {  // rsrc
5670         MIB.addReg(RSrcReg);
5671       },
5672       [=](MachineInstrBuilder &MIB) { // vaddr
5673         MIB.addReg(VAddr);
5674       },
5675       [=](MachineInstrBuilder &MIB) { // soffset
5676         if (SOffset)
5677           MIB.addReg(SOffset);
5678         else if (STI.hasRestrictedSOffset())
5679           MIB.addReg(AMDGPU::SGPR_NULL);
5680         else
5681           MIB.addImm(0);
5682       },
5683       [=](MachineInstrBuilder &MIB) { // offset
5684         MIB.addImm(Offset);
5685       },
5686       addZeroImm, //  cpol
5687       addZeroImm, //  tfe
5688       addZeroImm  //  swz
5689     }};
5690 }
5691 
5692 InstructionSelector::ComplexRendererFns
5693 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5694   Register RSrcReg;
5695   Register SOffset;
5696   int64_t Offset = 0;
5697 
5698   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5699     return {};
5700 
5701   return {{
5702       [=](MachineInstrBuilder &MIB) {  // rsrc
5703         MIB.addReg(RSrcReg);
5704       },
5705       [=](MachineInstrBuilder &MIB) { // soffset
5706         if (SOffset)
5707           MIB.addReg(SOffset);
5708         else if (STI.hasRestrictedSOffset())
5709           MIB.addReg(AMDGPU::SGPR_NULL);
5710         else
5711           MIB.addImm(0);
5712       },
5713       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5714       addZeroImm, //  cpol
5715       addZeroImm, //  tfe
5716       addZeroImm, //  swz
5717     }};
5718 }
5719 
5720 InstructionSelector::ComplexRendererFns
5721 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5722 
5723   Register SOffset = Root.getReg();
5724 
5725   if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5726     SOffset = AMDGPU::SGPR_NULL;
5727 
5728   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5729 }
5730 
5731 /// Get an immediate that must fit in 32 bits and is treated as zero-extended.
5732 static std::optional<uint64_t>
5733 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5734   // getIConstantVRegSExtVal sign-extends the value; check if that matters.
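  // e.g. a G_CONSTANT of 0xFFFFFFFF comes back as -1; isInt<32> still holds
  // and Lo_32 recovers the intended zero-extended value.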
5735   std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5736   if (!OffsetVal || !isInt<32>(*OffsetVal))
5737     return std::nullopt;
5738   return Lo_32(*OffsetVal);
5739 }
5740 
5741 InstructionSelector::ComplexRendererFns
5742 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5743   std::optional<uint64_t> OffsetVal =
5744       Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
5745   if (!OffsetVal)
5746     return {};
5747 
5748   std::optional<int64_t> EncodedImm =
5749       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5750   if (!EncodedImm)
5751     return {};
5752 
5753   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
5754 }
5755 
5756 InstructionSelector::ComplexRendererFns
5757 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5758   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5759 
5760   std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5761   if (!OffsetVal)
5762     return {};
5763 
5764   std::optional<int64_t> EncodedImm =
5765       AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5766   if (!EncodedImm)
5767     return {};
5768 
5769   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
5770 }
5771 
5772 InstructionSelector::ComplexRendererFns
5773 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5774   // Match the (soffset + offset) pair as a 32-bit register base and
5775   // an immediate offset.
5776   Register SOffset;
5777   unsigned Offset;
5778   std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5779       *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5780   if (!SOffset)
5781     return std::nullopt;
5782 
5783   std::optional<int64_t> EncodedOffset =
5784       AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5785   if (!EncodedOffset)
5786     return std::nullopt;
5787 
5788   assert(MRI->getType(SOffset) == LLT::scalar(32));
5789   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5790            [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5791 }
5792 
5793 std::pair<Register, unsigned>
5794 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5795                                                      bool &Matched) const {
5796   Matched = false;
5797 
5798   Register Src;
5799   unsigned Mods;
5800   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5801 
5802   if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
5803     assert(MRI->getType(Src) == LLT::scalar(16));
5804 
5805     // Only change Src if a source modifier could be gained. In such cases the
5806     // new Src could be an SGPR, but this does not violate the constant bus
5807     // restriction for the instruction being selected.
5808     Src = stripBitCast(Src, *MRI);
5809 
5810     const auto CheckAbsNeg = [&]() {
5811       // Be careful about folding modifiers if we already have an abs. fneg is
5812       // applied last, so we don't want to apply an earlier fneg.
5813       if ((Mods & SISrcMods::ABS) == 0) {
5814         unsigned ModsTmp;
5815         std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
5816 
5817         if ((ModsTmp & SISrcMods::NEG) != 0)
5818           Mods ^= SISrcMods::NEG;
5819 
5820         if ((ModsTmp & SISrcMods::ABS) != 0)
5821           Mods |= SISrcMods::ABS;
5822       }
5823     };
5824 
5825     CheckAbsNeg();
5826 
5827     // op_sel/op_sel_hi decide the source type and source.
5828     // If the source's op_sel_hi is set, the source is converted from fp16. If
5829     // the source's op_sel is set, it picks the high half of the
5830     // source register.
5831 
5832     Mods |= SISrcMods::OP_SEL_1;
5833 
5834     if (isExtractHiElt(*MRI, Src, Src)) {
5835       Mods |= SISrcMods::OP_SEL_0;
5836       CheckAbsNeg();
5837     }
5838 
5839     Matched = true;
5840   }
5841 
5842   return {Src, Mods};
5843 }
5844 
5845 InstructionSelector::ComplexRendererFns
5846 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5847     MachineOperand &Root) const {
5848   Register Src;
5849   unsigned Mods;
5850   bool Matched;
5851   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5852   if (!Matched)
5853     return {};
5854 
5855   return {{
5856       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5857       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5858   }};
5859 }
5860 
5861 InstructionSelector::ComplexRendererFns
5862 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5863   Register Src;
5864   unsigned Mods;
5865   bool Matched;
5866   std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5867 
5868   return {{
5869       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5870       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5871   }};
5872 }
5873 
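// Select the s_barrier_signal_isfirst intrinsic: emit the _IMM form of the
// signal instruction with its immediate operand and copy the resulting SCC
// bit into the destination register.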
5874 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5875     MachineInstr &I, Intrinsic::ID IntrID) const {
5876   MachineBasicBlock *MBB = I.getParent();
5877   const DebugLoc &DL = I.getDebugLoc();
5878   Register CCReg = I.getOperand(0).getReg();
5879 
5880   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5881       .addImm(I.getOperand(2).getImm());
5882 
5883   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5884 
5885   I.eraseFromParent();
5886   return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5887                                       *MRI);
5888 }
5889 
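// Select s_get_barrier_state: use the immediate form when the barrier operand
// is a known constant, otherwise copy the operand into m0 and use the M0 form.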
5890 bool AMDGPUInstructionSelector::selectSGetBarrierState(
5891     MachineInstr &I, Intrinsic::ID IntrID) const {
5892   MachineBasicBlock *MBB = I.getParent();
5893   const DebugLoc &DL = I.getDebugLoc();
5894   MachineOperand BarOp = I.getOperand(2);
5895   std::optional<int64_t> BarValImm =
5896       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5897 
5898   if (!BarValImm) {
5899     auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5900                        .addReg(BarOp.getReg());
5901     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5902   }
5903   MachineInstrBuilder MIB;
5904   unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5905                            : AMDGPU::S_GET_BARRIER_STATE_M0;
5906   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5907 
5908   auto DstReg = I.getOperand(0).getReg();
5909   const TargetRegisterClass *DstRC =
5910       TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5911   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5912     return false;
5913   MIB.addDef(DstReg);
5914   if (BarValImm) {
5915     MIB.addImm(*BarValImm);
5916   }
5917   I.eraseFromParent();
5918   return true;
5919 }
5920 
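// Map a named-barrier intrinsic to the IMM or M0 flavor of its machine
// instruction, depending on whether the barrier ID is an inline constant.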
5921 static unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5922   if (HasInlineConst) {
5923     switch (IntrID) {
5924     default:
5925       llvm_unreachable("not a named barrier op");
5926     case Intrinsic::amdgcn_s_barrier_join:
5927       return AMDGPU::S_BARRIER_JOIN_IMM;
5928     case Intrinsic::amdgcn_s_get_named_barrier_state:
5929       return AMDGPU::S_GET_BARRIER_STATE_IMM;
5930     }
5931   } else {
5932     switch (IntrID) {
5933     default:
5934       llvm_unreachable("not a named barrier op");
5935     case Intrinsic::amdgcn_s_barrier_join:
5936       return AMDGPU::S_BARRIER_JOIN_M0;
5937     case Intrinsic::amdgcn_s_get_named_barrier_state:
5938       return AMDGPU::S_GET_BARRIER_STATE_M0;
5939     }
5940   }
5941 }
5942 
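// Select the named-barrier init/signal intrinsics: pack the barrier ID and
// member count into m0 and emit S_BARRIER_INIT_M0 or S_BARRIER_SIGNAL_M0.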
5943 bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5944     MachineInstr &I, Intrinsic::ID IntrID) const {
5945   MachineBasicBlock *MBB = I.getParent();
5946   const DebugLoc &DL = I.getDebugLoc();
5947   MachineOperand BarOp = I.getOperand(1);
5948   MachineOperand CntOp = I.getOperand(2);
5949 
5950   // BarID = (BarOp >> 4) & 0x3F
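  // i.e. the 6-bit barrier ID is taken from bits [9:4] of the operand.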
5951   Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5952   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5953       .add(BarOp)
5954       .addImm(4u)
5955       .setOperandDead(3); // Dead scc
5956 
5957   Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5958   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5959       .addReg(TmpReg0)
5960       .addImm(0x3F)
5961       .setOperandDead(3); // Dead scc
5962 
5963   // MO = ((CntOp & 0x3F) << ShAmt) | BarID
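  // i.e. m0 carries the 6-bit member count in bits [21:16] and the barrier ID
  // in bits [5:0].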
5964   Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5965   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5966       .add(CntOp)
5967       .addImm(0x3F)
5968       .setOperandDead(3); // Dead scc
5969 
5970   Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5971   constexpr unsigned ShAmt = 16;
5972   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5973       .addReg(TmpReg2)
5974       .addImm(ShAmt)
5975       .setOperandDead(3); // Dead scc
5976 
5977   Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5978   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5979       .addReg(TmpReg1)
5980       .addReg(TmpReg3)
5981       .setOperandDead(3); // Dead scc
5982 
5983   auto CopyMIB =
5984       BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5985   constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5986 
5987   unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5988                      ? AMDGPU::S_BARRIER_INIT_M0
5989                      : AMDGPU::S_BARRIER_SIGNAL_M0;
5990   MachineInstrBuilder MIB;
5991   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5992 
5993   I.eraseFromParent();
5994   return true;
5995 }
5996 
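// Select the remaining named-barrier operations (s_barrier_join,
// s_get_named_barrier_state): use the IMM form when the barrier ID operand is
// a constant, otherwise extract the ID into m0 and use the M0 form.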
5997 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5998     MachineInstr &I, Intrinsic::ID IntrID) const {
5999   MachineBasicBlock *MBB = I.getParent();
6000   const DebugLoc &DL = I.getDebugLoc();
6001   MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6002                              ? I.getOperand(2)
6003                              : I.getOperand(1);
6004   std::optional<int64_t> BarValImm =
6005       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6006 
6007   if (!BarValImm) {
6008     // BarID = (BarOp >> 4) & 0x3F
6009     Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6010     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6011         .addReg(BarOp.getReg())
6012         .addImm(4u)
6013         .setOperandDead(3); // Dead scc
6014 
6015     Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6016     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6017         .addReg(TmpReg0)
6018         .addImm(0x3F)
6019         .setOperandDead(3); // Dead scc
6020 
6021     auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6022                        .addReg(TmpReg1);
6023     constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6024   }
6025 
6026   MachineInstrBuilder MIB;
6027   unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6028   MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6029 
6030   if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6031     auto DstReg = I.getOperand(0).getReg();
6032     const TargetRegisterClass *DstRC =
6033         TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6034     if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6035       return false;
6036     MIB.addDef(DstReg);
6037   }
6038 
6039   if (BarValImm) {
6040     auto BarId = ((*BarValImm) >> 4) & 0x3F;
6041     MIB.addImm(BarId);
6042   }
6043 
6044   I.eraseFromParent();
6045   return true;
6046 }
6047 
6048 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6049                                                  const MachineInstr &MI,
6050                                                  int OpIdx) const {
6051   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6052          "Expected G_CONSTANT");
6053   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6054 }
6055 
6056 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6057                                                 const MachineInstr &MI,
6058                                                 int OpIdx) const {
6059   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6060          "Expected G_CONSTANT");
6061   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6062 }
6063 
6064 void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6065                                                    const MachineInstr &MI,
6066                                                    int OpIdx) const {
6067   const MachineOperand &Op = MI.getOperand(1);
6068   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6069   MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6070 }
6071 
6072 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6073                                                 const MachineInstr &MI,
6074                                                 int OpIdx) const {
6075   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6076          "Expected G_CONSTANT");
6077   MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6078 }
6079 
6080 /// This only really exists to satisfy the DAG type-checking machinery, so it
6081 /// is a no-op here.
6082 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6083                                                 const MachineInstr &MI,
6084                                                 int OpIdx) const {
6085   const MachineOperand &Op = MI.getOperand(OpIdx);
6086   int64_t Imm;
6087   if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6088     MIB.addImm(Imm);
6089   else
6090     MIB.addImm(Op.getImm());
6091 }
6092 
6093 void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6094                                                    const MachineInstr &MI,
6095                                                    int OpIdx) const {
6096   MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6097 }
6098 
6099 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6100                                                 const MachineInstr &MI,
6101                                                 int OpIdx) const {
6102   assert(OpIdx >= 0 && "expected to match an immediate operand");
6103   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6104 }
6105 
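// The renderSrcAndDstSelToOpSelXForm_* helpers below translate src/dst select
// bits packed into a matched immediate into SISrcMods op_sel values; which bit
// is tested (0x1 or 0x2) and whether DST_OP_SEL is also emitted differs per
// variant.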
6106 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6107     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6108   assert(OpIdx >= 0 && "expected to match an immediate operand");
6109   MIB.addImm(
6110       (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6111 }
6112 
6113 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6114     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6115   assert(OpIdx >= 0 && "expected to match an immediate operand");
6116   MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6117                  ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6118                  : (int64_t)SISrcMods::DST_OP_SEL);
6119 }
6120 
6121 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6122     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6123   assert(OpIdx >= 0 && "expected to match an immediate operand");
6124   MIB.addImm(
6125       (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6126 }
6127 
6128 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6129     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6130   assert(OpIdx >= 0 && "expected to match an immediate operand");
6131   MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6132                  ? (int64_t)(SISrcMods::OP_SEL_0)
6133                  : 0);
6134 }
6135 
6136 void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6137     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6138   assert(OpIdx >= 0 && "expected to match an immediate operand");
6139   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6140                                            : 0);
6141 }
6142 
6143 void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6144     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6145   assert(OpIdx >= 0 && "expected to match an immediate operand");
6146   MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6147                                            : 0);
6148 }
6149 
6150 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6151     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6152   assert(OpIdx >= 0 && "expected to match an immediate operand");
6153   MIB.addImm(
6154       (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6155 }
6156 
6157 void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6158     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6159   assert(OpIdx >= 0 && "expected to match an immediate operand");
6160   MIB.addImm(
6161       (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6162 }
6163 
6164 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6165                                                   const MachineInstr &MI,
6166                                                   int OpIdx) const {
6167   assert(OpIdx >= 0 && "expected to match an immediate operand");
6168   MIB.addImm(MI.getOperand(OpIdx).getImm() &
6169              (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6170                                        : AMDGPU::CPol::ALL_pregfx12));
6171 }
6172 
6173 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6174                                                  const MachineInstr &MI,
6175                                                  int OpIdx) const {
6176   assert(OpIdx >= 0 && "expected to match an immediate operand");
6177   const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6178                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
6179                                                  : AMDGPU::CPol::SWZ_pregfx12);
6180   MIB.addImm(Swizzle);
6181 }
6182 
6183 void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6184     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6185   assert(OpIdx >= 0 && "expected to match an immediate operand");
6186   const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6187                         (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6188                                                   : AMDGPU::CPol::ALL_pregfx12);
6189   MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6190 }
6191 
6192 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6193                                                  const MachineInstr &MI,
6194                                                  int OpIdx) const {
6195   MIB.addFrameIndex(MI.getOperand(1).getIndex());
6196 }
6197 
6198 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6199                                                        const MachineInstr &MI,
6200                                                        int OpIdx) const {
6201   const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6202   int ExpVal = APF.getExactLog2Abs();
6203   assert(ExpVal != INT_MIN);
6204   MIB.addImm(ExpVal);
6205 }
6206 
6207 void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6208                                                 const MachineInstr &MI,
6209                                                 int OpIdx) const {
6210   // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
6211   // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6212   // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
6213   // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
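  // i.e. the rotation (Imm + 3) % 4 maps 0 -> 3, 1 -> 0, 2 -> 1, 3 -> 2.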
6214   MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6215 }
6216 
6217 /// Convert from 2-bit value to enum values used for op_sel* source modifiers.
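/// i.e. 0 -> no modifier bits, 1 -> OP_SEL_0, 2 -> OP_SEL_1, 3 -> both.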
6218 void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6219     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6220   unsigned Val = MI.getOperand(OpIdx).getImm();
6221   unsigned New = 0;
6222   if (Val & 0x1)
6223     New |= SISrcMods::OP_SEL_0;
6224   if (Val & 0x2)
6225     New |= SISrcMods::OP_SEL_1;
6226   MIB.addImm(New);
6227 }
6228 
6229 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
6230   return TII.isInlineConstant(Imm);
6231 }
6232 
6233 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
6234   return TII.isInlineConstant(Imm);
6235 }
6236