//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
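// For example, both of the following yield Out = V:
//   (i16 (extract_vector_elt (v2i16 V), 1))
//   (i16 (trunc (srl (i32 (bitcast V)), 16)))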
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
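// For example, (i16 (extract_vector_elt (v2i16 V), 0)) reduces to V itself,
// since element 0 occupies the low 16 bits of the same register, and
// (i16 (trunc (i32 X))) reduces to X.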
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

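// Return true if the result of the 16-bit operation \p Opc is known to zero
// the high 16 bits of its 32-bit destination register on the current
// subtarget.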
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo.
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                          unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                              SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector <SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
    *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(
          N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

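// Materialize a 64-bit scalar immediate as two s_mov_b32 halves combined into
// a single 64-bit register with a REG_SEQUENCE.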
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
  SDNode *Hi = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                  "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq) {
    // Fall back to the generated matcher; return so the node is not also
    // morphed into a REG_SEQUENCE below.
    SelectCode(N);
    return;
  }
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.
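    // For example, a <1, 2> shuffle of two v2i32 sources roughly selects to:
    //   v_pk_mov_b32 dst, src0, src1 op_sel:[1,0]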

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods,   // clamp
                          ZeroMods,   // op_sel
                          ZeroMods,   // op_sel_hi
                          ZeroMods,   // neg_lo
                          ZeroMods}); // neg_hi
    return;
  }

  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return;   // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::VECTOR_SHUFFLE:
    SelectVectorShuffle(N);
    return;
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version,
    // which has separate operands for the offset and width, the scalar version
    // packs the width and offset into a single operand. Try to move to the
    // scalar version if the offsets are constant, so that we can try to keep
    // extended loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
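
    // For example, (BFE_U32 src, 8, 16) selects to s_bfe_u32 with the packed
    // source operand (16 << 16) | 8.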
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

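// Return true if the AND supplying a shift amount is unneeded, i.e. masking
// does not change the low \p ShAmtBits bits that the hardware shift actually
// reads, e.g. (and x, 31) feeding a 32-bit shift.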
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // Since we split the 64-bit `or` earlier, this is a complicated pattern
    // to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that the split base halves (Lo and Hi) are extracted from the
      // same vector.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
    AMDGPU::S_MOV_B32, DL, MVT::i32,
    CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
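// Split the 64-bit add/sub into two 32-bit halves linked through the carry
// bit; e.g. a divergent i64 add becomes roughly:
//   v_add_co_u32 lo, vcc, lo0, lo1
//   v_addc_u32   hi, vcc, hi0, hi1, vcc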
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
  // unsigned carry out despite the _i32 name. These were renamed in VI to
  // _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

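  // The scalar pseudos are only usable when every user of the carry-out is a
  // matching carry-consuming node; any other carry user forces the VALU form.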
  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod; Ops[8] and Ops[9] carry the incoming chain and glue.
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod; Ops[6] and
  // Ops[7] carry the incoming chain and glue.
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
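// A mul_lohi is selected as a mad with a zero addend, e.g.
//   (smul_lohi x, y) -> v_mad_i64_i32 dst, x, y, 0
// and the low/high halves of the result are then extracted as sub0/sub1.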
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(
      Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
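    // For example, a (sub 16, x) address can be selected roughly as:
    //   v_sub_u32   base, 0, x
    //   ds_read_b32 dst, base offset:16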
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this dummy is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has the NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
// hardware requirement). We always treat the first operand as the base
// address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
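  // For example, a base of -4 with an offset of -16 could never form a valid
  // scratch address, so an offset in (-0x40000000, 0) implies a non-negative
  // base.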
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If the offset is too big, put the low 16 bits into the offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected node
        // here, so this dummy is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}

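// Decompose \p Addr into the MUBUF operands: a scalar resource base (Ptr), an
// optional VGPR address (VAddr), a scalar offset (SOffset), and an immediate
// offset. For example, a divergent (add base64, 16) roughly selects to:
//   buffer_load_dword v0, v[base:base+1], s[rsrc:rsrc+3], 0 addr64 offset:16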
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;
1437 
1438   SDLoc DL(Addr);
1439 
1440   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1441   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1442   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1443   SOffset = Subtarget->hasRestrictedSOffset()
1444                 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1445                 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1446 
1447   ConstantSDNode *C1 = nullptr;
1448   SDValue N0 = Addr;
1449   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1450     C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1451     if (isUInt<32>(C1->getZExtValue()))
1452       N0 = Addr.getOperand(0);
1453     else
1454       C1 = nullptr;
1455   }
1456 
1457   if (N0.getOpcode() == ISD::ADD) {
1458     // (add N2, N3) -> addr64, or
1459     // (add (add N2, N3), C1) -> addr64
1460     SDValue N2 = N0.getOperand(0);
1461     SDValue N3 = N0.getOperand(1);
1462     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1463 
1464     if (N2->isDivergent()) {
1465       if (N3->isDivergent()) {
1466         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1467         // addr64, and construct the resource from a 0 address.
1468         Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1469         VAddr = N0;
1470       } else {
1471         // N2 is divergent, N3 is not.
1472         Ptr = N3;
1473         VAddr = N2;
1474       }
1475     } else {
1476       // N2 is not divergent.
1477       Ptr = N2;
1478       VAddr = N3;
1479     }
1480     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1481   } else if (N0->isDivergent()) {
1482     // N0 is divergent. Use it as the addr64, and construct the resource from a
1483     // 0 address.
1484     Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1485     VAddr = N0;
1486     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1487   } else {
1488     // N0 -> offset, or
1489     // (N0 + C1) -> offset
1490     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1491     Ptr = N0;
1492   }
1493 
1494   if (!C1) {
1495     // No offset.
1496     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1497     return true;
1498   }
1499 
1500   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1501   if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1502     // Legal offset for instruction.
1503     Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1504     return true;
1505   }
1506 
1507   // Illegal offset, store it in soffset.
1508   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1509   SOffset =
1510       SDValue(CurDAG->getMachineNode(
1511                   AMDGPU::S_MOV_B32, DL, MVT::i32,
1512                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1513               0);
1514   return true;
1515 }
1516 
1517 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1518                                            SDValue &VAddr, SDValue &SOffset,
1519                                            SDValue &Offset) const {
1520   SDValue Ptr, Offen, Idxen, Addr64;
1521 
1522   // The addr64 bit was removed for Volcanic Islands.
1523   // FIXME: This should be a pattern predicate and not reach here
1524   if (!Subtarget->hasAddr64())
1525     return false;
1526 
1527   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1528     return false;
1529 
1530   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1531   if (C->getSExtValue()) {
1532     SDLoc DL(Addr);
1533 
1534     const SITargetLowering& Lowering =
1535       *static_cast<const SITargetLowering*>(getTargetLowering());
1536 
1537     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1538     return true;
1539   }
1540 
1541   return false;
1542 }
1543 
1544 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1545   SDLoc DL(N);
1546 
1547   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1548   SDValue TFI =
1549       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1550 
1551   // We rebase the base address into an absolute stack address and hence
1552   // use constant 0 for soffset. This value must be retained until frame
1553   // elimination, when eliminateFrameIndex will choose the appropriate
1554   // frame register if need be.
1555   return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1556 }
1557 
1558 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1559                                                  SDValue Addr, SDValue &Rsrc,
1560                                                  SDValue &VAddr, SDValue &SOffset,
1561                                                  SDValue &ImmOffset) const {
1562 
1563   SDLoc DL(Addr);
1564   MachineFunction &MF = CurDAG->getMachineFunction();
1565   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1566 
1567   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1568 
1569   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1570     int64_t Imm = CAddr->getSExtValue();
1571     const int64_t NullPtr =
1572         AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1573     // Don't fold null pointer.
1574     if (Imm != NullPtr) {
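           // Split the constant into a part materialized in a VGPR and a legal
           // immediate offset, e.g. (assuming MaxOffset == 0xFFF) Imm == 0x12345
           // yields a v_mov of 0x12000 plus an ImmOffset of 0x345.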
1575       const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1576       SDValue HighBits =
1577           CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1578       MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1579         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1580       VAddr = SDValue(MovHighBits, 0);
1581 
1582       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1583       ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1584       return true;
1585     }
1586   }
1587 
1588   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1589     // (add n0, c1)
1590 
1591     SDValue N0 = Addr.getOperand(0);
1592     uint64_t C1 = Addr.getConstantOperandVal(1);
1593 
1594     // Offsets in vaddr must be positive if range checking is enabled.
1595     //
1596     // The total computation of vaddr + soffset + offset must not overflow.  If
1597     // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1598     // overflowing.
1599     //
1600     // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1601     // always perform a range check. If a negative vaddr base index was used,
1602     // it would fail the range check even though the overall computation would
1603     // have produced a valid address. Out-of-bounds MUBUF loads return 0
1604     // instead.
1605     //
1606     // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1607     // MUBUF vaddr, but not on older subtargets which can only do this if the
1608     // sign bit is known 0.
1609     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1610     if (TII->isLegalMUBUFImmOffset(C1) &&
1611         (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1612          CurDAG->SignBitIsZero(N0))) {
1613       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1614       ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1615       return true;
1616     }
1617   }
1618 
1619   // (node)
1620   std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1621   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1622   return true;
1623 }
1624 
1625 static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1626   if (Val.getOpcode() != ISD::CopyFromReg)
1627     return false;
1628   auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1629   if (!Reg.isPhysical())
1630     return false;
1631   const auto *RC = TRI.getPhysRegBaseClass(Reg);
1632   return RC && TRI.isSGPRClass(RC);
1633 }
1634 
1635 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1636                                                   SDValue Addr,
1637                                                   SDValue &SRsrc,
1638                                                   SDValue &SOffset,
1639                                                   SDValue &Offset) const {
1640   const SIRegisterInfo *TRI =
1641       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1642   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1643   MachineFunction &MF = CurDAG->getMachineFunction();
1644   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1645   SDLoc DL(Addr);
1646 
1647   // CopyFromReg <sgpr>
1648   if (IsCopyFromSGPR(*TRI, Addr)) {
1649     SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1650     SOffset = Addr;
1651     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1652     return true;
1653   }
1654 
1655   ConstantSDNode *CAddr;
1656   if (Addr.getOpcode() == ISD::ADD) {
1657     // Add (CopyFromReg <sgpr>) <constant>
1658     CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1659     if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1660       return false;
1661     if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1662       return false;
1663 
1664     SOffset = Addr.getOperand(0);
1665   } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1666              TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1667     // <constant>
1668     SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1669   } else {
1670     return false;
1671   }
1672 
1673   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1674 
1675   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1676   return true;
1677 }
1678 
1679 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1680                                            SDValue &SOffset, SDValue &Offset
1681                                            ) const {
1682   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1683   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1684 
1685   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1686     return false;
1687 
1688   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1689       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1690       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1691     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1692                     maskTrailingOnes<uint64_t>(32); // Size
1693     SDLoc DL(Addr);
1694 
1695     const SITargetLowering& Lowering =
1696       *static_cast<const SITargetLowering*>(getTargetLowering());
1697 
1698     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1699     return true;
1700   }
1701   return false;
1702 }
1703 
1704 bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1705                                           SDValue &SOffset) const {
1706   if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1707     SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1708     return true;
1709   }
1710 
1711   SOffset = ByteOffsetNode;
1712   return true;
1713 }
1714 
1715 // Find a load or store from the corresponding pattern root.
1716 // Roots may be build_vector, bitconvert, or combinations thereof.
1717 static MemSDNode* findMemSDNode(SDNode *N) {
1718   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1719   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1720     return MN;
1721   assert(isa<BuildVectorSDNode>(N));
1722   for (SDValue V : N->op_values())
1723     if (MemSDNode *MN =
1724           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1725       return MN;
1726   llvm_unreachable("cannot find MemSDNode in the pattern!");
1727 }
1728 
1729 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1730                                               SDValue &VAddr, SDValue &Offset,
1731                                               uint64_t FlatVariant) const {
1732   int64_t OffsetVal = 0;
1733 
1734   unsigned AS = findMemSDNode(N)->getAddressSpace();
1735 
1736   bool CanHaveFlatSegmentOffsetBug =
1737       Subtarget->hasFlatSegmentOffsetBug() &&
1738       FlatVariant == SIInstrFlags::FLAT &&
1739       (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1740 
1741   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1742     SDValue N0, N1;
1743     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1744         (FlatVariant != SIInstrFlags::FlatScratch ||
1745          isFlatScratchBaseLegal(Addr))) {
1746       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1747 
1748       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1749       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1750         Addr = N0;
1751         OffsetVal = COffsetVal;
1752       } else {
1753         // If the offset doesn't fit, put the low bits into the offset field and
1754         // add the rest.
1755         //
1756         // For a FLAT instruction the hardware decides whether to access
1757         // global/scratch/shared memory based on the high bits of vaddr,
1758         // ignoring the offset field, so we have to ensure that when we add
1759         // the remainder to vaddr it still points into the same underlying object.
1760         // The easiest way to do that is to make sure that we split the offset
1761         // into two pieces that are both >= 0 or both <= 0.
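             // E.g. if the offset field could only encode up to 0xFFF, a
             // request for base + 0x1234 could become (base + 0x1000) computed
             // in registers plus an encoded immediate offset of 0x234.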
1762 
1763         SDLoc DL(N);
1764         uint64_t RemainderOffset;
1765 
1766         std::tie(OffsetVal, RemainderOffset) =
1767             TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1768 
1769         SDValue AddOffsetLo =
1770             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1771         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1772 
1773         if (Addr.getValueType().getSizeInBits() == 32) {
1774           SmallVector<SDValue, 3> Opnds;
1775           Opnds.push_back(N0);
1776           Opnds.push_back(AddOffsetLo);
1777           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1778           if (Subtarget->hasAddNoCarry()) {
1779             AddOp = AMDGPU::V_ADD_U32_e64;
1780             Opnds.push_back(Clamp);
1781           }
1782           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1783         } else {
1784           // TODO: Should this try to use a scalar add pseudo if the base address
1785           // is uniform and saddr is usable?
1786           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1787           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1788 
1789           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1790                                                 DL, MVT::i32, N0, Sub0);
1791           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1792                                                 DL, MVT::i32, N0, Sub1);
1793 
1794           SDValue AddOffsetHi =
1795               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1796 
1797           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1798 
1799           SDNode *Add =
1800               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1801                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1802 
1803           SDNode *Addc = CurDAG->getMachineNode(
1804               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1805               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1806 
1807           SDValue RegSequenceArgs[] = {
1808               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1809               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1810 
1811           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1812                                                 MVT::i64, RegSequenceArgs),
1813                          0);
1814         }
1815       }
1816     }
1817   }
1818 
1819   VAddr = Addr;
1820   Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1821   return true;
1822 }
1823 
1824 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1825                                           SDValue &VAddr,
1826                                           SDValue &Offset) const {
1827   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1828 }
1829 
1830 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1831                                             SDValue &VAddr,
1832                                             SDValue &Offset) const {
1833   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1834 }
1835 
1836 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1837                                              SDValue &VAddr,
1838                                              SDValue &Offset) const {
1839   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1840                               SIInstrFlags::FlatScratch);
1841 }
1842 
1843 // If this matches zero_extend i32:x, return x
1844 static SDValue matchZExtFromI32(SDValue Op) {
1845   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1846     return SDValue();
1847 
1848   SDValue ExtSrc = Op.getOperand(0);
1849   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1850 }
1851 
1852 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1853 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1854                                            SDValue Addr,
1855                                            SDValue &SAddr,
1856                                            SDValue &VOffset,
1857                                            SDValue &Offset) const {
1858   int64_t ImmOffset = 0;
1859 
1860   // Match the immediate offset first, which canonically is moved as low as
1861   // possible.
1862 
1863   SDValue LHS, RHS;
1864   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1865     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1866     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1867 
1868     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1869                                SIInstrFlags::FlatGlobal)) {
1870       Addr = LHS;
1871       ImmOffset = COffsetVal;
1872     } else if (!LHS->isDivergent()) {
1873       if (COffsetVal > 0) {
1874         SDLoc SL(N);
1875         // saddr + large_offset -> saddr +
1876         //                         (voffset = large_offset & ~MaxOffset) +
1877         //                         (large_offset & MaxOffset);
1878         int64_t SplitImmOffset, RemainderOffset;
1879         std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1880             COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1881 
1882         if (isUInt<32>(RemainderOffset)) {
1883           SDNode *VMov = CurDAG->getMachineNode(
1884               AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1885               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1886           VOffset = SDValue(VMov, 0);
1887           SAddr = LHS;
1888           Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1889           return true;
1890         }
1891       }
1892 
1893       // We are adding a 64-bit SGPR and a constant. If the constant bus limit
1894       // is 1, we would need to perform 1 or 2 extra moves for each half of
1895       // the constant, so it is better to do a scalar add and then issue a
1896       // single VALU instruction to materialize zero. Otherwise it takes fewer
1897       // instructions to perform VALU adds with immediates or inline literals.
1898       unsigned NumLiterals =
1899           !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1900           !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1901       if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1902         return false;
1903     }
1904   }
1905 
1906   // Match the variable offset.
1907   if (Addr.getOpcode() == ISD::ADD) {
1908     LHS = Addr.getOperand(0);
1909     RHS = Addr.getOperand(1);
1910 
1911     if (!LHS->isDivergent()) {
1912       // add (i64 sgpr), (zero_extend (i32 vgpr))
1913       if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1914         SAddr = LHS;
1915         VOffset = ZextRHS;
1916       }
1917     }
1918 
1919     if (!SAddr && !RHS->isDivergent()) {
1920       // add (zero_extend (i32 vgpr)), (i64 sgpr)
1921       if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1922         SAddr = RHS;
1923         VOffset = ZextLHS;
1924       }
1925     }
1926 
1927     if (SAddr) {
1928       Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1929       return true;
1930     }
1931   }
1932 
1933   if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1934       isa<ConstantSDNode>(Addr))
1935     return false;
1936 
1937   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1938   // moves required to copy a 64-bit SGPR to VGPR.
1939   SAddr = Addr;
1940   SDNode *VMov =
1941       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1942                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1943   VOffset = SDValue(VMov, 0);
1944   Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1945   return true;
1946 }
1947 
1948 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1949   if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1950     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1951   } else if (SAddr.getOpcode() == ISD::ADD &&
1952              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1953     // Materialize this into a scalar move for scalar address to avoid
1954     // readfirstlane.
1955     auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1956     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1957                                               FI->getValueType(0));
1958     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1959                                            MVT::i32, TFI, SAddr.getOperand(1)),
1960                     0);
1961   }
1962 
1963   return SAddr;
1964 }
1965 
1966 // Match (32-bit SGPR base) + sext(imm offset)
1967 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1968                                             SDValue &SAddr,
1969                                             SDValue &Offset) const {
1970   if (Addr->isDivergent())
1971     return false;
1972 
1973   SDLoc DL(Addr);
1974 
1975   int64_t COffsetVal = 0;
1976 
1977   if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1978     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1979     SAddr = Addr.getOperand(0);
1980   } else {
1981     SAddr = Addr;
1982   }
1983 
1984   SAddr = SelectSAddrFI(CurDAG, SAddr);
1985 
1986   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1987 
1988   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1989                               SIInstrFlags::FlatScratch)) {
1990     int64_t SplitImmOffset, RemainderOffset;
1991     std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1992         COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1993 
1994     COffsetVal = SplitImmOffset;
1995 
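         // A frame index operand is itself later lowered to an immediate, and
         // the scalar add presumably cannot encode two literal operands, so in
         // that case the remainder is first materialized into an SGPR.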
1996     SDValue AddOffset =
1997         SAddr.getOpcode() == ISD::TargetFrameIndex
1998             ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1999             : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2000     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2001                                            SAddr, AddOffset),
2002                     0);
2003   }
2004 
2005   Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2006 
2007   return true;
2008 }
2009 
2010 // Check whether the flat scratch SVS swizzle bug affects this access.
2011 bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2012     SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2013   if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2014     return false;
2015 
2016   // The bug affects the swizzling of SVS accesses if there is any carry out
2017   // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2018   // voffset to (soffset + inst_offset).
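       // Conservatively use the known maximum values: a carry out of bit 1 is
       // possible only if the low two bits of the addends can sum to 4 or
       // more, which is what the check below tests.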
2019   KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2020   KnownBits SKnown =
2021       KnownBits::add(CurDAG->computeKnownBits(SAddr),
2022                      KnownBits::makeConstant(APInt(32, ImmOffset,
2023                                                    /*isSigned=*/true)));
2024   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2025   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2026   return (VMax & 3) + (SMax & 3) >= 4;
2027 }
2028 
2029 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2030                                              SDValue &VAddr, SDValue &SAddr,
2031                                              SDValue &Offset) const  {
2032   int64_t ImmOffset = 0;
2033 
2034   SDValue LHS, RHS;
2035   SDValue OrigAddr = Addr;
2036   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2037     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2038     const SIInstrInfo *TII = Subtarget->getInstrInfo();
2039 
2040     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
2041       Addr = LHS;
2042       ImmOffset = COffsetVal;
2043     } else if (!LHS->isDivergent() && COffsetVal > 0) {
2044       SDLoc SL(N);
2045       // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2046       //                         (large_offset & MaxOffset);
2047       int64_t SplitImmOffset, RemainderOffset;
2048       std::tie(SplitImmOffset, RemainderOffset)
2049         = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
2050 
2051       if (isUInt<32>(RemainderOffset)) {
2052         SDNode *VMov = CurDAG->getMachineNode(
2053           AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2054           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2055         VAddr = SDValue(VMov, 0);
2056         SAddr = LHS;
2057         if (!isFlatScratchBaseLegal(Addr))
2058           return false;
2059         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2060           return false;
2061         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2062         return true;
2063       }
2064     }
2065   }
2066 
2067   if (Addr.getOpcode() != ISD::ADD)
2068     return false;
2069 
2070   LHS = Addr.getOperand(0);
2071   RHS = Addr.getOperand(1);
2072 
2073   if (!LHS->isDivergent() && RHS->isDivergent()) {
2074     SAddr = LHS;
2075     VAddr = RHS;
2076   } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2077     SAddr = RHS;
2078     VAddr = LHS;
2079   } else {
2080     return false;
2081   }
2082 
2083   if (OrigAddr != Addr) {
2084     if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2085       return false;
2086   } else {
2087     if (!isFlatScratchBaseLegalSV(OrigAddr))
2088       return false;
2089   }
2090 
2091   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2092     return false;
2093   SAddr = SelectSAddrFI(CurDAG, SAddr);
2094   Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2095   return true;
2096 }
2097 
2098 // For unbuffered smem loads, it is illegal for the Immediate Offset to be
2099 // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2100 // Handle the case where the Immediate Offset + SOffset is negative.
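     // E.g. with ImmOffset == -8, the fold is rejected unless SOffset is known
     // to be at least 8.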
2101 bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2102                                                      bool Imm32Only,
2103                                                      bool IsBuffer,
2104                                                      int64_t ImmOffset) const {
2105   if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2106       AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2107     KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2108     if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2109       return false;
2110   }
2111 
2112   return true;
2113 }
2114 
2115 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2116 // not null) offset. If Imm32Only is true, match only 32-bit immediate
2117 // offsets available on CI.
2118 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2119                                           SDValue *SOffset, SDValue *Offset,
2120                                           bool Imm32Only, bool IsBuffer,
2121                                           bool HasSOffset,
2122                                           int64_t ImmOffset) const {
2123   assert((!SOffset || !Offset) &&
2124          "Cannot match both soffset and offset at the same time!");
2125 
2126   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2127   if (!C) {
2128     if (!SOffset)
2129       return false;
2130 
2131     if (ByteOffsetNode.getValueType().isScalarInteger() &&
2132         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2133       *SOffset = ByteOffsetNode;
2134       return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2135                                          ImmOffset);
2136     }
2137     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2138       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2139         *SOffset = ByteOffsetNode.getOperand(0);
2140         return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2141                                            ImmOffset);
2142       }
2143     }
2144     return false;
2145   }
2146 
2147   SDLoc SL(ByteOffsetNode);
2148 
2149   // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2150   // offset for S_BUFFER instructions is unsigned.
2151   int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2152   std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2153       *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2154   if (EncodedOffset && Offset && !Imm32Only) {
2155     *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2156     return true;
2157   }
2158 
2159   // SGPR and literal offsets are unsigned.
2160   if (ByteOffset < 0)
2161     return false;
2162 
2163   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2164   if (EncodedOffset && Offset && Imm32Only) {
2165     *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2166     return true;
2167   }
2168 
2169   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2170     return false;
2171 
2172   if (SOffset) {
2173     SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2174     *SOffset = SDValue(
2175         CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2176     return true;
2177   }
2178 
2179   return false;
2180 }
2181 
2182 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2183   if (Addr.getValueType() != MVT::i32)
2184     return Addr;
2185 
2186   // Zero-extend a 32-bit address.
2187   SDLoc SL(Addr);
2188 
2189   const MachineFunction &MF = CurDAG->getMachineFunction();
2190   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2191   unsigned AddrHiVal = Info->get32BitAddressHighBits();
2192   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2193 
2194   const SDValue Ops[] = {
2195     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2196     Addr,
2197     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2198     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2199             0),
2200     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2201   };
2202 
2203   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2204                                         Ops), 0);
2205 }
2206 
2207 // Match a base and an immediate (if Offset is not null) or an SGPR (if
2208 // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2209 // true, match only 32-bit immediate offsets available on CI.
2210 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2211                                               SDValue *SOffset, SDValue *Offset,
2212                                               bool Imm32Only, bool IsBuffer,
2213                                               bool HasSOffset,
2214                                               int64_t ImmOffset) const {
2215   if (SOffset && Offset) {
2216     assert(!Imm32Only && !IsBuffer);
2217     SDValue B;
2218 
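         // Match in two steps: first peel the immediate offset off Addr, then
         // match an SGPR offset in the remaining base, i.e. roughly
         // (add (add SBase, SOffset), imm).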
2219     if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2220       return false;
2221 
2222     int64_t ImmOff = 0;
2223     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2224       ImmOff = C->getSExtValue();
2225 
2226     return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2227                                 ImmOff);
2228   }
2229 
2230   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2231   // wraparound, because s_load instructions perform the addition in 64 bits.
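       // E.g. base 0xFFFFFFF0 plus offset 0x20 wraps in 32 bits but not in
       // the 64-bit hardware add, so such an ADD is only usable here when the
       // nuw flag is present.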
2232   if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2233       !Addr->getFlags().hasNoUnsignedWrap())
2234     return false;
2235 
2236   SDValue N0, N1;
2237   // Extract the base and offset if possible.
2238   if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2239     N0 = Addr.getOperand(0);
2240     N1 = Addr.getOperand(1);
2241   } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2242     assert(N0 && N1 && isa<ConstantSDNode>(N1));
2243   }
2244   if (!N0 || !N1)
2245     return false;
2246 
2247   if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2248                        ImmOffset)) {
2249     SBase = N0;
2250     return true;
2251   }
2252   if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2253                        ImmOffset)) {
2254     SBase = N1;
2255     return true;
2256   }
2257   return false;
2258 }
2259 
2260 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2261                                     SDValue *SOffset, SDValue *Offset,
2262                                     bool Imm32Only) const {
2263   if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2264     SBase = Expand32BitAddress(SBase);
2265     return true;
2266   }
2267 
2268   if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2269     SBase = Expand32BitAddress(Addr);
2270     *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2271     return true;
2272   }
2273 
2274   return false;
2275 }
2276 
2277 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2278                                        SDValue &Offset) const {
2279   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2280 }
2281 
2282 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2283                                          SDValue &Offset) const {
2284   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2285   return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2286                     /* Imm32Only */ true);
2287 }
2288 
2289 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2290                                         SDValue &SOffset) const {
2291   return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2292 }
2293 
2294 bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2295                                            SDValue &SOffset,
2296                                            SDValue &Offset) const {
2297   return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2298 }
2299 
2300 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2301   return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2302                           /* Imm32Only */ false, /* IsBuffer */ true);
2303 }
2304 
2305 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2306                                                SDValue &Offset) const {
2307   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2308   return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2309                           /* Imm32Only */ true, /* IsBuffer */ true);
2310 }
2311 
2312 bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2313                                                  SDValue &Offset) const {
2314   // Match the (soffset + offset) pair as a 32-bit register base and
2315   // an immediate offset.
2316   return N.getValueType() == MVT::i32 &&
2317          SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2318                               &Offset, /* Imm32Only */ false,
2319                               /* IsBuffer */ true);
2320 }
2321 
2322 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2323                                             SDValue &Base,
2324                                             SDValue &Offset) const {
2325   SDLoc DL(Index);
2326 
2327   if (CurDAG->isBaseWithConstantOffset(Index)) {
2328     SDValue N0 = Index.getOperand(0);
2329     SDValue N1 = Index.getOperand(1);
2330     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2331 
2332     // (add n0, c0)
2333     // Don't peel off the offset (c0) if doing so could possibly lead
2334     // the base (n0) to be negative.
2335     // (or n0, |c0|) can never change the sign given isBaseWithConstantOffset.
2336     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2337         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2338       Base = N0;
2339       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2340       return true;
2341     }
2342   }
2343 
2344   if (isa<ConstantSDNode>(Index))
2345     return false;
2346 
2347   Base = Index;
2348   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2349   return true;
2350 }
2351 
2352 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2353                                      SDValue Val, uint32_t Offset,
2354                                      uint32_t Width) {
2355   if (Val->isDivergent()) {
2356     unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2357     SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2358     SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2359 
2360     return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2361   }
2362   unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2363   // Pack the offset and width of the BFE into the format expected by
2364   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
2365   // contain the offset and bits [22:16] the width.
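       // E.g. Offset == 16 and Width == 8 pack to 0x00080010.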
2366   uint32_t PackedVal = Offset | (Width << 16);
2367   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2368 
2369   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2370 }
2371 
2372 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2373   // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)"
2374   // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)"
2375   // Predicate: 0 < b <= c < 32
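       // E.g. b == 8 and c == 24 extract bits [23:16] of a: "BFE_U32 a, 16, 8".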
2376 
2377   const SDValue &Shl = N->getOperand(0);
2378   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2379   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2380 
2381   if (B && C) {
2382     uint32_t BVal = B->getZExtValue();
2383     uint32_t CVal = C->getZExtValue();
2384 
2385     if (0 < BVal && BVal <= CVal && CVal < 32) {
2386       bool Signed = N->getOpcode() == ISD::SRA;
2387       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2388                   32 - CVal));
2389       return;
2390     }
2391   }
2392   SelectCode(N);
2393 }
2394 
2395 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2396   switch (N->getOpcode()) {
2397   case ISD::AND:
2398     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2399       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2400       // Predicate: isMask(mask)
2401       const SDValue &Srl = N->getOperand(0);
2402       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2403       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2404 
2405       if (Shift && Mask) {
2406         uint32_t ShiftVal = Shift->getZExtValue();
2407         uint32_t MaskVal = Mask->getZExtValue();
2408 
2409         if (isMask_32(MaskVal)) {
2410           uint32_t WidthVal = llvm::popcount(MaskVal);
2411           ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2412                                   WidthVal));
2413           return;
2414         }
2415       }
2416     }
2417     break;
2418   case ISD::SRL:
2419     if (N->getOperand(0).getOpcode() == ISD::AND) {
2420       // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2421       // Predicate: isMask(mask >> b)
2422       const SDValue &And = N->getOperand(0);
2423       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2424       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2425 
2426       if (Shift && Mask) {
2427         uint32_t ShiftVal = Shift->getZExtValue();
2428         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2429 
2430         if (isMask_32(MaskVal)) {
2431           uint32_t WidthVal = llvm::popcount(MaskVal);
2432           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2433                       WidthVal));
2434           return;
2435         }
2436       }
2437     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2438       SelectS_BFEFromShifts(N);
2439       return;
2440     }
2441     break;
2442   case ISD::SRA:
2443     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2444       SelectS_BFEFromShifts(N);
2445       return;
2446     }
2447     break;
2448 
2449   case ISD::SIGN_EXTEND_INREG: {
2450     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2451     SDValue Src = N->getOperand(0);
2452     if (Src.getOpcode() != ISD::SRL)
2453       break;
2454 
2455     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2456     if (!Amt)
2457       break;
2458 
2459     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2460     ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2461                             Amt->getZExtValue(), Width));
2462     return;
2463   }
2464   }
2465 
2466   SelectCode(N);
2467 }
2468 
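     // Roughly: return true if this BRCOND's single-use condition is a SETCC
     // whose compared type the SALU can handle, so the branch can use SCC.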
2469 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2470   assert(N->getOpcode() == ISD::BRCOND);
2471   if (!N->hasOneUse())
2472     return false;
2473 
2474   SDValue Cond = N->getOperand(1);
2475   if (Cond.getOpcode() == ISD::CopyToReg)
2476     Cond = Cond.getOperand(2);
2477 
2478   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2479     return false;
2480 
2481   MVT VT = Cond.getOperand(0).getSimpleValueType();
2482   if (VT == MVT::i32)
2483     return true;
2484 
2485   if (VT == MVT::i64) {
2486     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2487     return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2488            Subtarget->hasScalarCompareEq64();
2489   }
2490 
2491   if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2492     return true;
2493 
2494   return false;
2495 }
2496 
2497 static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2498   assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2499   // Special case for amdgcn.ballot:
2500   // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2501   // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2502   // =>
2503   // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2504   // This is possible because divergent ISD::SETCC is selected as V_CMP and
2505   // Cond becomes an i(WaveSize) full mask value.
2506   // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2507   // here for completeness; in that case Negate is set to true on return.
2508   auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2509   if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2510       isNullConstant(VCMP.getOperand(1))) {
2511 
2512     auto Cond = VCMP.getOperand(0);
2513     if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2514       Cond = Cond.getOperand(0);
2515 
2516     if (isBoolSGPR(Cond)) {
2517       Negate = VCMP_CC == ISD::SETEQ;
2518       return Cond;
2519     }
2520   }
2521   return SDValue();
2522 }
2523 
2524 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2525   SDValue Cond = N->getOperand(1);
2526 
2527   if (Cond.isUndef()) {
2528     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2529                          N->getOperand(2), N->getOperand(0));
2530     return;
2531   }
2532 
2533   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2534 
2535   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2536   bool AndExec = !UseSCCBr;
2537   bool Negate = false;
2538 
2539   if (Cond.getOpcode() == ISD::SETCC &&
2540       Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2541     SDValue VCMP = Cond->getOperand(0);
2542     auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2543     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2544         isNullConstant(Cond->getOperand(1)) &&
2545         // We may encounter ballot.i64 in wave32 mode at -O0.
2546         VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2547       // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2548       // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2549       // BRCOND i1 %C, %BB
2550       // =>
2551       // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2552       // VCC = COPY i(WaveSize) %VCMP
2553       // S_CBRANCH_VCCNZ/VCCZ %BB
2554       Negate = CC == ISD::SETEQ;
2555       bool NegatedBallot = false;
2556       if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2557         Cond = BallotCond;
2558         UseSCCBr = !BallotCond->isDivergent();
2559         Negate = Negate ^ NegatedBallot;
2560       } else {
2561         // TODO: Don't use SCC here, assuming that AMDGPUISD::SETCC is always
2562         // selected as V_CMP; this may change for uniform conditions.
2563         Cond = VCMP;
2564         UseSCCBr = false;
2565       }
2566     }
2567     // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2568     // V_CMPs resulting from a ballot, or, if the ballot had a uniform
2569     // condition, SCC is used.
2570     AndExec = false;
2571   }
2572 
2573   unsigned BrOp =
2574       UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2575                : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2576   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2577   SDLoc SL(N);
2578 
2579   if (AndExec) {
2580     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2581     // analyzed what generates the vcc value, so we do not know whether vcc
2582     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2583     // disabled lanes.
2584     //
2585     // For the case where we select S_CBRANCH_SCC1 and it gets
2586     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2587     // SIInstrInfo::moveToVALU, which inserts the S_AND.
2588     //
2589     // We could add an analysis of what generates the vcc value here and omit
2590     // the S_AND when it is unnecessary. But it would be better to add a
2591     // separate pass after SIFixSGPRCopies that removes the unnecessary S_AND,
2592     // so it catches both cases.
2593     Cond = SDValue(
2594         CurDAG->getMachineNode(
2595             Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2596             MVT::i1,
2597             CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2598                                                       : AMDGPU::EXEC,
2599                                 MVT::i1),
2600             Cond),
2601         0);
2602   }
2603 
2604   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2605   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2606                        N->getOperand(2), // Basic Block
2607                        VCC.getValue(0));
2608 }
2609 
2610 void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2611   if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2612       !N->isDivergent()) {
2613     SDValue Src = N->getOperand(0);
2614     if (Src.getValueType() == MVT::f16) {
2615       if (isExtractHiElt(Src, Src)) {
2616         CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2617                              {Src});
2618         return;
2619       }
2620     }
2621   }
2622 
2623   SelectCode(N);
2624 }
2625 
2626 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2627   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2628   // be copied to an SGPR with readfirstlane.
2629   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2630     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2631 
2632   SDValue Chain = N->getOperand(0);
2633   SDValue Ptr = N->getOperand(2);
2634   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2635   MachineMemOperand *MMO = M->getMemOperand();
2636   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2637 
2638   SDValue Offset;
2639   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2640     SDValue PtrBase = Ptr.getOperand(0);
2641     SDValue PtrOffset = Ptr.getOperand(1);
2642 
2643     const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2644     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2645       N = glueCopyToM0(N, PtrBase);
2646       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2647     }
2648   }
2649 
2650   if (!Offset) {
2651     N = glueCopyToM0(N, Ptr);
2652     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2653   }
2654 
2655   SDValue Ops[] = {
2656     Offset,
2657     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2658     Chain,
2659     N->getOperand(N->getNumOperands() - 1) // New glue
2660   };
2661 
2662   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2663   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2664 }
2665 
2666 // We need to handle this here because tablegen doesn't support matching
2667 // instructions with multiple outputs.
2668 void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2669   unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2670   SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2671                    N->getOperand(5), N->getOperand(0)};
2672 
2673   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2674   MachineMemOperand *MMO = M->getMemOperand();
2675   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2676   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2677 }
2678 
2679 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2680   switch (IntrID) {
2681   case Intrinsic::amdgcn_ds_gws_init:
2682     return AMDGPU::DS_GWS_INIT;
2683   case Intrinsic::amdgcn_ds_gws_barrier:
2684     return AMDGPU::DS_GWS_BARRIER;
2685   case Intrinsic::amdgcn_ds_gws_sema_v:
2686     return AMDGPU::DS_GWS_SEMA_V;
2687   case Intrinsic::amdgcn_ds_gws_sema_br:
2688     return AMDGPU::DS_GWS_SEMA_BR;
2689   case Intrinsic::amdgcn_ds_gws_sema_p:
2690     return AMDGPU::DS_GWS_SEMA_P;
2691   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2692     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2693   default:
2694     llvm_unreachable("not a gws intrinsic");
2695   }
2696 }
2697 
2698 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2699   if (!Subtarget->hasGWS() ||
2700       (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2701        !Subtarget->hasGWSSemaReleaseAll())) {
2702     // Let this error.
2703     SelectCode(N);
2704     return;
2705   }
2706 
2707   // Chain, intrinsic ID, vsrc, offset
2708   const bool HasVSrc = N->getNumOperands() == 4;
2709   assert(HasVSrc || N->getNumOperands() == 3);
2710 
2711   SDLoc SL(N);
2712   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2713   int ImmOffset = 0;
2714   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2715   MachineMemOperand *MMO = M->getMemOperand();
2716 
2717   // Don't worry if the offset ends up in a VGPR. Only one lane will take
2718   // effect, so SIFixSGPRCopies will validly insert a readfirstlane.
2719 
2720   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2721   // offset field) % 64. Some versions of the programming guide omit the m0
2722   // part, or claim it's from offset 0.
2723   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2724     // If we have a constant offset, try to use the 0 in m0 as the base.
2725     // TODO: Look into changing the default m0 initialization value. If the
2726     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2727     // the immediate offset.
2728     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2729     ImmOffset = ConstOffset->getZExtValue();
2730   } else {
2731     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2732       ImmOffset = BaseOffset.getConstantOperandVal(1);
2733       BaseOffset = BaseOffset.getOperand(0);
2734     }
2735 
2736     // Prefer to do the shift in an SGPR since it should be possible to use m0
2737     // as the result directly. If it's already an SGPR, it will be eliminated
2738     // later.
2739     SDNode *SGPROffset
2740       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2741                                BaseOffset);
2742     // Shift to offset in m0
2743     SDNode *M0Base
2744       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2745                                SDValue(SGPROffset, 0),
2746                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2747     glueCopyToM0(N, SDValue(M0Base, 0));
2748   }
2749 
2750   SDValue Chain = N->getOperand(0);
2751   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2752 
2753   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2754   SmallVector<SDValue, 5> Ops;
2755   if (HasVSrc)
2756     Ops.push_back(N->getOperand(2));
2757   Ops.push_back(OffsetField);
2758   Ops.push_back(Chain);
2759 
2760   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2761   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2762 }
2763 
2764 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2765   if (Subtarget->getLDSBankCount() != 16) {
2766     // This is a single instruction with a pattern.
2767     SelectCode(N);
2768     return;
2769   }
2770 
2771   SDLoc DL(N);
2772 
2773   // This requires 2 instructions. It is possible to write a pattern to support
2774   // this, but the generated isel emitter doesn't correctly deal with multiple
2775   // output instructions using the same physical register input. The copy to m0
2776   // is incorrectly placed before the second instruction.
2777   //
2778   // TODO: Match source modifiers.
2779   //
2780   // def : Pat <
2781   //   (int_amdgcn_interp_p1_f16
2782   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2783   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2784   //                             (i1 timm:$high), M0),
2785   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2786   //       timm:$attrchan, 0,
2787   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2788   //   let Predicates = [has16BankLDS];
2789   // }
2790 
2791   // 16 bank LDS
2792   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2793                                       N->getOperand(5), SDValue());
2794 
2795   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2796 
2797   SDNode *InterpMov =
2798     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2799         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2800         N->getOperand(3),  // Attr
2801         N->getOperand(2),  // Attrchan
2802         ToM0.getValue(1) // In glue
2803   });
2804 
2805   SDNode *InterpP1LV =
2806     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2807         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2808         N->getOperand(1), // Src0
2809         N->getOperand(3), // Attr
2810         N->getOperand(2), // Attrchan
2811         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2812         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2813         N->getOperand(4), // high
2814         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2815         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2816         SDValue(InterpMov, 1)
2817   });
2818 
2819   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2820 }
2821 
2822 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2823   unsigned IntrID = N->getConstantOperandVal(1);
2824   switch (IntrID) {
2825   case Intrinsic::amdgcn_ds_append:
2826   case Intrinsic::amdgcn_ds_consume: {
2827     if (N->getValueType(0) != MVT::i32)
2828       break;
2829     SelectDSAppendConsume(N, IntrID);
2830     return;
2831   }
2832   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2833     SelectDSBvhStackIntrinsic(N);
2834     return;
2835   case Intrinsic::amdgcn_init_whole_wave:
2836     CurDAG->getMachineFunction()
2837         .getInfo<SIMachineFunctionInfo>()
2838         ->setInitWholeWave();
2839     break;
2840   }
2841 
2842   SelectCode(N);
2843 }
2844 
2845 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2846   unsigned IntrID = N->getConstantOperandVal(0);
2847   unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2848   SDNode *ConvGlueNode = N->getGluedNode();
2849   if (ConvGlueNode) {
2850     // FIXME: Possibly iterate over multiple glue nodes?
2851     assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2852     ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2853     ConvGlueNode =
2854         CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2855                                MVT::Glue, SDValue(ConvGlueNode, 0));
2856   } else {
2857     ConvGlueNode = nullptr;
2858   }
2859   switch (IntrID) {
2860   case Intrinsic::amdgcn_wqm:
2861     Opcode = AMDGPU::WQM;
2862     break;
2863   case Intrinsic::amdgcn_softwqm:
2864     Opcode = AMDGPU::SOFT_WQM;
2865     break;
2866   case Intrinsic::amdgcn_wwm:
2867   case Intrinsic::amdgcn_strict_wwm:
2868     Opcode = AMDGPU::STRICT_WWM;
2869     break;
2870   case Intrinsic::amdgcn_strict_wqm:
2871     Opcode = AMDGPU::STRICT_WQM;
2872     break;
2873   case Intrinsic::amdgcn_interp_p1_f16:
2874     SelectInterpP1F16(N);
2875     return;
2876   case Intrinsic::amdgcn_permlane16_swap:
2877   case Intrinsic::amdgcn_permlane32_swap: {
2878     if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2879          !Subtarget->hasPermlane16Swap()) ||
2880         (IntrID == Intrinsic::amdgcn_permlane32_swap &&
2881          !Subtarget->hasPermlane32Swap())) {
2882       SelectCode(N); // Hit the default error
2883       return;
2884     }
2885 
2886     Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2887                  ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2888                  : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2889 
2890     SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
2891     if (ConvGlueNode)
2892       NewOps.push_back(SDValue(ConvGlueNode, 0));
2893 
2894     bool FI = N->getConstantOperandVal(3);
2895     NewOps[2] = CurDAG->getTargetConstant(
2896         FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
2897 
2898     CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
2899     return;
2900   }
2901   default:
2902     SelectCode(N);
2903     break;
2904   }
2905 
2906   if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2907     SDValue Src = N->getOperand(1);
2908     CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2909   }
2910 
2911   if (ConvGlueNode) {
2912     SmallVector<SDValue, 4> NewOps(N->ops());
2913     NewOps.push_back(SDValue(ConvGlueNode, 0));
2914     CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2915   }
2916 }
2917 
2918 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2919   unsigned IntrID = N->getConstantOperandVal(1);
2920   switch (IntrID) {
2921   case Intrinsic::amdgcn_ds_gws_init:
2922   case Intrinsic::amdgcn_ds_gws_barrier:
2923   case Intrinsic::amdgcn_ds_gws_sema_v:
2924   case Intrinsic::amdgcn_ds_gws_sema_br:
2925   case Intrinsic::amdgcn_ds_gws_sema_p:
2926   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2927     SelectDS_GWS(N, IntrID);
2928     return;
2929   default:
2930     break;
2931   }
2932 
2933   SelectCode(N);
2934 }
2935 
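// Select AMDGPUISD::WAVE_ADDRESS as a right shift by log2(wavefront size),
// converting the wave-scaled stack value to the per-lane address form.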
2936 void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2937   SDValue Log2WaveSize =
2938     CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2939   CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2940                        {N->getOperand(0), Log2WaveSize});
2941 }
2942 
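// Restore the stack pointer from an i32 value. A WAVE_ADDRESS source already
// wraps the SP-form value, so its operand is copied directly; otherwise the
// address is scaled back up with a left shift, uniformizing a divergent
// source with a readfirstlane first.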
2943 void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2944   SDValue SrcVal = N->getOperand(1);
2945   if (SrcVal.getValueType() != MVT::i32) {
2946     SelectCode(N); // Emit default error
2947     return;
2948   }
2949 
2950   SDValue CopyVal;
2951   Register SP = TLI->getStackPointerRegisterToSaveRestore();
2952   SDLoc SL(N);
2953 
2954   if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2955     CopyVal = SrcVal.getOperand(0);
2956   } else {
2957     SDValue Log2WaveSize = CurDAG->getTargetConstant(
2958         Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2959 
2960     if (N->isDivergent()) {
2961       SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2962                                               MVT::i32, SrcVal),
2963                        0);
2964     }
2965 
2966     CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2967                                              {SrcVal, Log2WaveSize}),
2968                       0);
2969   }
2970 
2971   SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2972   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2973 }
2974 
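// Peel source modifiers off In and accumulate them in Mods: fneg (or, when
// canonicalizing, an fsub from +/-0) sets NEG, and fabs sets ABS when it is
// allowed. Src receives the stripped value. Every input trivially matches,
// so this always returns true.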
2975 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2976                                             unsigned &Mods,
2977                                             bool IsCanonicalizing,
2978                                             bool AllowAbs) const {
2979   Mods = SISrcMods::NONE;
2980   Src = In;
2981 
2982   if (Src.getOpcode() == ISD::FNEG) {
2983     Mods |= SISrcMods::NEG;
2984     Src = Src.getOperand(0);
2985   } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2986     // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2987     // denormal mode, but we're implicitly canonicalizing in a source operand.
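    // E.g. (a sketch): (fsub -0.0, x) is treated as (fneg x) here and is
    // selected as x with the NEG source modifier.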
2988     auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2989     if (LHS && LHS->isZero()) {
2990       Mods |= SISrcMods::NEG;
2991       Src = Src.getOperand(1);
2992     }
2993   }
2994 
2995   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2996     Mods |= SISrcMods::ABS;
2997     Src = Src.getOperand(0);
2998   }
2999 
3000   return true;
3001 }
3002 
3003 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3004                                         SDValue &SrcMods) const {
3005   unsigned Mods;
3006   if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3007                          /*AllowAbs=*/true)) {
3008     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3009     return true;
3010   }
3011 
3012   return false;
3013 }
3014 
3015 bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3016     SDValue In, SDValue &Src, SDValue &SrcMods) const {
3017   unsigned Mods;
3018   if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3019                          /*AllowAbs=*/true)) {
3020     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3021     return true;
3022   }
3023 
3024   return false;
3025 }
3026 
3027 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3028                                          SDValue &SrcMods) const {
3029   unsigned Mods;
3030   if (SelectVOP3ModsImpl(In, Src, Mods,
3031                          /*IsCanonicalizing=*/true,
3032                          /*AllowAbs=*/false)) {
3033     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3034     return true;
3035   }
3036 
3037   return false;
3038 }
3039 
3040 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3041   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3042     return false;
3043 
3044   Src = In;
3045   return true;
3046 }
3047 
3048 bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3049                                                SDValue &SrcMods,
3050                                                bool OpSel) const {
3051   unsigned Mods;
3052   if (SelectVOP3ModsImpl(In, Src, Mods,
3053                          /*IsCanonicalizing=*/true,
3054                          /*AllowAbs=*/false)) {
3055     if (OpSel)
3056       Mods |= SISrcMods::OP_SEL_0;
3057     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3058     return true;
3059   }
3060 
3061   return false;
3062 }
3063 
3064 bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3065                                            SDValue &SrcMods) const {
3066   return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3067 }
3068 
3069 bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3070                                              SDValue &SrcMods) const {
3071   return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3072 }
3073 
3074 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3075                                          SDValue &SrcMods, SDValue &Clamp,
3076                                          SDValue &Omod) const {
3077   SDLoc DL(In);
3078   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3079   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3080 
3081   return SelectVOP3Mods(In, Src, SrcMods);
3082 }
3083 
3084 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3085                                           SDValue &SrcMods, SDValue &Clamp,
3086                                           SDValue &Omod) const {
3087   SDLoc DL(In);
3088   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3089   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3090 
3091   return SelectVOP3BMods(In, Src, SrcMods);
3092 }
3093 
3094 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3095                                          SDValue &Clamp, SDValue &Omod) const {
3096   Src = In;
3097 
3098   SDLoc DL(In);
3099   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3100   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3101 
3102   return true;
3103 }
3104 
3105 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3106                                          SDValue &SrcMods, bool IsDOT) const {
3107   unsigned Mods = SISrcMods::NONE;
3108   Src = In;
3109 
3110   // TODO: Handle G_FSUB 0 as fneg
3111   if (Src.getOpcode() == ISD::FNEG) {
3112     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3113     Src = Src.getOperand(0);
3114   }
3115 
3116   if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3117       (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3118     unsigned VecMods = Mods;
3119 
3120     SDValue Lo = stripBitcast(Src.getOperand(0));
3121     SDValue Hi = stripBitcast(Src.getOperand(1));
3122 
3123     if (Lo.getOpcode() == ISD::FNEG) {
3124       Lo = stripBitcast(Lo.getOperand(0));
3125       Mods ^= SISrcMods::NEG;
3126     }
3127 
3128     if (Hi.getOpcode() == ISD::FNEG) {
3129       Hi = stripBitcast(Hi.getOperand(0));
3130       Mods ^= SISrcMods::NEG_HI;
3131     }
3132 
3133     if (isExtractHiElt(Lo, Lo))
3134       Mods |= SISrcMods::OP_SEL_0;
3135 
3136     if (isExtractHiElt(Hi, Hi))
3137       Mods |= SISrcMods::OP_SEL_1;
3138 
3139     unsigned VecSize = Src.getValueSizeInBits();
3140     Lo = stripExtractLoElt(Lo);
3141     Hi = stripExtractLoElt(Hi);
3142 
3143     if (Lo.getValueSizeInBits() > VecSize) {
3144       Lo = CurDAG->getTargetExtractSubreg(
3145         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3146         MVT::getIntegerVT(VecSize), Lo);
3147     }
3148 
3149     if (Hi.getValueSizeInBits() > VecSize) {
3150       Hi = CurDAG->getTargetExtractSubreg(
3151         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3152         MVT::getIntegerVT(VecSize), Hi);
3153     }
3154 
3155     assert(Lo.getValueSizeInBits() <= VecSize &&
3156            Hi.getValueSizeInBits() <= VecSize);
3157 
3158     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3159       // Really a scalar input. Just select from the low half of the register to
3160       // avoid packing.
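      // E.g. (illustrative): a v2f16 build_vector (x, x) of low halves
      // selects plain x; with op_sel_hi left clear the instruction reads the
      // same low half for both lanes, so no packing is needed.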
3161 
3162       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3163         Src = Lo;
3164       } else {
3165         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3166 
3167         SDLoc SL(In);
3168         SDValue Undef = SDValue(
3169           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3170                                  Lo.getValueType()), 0);
3171         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3172                                     : AMDGPU::SReg_64RegClassID;
3173         const SDValue Ops[] = {
3174           CurDAG->getTargetConstant(RC, SL, MVT::i32),
3175           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3176           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3177 
3178         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3179                                              Src.getValueType(), Ops), 0);
3180       }
3181       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3182       return true;
3183     }
3184 
3185     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3186       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3187                       .bitcastToAPInt().getZExtValue();
3188       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3189         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3190         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3191         return true;
3192       }
3193     }
3194 
3195     Mods = VecMods;
3196   } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3197              Src.getNumOperands() == 2) {
3198 
3199     // TODO: We should repeat the build_vector source check above for the
3200     // vector_shuffle for negates and casts of individual elements.
3201 
3202     auto *SVN = cast<ShuffleVectorSDNode>(Src);
3203     ArrayRef<int> Mask = SVN->getMask();
3204 
3205     if (Mask[0] < 2 && Mask[1] < 2) {
3206       // src1 should be undef.
3207       SDValue ShuffleSrc = SVN->getOperand(0);
3208 
3209       if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3210         ShuffleSrc = ShuffleSrc.getOperand(0);
3211         Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3212       }
3213 
3214       if (Mask[0] == 1)
3215         Mods |= SISrcMods::OP_SEL_0;
3216       if (Mask[1] == 1)
3217         Mods |= SISrcMods::OP_SEL_1;
3218 
3219       Src = ShuffleSrc;
3220       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3221       return true;
3222     }
3223   }
3224 
3225   // Packed instructions do not have abs modifiers.
3226   Mods |= SISrcMods::OP_SEL_1;
3227 
3228   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3229   return true;
3230 }
3231 
3232 bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3233                                             SDValue &SrcMods) const {
3234   return SelectVOP3PMods(In, Src, SrcMods, true);
3235 }
3236 
3237 bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3238   const ConstantSDNode *C = cast<ConstantSDNode>(In);
3239   // A literal i1 value set in the intrinsic; it represents SrcMods for the next
3240   // operand: 1 promotes packed values to signed, 0 treats them as unsigned.
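  // E.g. (a sketch): an i1 of 1 here yields OP_SEL_1 | NEG for the matching
  // packed operand, i.e. treat it as signed; an i1 of 0 yields just OP_SEL_1.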
3241   assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3242 
3243   unsigned Mods = SISrcMods::OP_SEL_1;
3244   unsigned SrcSign = C->getZExtValue();
3245   if (SrcSign == 1)
3246     Mods ^= SISrcMods::NEG;
3247 
3248   Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3249   return true;
3250 }
3251 
3252 bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3253                                                   SDValue &Src) const {
3254   const ConstantSDNode *C = cast<ConstantSDNode>(In);
3255   assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3256 
3257   unsigned Mods = SISrcMods::OP_SEL_1;
3258   unsigned SrcVal = C->getZExtValue();
3259   if (SrcVal == 1)
3260     Mods |= SISrcMods::OP_SEL_0;
3261 
3262   Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3263   return true;
3264 }
3265 
3266 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3267                                          llvm::SelectionDAG *CurDAG,
3268                                          const SDLoc &DL) {
3269   unsigned DstRegClass;
3270   EVT DstTy;
3271   switch (Elts.size()) {
3272   case 8:
3273     DstRegClass = AMDGPU::VReg_256RegClassID;
3274     DstTy = MVT::v8i32;
3275     break;
3276   case 4:
3277     DstRegClass = AMDGPU::VReg_128RegClassID;
3278     DstTy = MVT::v4i32;
3279     break;
3280   case 2:
3281     DstRegClass = AMDGPU::VReg_64RegClassID;
3282     DstTy = MVT::v2i32;
3283     break;
3284   default:
3285     llvm_unreachable("unhandled Reg sequence size");
3286   }
3287 
3288   SmallVector<SDValue, 17> Ops;
3289   Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3290   for (unsigned i = 0; i < Elts.size(); ++i) {
3291     Ops.push_back(Elts[i]);
3292     Ops.push_back(CurDAG->getTargetConstant(
3293         SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3294   }
3295   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3296 }
3297 
3298 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3299                                          llvm::SelectionDAG *CurDAG,
3300                                          const SDLoc &DL) {
3301   SmallVector<SDValue, 8> PackedElts;
3302   assert((Elts.size() == 8 || Elts.size() == 16) &&
3303          "unhandled Reg sequence size");
3304 
3305   // Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3306   // unpacked from the same 32-bit source, use it; otherwise pack with v_perm.
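  // A sketch of the v_perm case below: with src0 = Elts[i + 1] and
  // src1 = Elts[i], selector 0x05040100 places bytes {1,0} of src1 in the
  // low half of the result and bytes {1,0} of src0 in the high half, i.e.
  // both low 16-bit values packed into one dword.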
3307   for (unsigned i = 0; i < Elts.size(); i += 2) {
3308     SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3309     SDValue HiSrc;
3310     if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3311       PackedElts.push_back(HiSrc);
3312     } else {
3313       SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3314       MachineSDNode *Packed =
3315           CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3316                                  {Elts[i + 1], Elts[i], PackLoLo});
3317       PackedElts.push_back(SDValue(Packed, 0));
3318     }
3319   }
3320 
3321   return buildRegSequence32(PackedElts, CurDAG, DL);
3322 }
3323 
3324 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3325                                        llvm::SelectionDAG *CurDAG,
3326                                        const SDLoc &DL, unsigned ElementSize) {
3327   if (ElementSize == 16)
3328     return buildRegSequence16(Elts, CurDAG, DL);
3329   if (ElementSize == 32)
3330     return buildRegSequence32(Elts, CurDAG, DL);
3331   llvm_unreachable("Unhandled element size");
3332 }
3333 
3334 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3335                                  SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3336                                  llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3337                                  unsigned ElementSize) {
3338   if (ModOpcode == ISD::FNEG) {
3339     Mods |= SISrcMods::NEG;
3340     // Check if all elements also have an abs modifier
3341     SmallVector<SDValue, 8> NegAbsElts;
3342     for (auto El : Elts) {
3343       if (El.getOpcode() != ISD::FABS)
3344         break;
3345       NegAbsElts.push_back(El->getOperand(0));
3346     }
3347     if (Elts.size() != NegAbsElts.size()) {
3348       // Neg
3349       Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3350     } else {
3351       // Neg and Abs
3352       Mods |= SISrcMods::NEG_HI;
3353       Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3354     }
3355   } else {
3356     assert(ModOpcode == ISD::FABS);
3357     // Abs
3358     Mods |= SISrcMods::NEG_HI;
3359     Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3360   }
3361 }
3362 
3363 // Check all f16 elements for modifiers while looking through b32 and v2b16
3364 // build vectors; stop if an element does not satisfy ModifierCheck.
3365 static void
3366 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3367                               std::function<bool(SDValue)> ModifierCheck) {
3368   for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3369     if (auto *F16Pair =
3370             dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3371       for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3372         SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3373         if (!ModifierCheck(ElF16))
3374           break;
3375       }
3376     }
3377   }
3378 }
3379 
3380 bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3381                                               SDValue &SrcMods) const {
3382   Src = In;
3383   unsigned Mods = SISrcMods::OP_SEL_1;
3384 
3385   // mods are on f16 elements
3386   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3387     SmallVector<SDValue, 8> EltsF16;
3388 
3389     checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3390       if (Element.getOpcode() != ISD::FNEG)
3391         return false;
3392       EltsF16.push_back(Element.getOperand(0));
3393       return true;
3394     });
3395 
3396     // All elements have neg modifier
3397     if (BV->getNumOperands() * 2 == EltsF16.size()) {
3398       Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3399       Mods |= SISrcMods::NEG;
3400       Mods |= SISrcMods::NEG_HI;
3401     }
3402   }
3403 
3404   // mods are on v2f16 elements
3405   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3406     SmallVector<SDValue, 8> EltsV2F16;
3407     for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3408       SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3409       // Only fneg is matched here; stop at the first pair without it.
3410       if (ElV2f16.getOpcode() != ISD::FNEG)
3411         break;
3412       EltsV2F16.push_back(ElV2f16.getOperand(0));
3413     }
3414 
3415     // All pairs of elements have neg modifier
3416     if (BV->getNumOperands() == EltsV2F16.size()) {
3417       Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3418       Mods |= SISrcMods::NEG;
3419       Mods |= SISrcMods::NEG_HI;
3420     }
3421   }
3422 
3423   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3424   return true;
3425 }
3426 
3427 bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3428                                                  SDValue &SrcMods) const {
3429   Src = In;
3430   unsigned Mods = SISrcMods::OP_SEL_1;
3431   unsigned ModOpcode;
3432 
3433   // mods are on f16 elements
3434   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3435     SmallVector<SDValue, 8> EltsF16;
3436     checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3437       // Based on first element decide which mod we match, neg or abs
3438       if (EltsF16.empty())
3439         ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3440       if (ElF16.getOpcode() != ModOpcode)
3441         return false;
3442       EltsF16.push_back(ElF16.getOperand(0));
3443       return true;
3444     });
3445 
3446     // All elements have ModOpcode modifier
3447     if (BV->getNumOperands() * 2 == EltsF16.size())
3448       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3449                            16);
3450   }
3451 
3452   // mods are on v2f16 elements
3453   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3454     SmallVector<SDValue, 8> EltsV2F16;
3455 
3456     for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3457       SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3458       // Based on first element decide which mod we match, neg or abs
3459       if (EltsV2F16.empty())
3460         ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3461       if (ElV2f16->getOpcode() != ModOpcode)
3462         break;
3463       EltsV2F16.push_back(ElV2f16->getOperand(0));
3464     }
3465 
3466     // All elements have ModOpcode modifier
3467     if (BV->getNumOperands() == EltsV2F16.size())
3468       selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3469                            32);
3470   }
3471 
3472   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3473   return true;
3474 }
3475 
3476 bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3477                                                  SDValue &SrcMods) const {
3478   Src = In;
3479   unsigned Mods = SISrcMods::OP_SEL_1;
3480   SmallVector<SDValue, 8> EltsF32;
3481 
3482   if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3483     assert(BV->getNumOperands() > 0);
3484     // Based on first element decide which mod we match, neg or abs
3485     SDValue ElF32 = stripBitcast(BV->getOperand(0));
3486     unsigned ModOpcode =
3487         (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3488     for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3489       SDValue ElF32 = stripBitcast(BV->getOperand(i));
3490       if (ElF32.getOpcode() != ModOpcode)
3491         break;
3492       EltsF32.push_back(ElF32.getOperand(0));
3493     }
3494 
3495     // All elements had ModOpcode modifier
3496     if (BV->getNumOperands() == EltsF32.size())
3497       selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3498                            32);
3499   }
3500 
3501   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3502   return true;
3503 }
3504 
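// Match a build_vector splat that is usable as an inline immediate operand,
// either directly as a 32-bit value or as a 16-bit splat seen through two
// levels of build_vector (32-bit pieces that are themselves 16-bit splats).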
3505 bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3506   if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3507     BitVector UndefElements;
3508     if (SDValue Splat = BV->getSplatValue(&UndefElements))
3509       if (isInlineImmediate(Splat.getNode())) {
3510         if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3511           unsigned Imm = C->getAPIntValue().getSExtValue();
3512           Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3513           return true;
3514         }
3515         if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3516           unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3517           Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3518           return true;
3519         }
3520         llvm_unreachable("unhandled Constant node");
3521       }
3522   }
3523 
3524   // 16-bit splat
3525   SDValue SplatSrc32 = stripBitcast(In);
3526   if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3527     if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3528       SDValue SplatSrc16 = stripBitcast(Splat32);
3529       if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3530         if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3531           const SIInstrInfo *TII = Subtarget->getInstrInfo();
3532           std::optional<APInt> RawValue;
3533           if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3534             RawValue = C->getValueAPF().bitcastToAPInt();
3535           else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3536             RawValue = C->getAPIntValue();
3537 
3538           if (RawValue.has_value()) {
3539             EVT VT = In.getValueType().getScalarType();
3540             if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3541               APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3542                                    ? APFloatBase::IEEEhalf()
3543                                    : APFloatBase::BFloat(),
3544                                RawValue.value());
3545               if (TII->isInlineConstant(FloatVal)) {
3546                 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3547                                                 MVT::i16);
3548                 return true;
3549               }
3550             } else if (VT.getSimpleVT() == MVT::i16) {
3551               if (TII->isInlineConstant(RawValue.value())) {
3552                 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3553                                                 MVT::i16);
3554                 return true;
3555               }
3556             } else
3557               llvm_unreachable("unknown 16-bit type");
3558           }
3559         }
3560     }
3561 
3562   return false;
3563 }
3564 
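// Match the index-key operand form (srl x, 8 * k) on a 32-bit source: x is
// selected as Src with index_key k, e.g. (srl %x, 16) gives Key = 2 here.
// Anything else falls back to Key = 0 with Src = In.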
3565 bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3566                                             SDValue &IndexKey) const {
3567   unsigned Key = 0;
3568   Src = In;
3569 
3570   if (In.getOpcode() == ISD::SRL) {
3571     const llvm::SDValue &ShiftSrc = In.getOperand(0);
3572     ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3573     if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3574         ShiftAmt->getZExtValue() % 8 == 0) {
3575       Key = ShiftAmt->getZExtValue() / 8;
3576       Src = ShiftSrc;
3577     }
3578   }
3579 
3580   IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3581   return true;
3582 }
3583 
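// As SelectSWMMACIndex8 above, but for 16-bit indices: only (srl x, 16) on a
// 32-bit source selects Key = 1.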
3584 bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3585                                              SDValue &IndexKey) const {
3586   unsigned Key = 0;
3587   Src = In;
3588 
3589   if (In.getOpcode() == ISD::SRL) {
3590     const llvm::SDValue &ShiftSrc = In.getOperand(0);
3591     ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3592     if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3593         ShiftAmt->getZExtValue() == 16) {
3594       Key = 1;
3595       Src = ShiftSrc;
3596     }
3597   }
3598 
3599   IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3600   return true;
3601 }
3602 
3603 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3604                                          SDValue &SrcMods) const {
3605   Src = In;
3606   // FIXME: Handle op_sel
3607   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3608   return true;
3609 }
3610 
3611 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3612                                              SDValue &SrcMods) const {
3613   // FIXME: Handle op_sel
3614   return SelectVOP3Mods(In, Src, SrcMods);
3615 }
3616 
3617 // The return value is not whether the match is possible (which it always is),
3618 // but whether or not a conversion is really used.
3619 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3620                                                    unsigned &Mods) const {
3621   Mods = 0;
3622   SelectVOP3ModsImpl(In, Src, Mods);
3623 
3624   if (Src.getOpcode() == ISD::FP_EXTEND) {
3625     Src = Src.getOperand(0);
3626     assert(Src.getValueType() == MVT::f16);
3627     Src = stripBitcast(Src);
3628 
3629     // Be careful about folding modifiers if we already have an abs. fneg is
3630     // applied last, so we don't want to apply an earlier fneg.
3631     if ((Mods & SISrcMods::ABS) == 0) {
3632       unsigned ModsTmp;
3633       SelectVOP3ModsImpl(Src, Src, ModsTmp);
3634 
3635       if ((ModsTmp & SISrcMods::NEG) != 0)
3636         Mods ^= SISrcMods::NEG;
3637 
3638       if ((ModsTmp & SISrcMods::ABS) != 0)
3639         Mods |= SISrcMods::ABS;
3640     }
3641 
3642     // op_sel/op_sel_hi decide the source type and which source half is used.
3643     // If the source's op_sel_hi is set, it indicates a conversion from fp16.
3644     // If the source's op_sel is set, it picks the high half of the source
3645     // register.
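    // E.g. (illustrative): a source (fp_extend (extract_hi v2f16:x)) selects
    // x with OP_SEL_1 | OP_SEL_0 set: convert from fp16, taking the high
    // half of the register.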
3646 
3647     Mods |= SISrcMods::OP_SEL_1;
3648     if (isExtractHiElt(Src, Src)) {
3649       Mods |= SISrcMods::OP_SEL_0;
3650 
3651       // TODO: Should we try to look for neg/abs here?
3652     }
3653 
3654     return true;
3655   }
3656 
3657   return false;
3658 }
3659 
3660 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3661                                                   SDValue &SrcMods) const {
3662   unsigned Mods = 0;
3663   if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3664     return false;
3665   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3666   return true;
3667 }
3668 
3669 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3670                                                SDValue &SrcMods) const {
3671   unsigned Mods = 0;
3672   SelectVOP3PMadMixModsImpl(In, Src, Mods);
3673   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3674   return true;
3675 }
3676 
3677 // Match a BITOP3 operation and return the number of matched instructions plus
3678 // the truth table.
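// An illustrative walk-through (operand order follows the matching below):
// for In = (a & b) | c the sources are collected as Src = {a, c, b}, the AND
// contributes 0xf0 & 0xaa = 0xa0, and the OR folds in c to give
// TTbl = 0xa0 | 0xcc = 0xec with NumOpcodes = 2.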
3679 static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3680                                               SmallVectorImpl<SDValue> &Src) {
3681   unsigned NumOpcodes = 0;
3682   uint8_t LHSBits, RHSBits;
3683 
3684   auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3685     // Define truth table given Src0, Src1, Src2 bits permutations:
3686     //                          0     0     0
3687     //                          0     0     1
3688     //                          0     1     0
3689     //                          0     1     1
3690     //                          1     0     0
3691     //                          1     0     1
3692     //                          1     1     0
3693     //                          1     1     1
3694     const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3695 
3696     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3697       if (C->isAllOnes()) {
3698         Bits = 0xff;
3699         return true;
3700       }
3701       if (C->isZero()) {
3702         Bits = 0;
3703         return true;
3704       }
3705     }
3706 
3707     for (unsigned I = 0; I < Src.size(); ++I) {
3708       // Try to reuse an operand that is already in Src.
3709       if (Src[I] == Op) {
3710         Bits = SrcBits[I];
3711         return true;
3712       }
3713       // Try to replace the parent operator in Src with this operand.
3714       if (Src[I] == In) {
3715         Bits = SrcBits[I];
3716         Src[I] = Op;
3717         return true;
3718       }
3719     }
3720 
3721     if (Src.size() == 3) {
3722       // No room left for operands. Try one last time, there can be a 'not' of
3723       // one of our source operands. In this case we can compute the bits
3724       // without growing Src vector.
3725       if (Op.getOpcode() == ISD::XOR) {
3726         if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3727           if (C->isAllOnes()) {
3728             SDValue LHS = Op.getOperand(0);
3729             for (unsigned I = 0; I < Src.size(); ++I) {
3730               if (Src[I] == LHS) {
3731                 Bits = ~SrcBits[I];
3732                 return true;
3733               }
3734             }
3735           }
3736         }
3737       }
3738 
3739       return false;
3740     }
3741 
3742     Bits = SrcBits[Src.size()];
3743     Src.push_back(Op);
3744     return true;
3745   };
3746 
3747   switch (In.getOpcode()) {
3748   case ISD::AND:
3749   case ISD::OR:
3750   case ISD::XOR: {
3751     SDValue LHS = In.getOperand(0);
3752     SDValue RHS = In.getOperand(1);
3753 
3754     SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3755     if (!getOperandBits(LHS, LHSBits) ||
3756         !getOperandBits(RHS, RHSBits)) {
3757       Src = Backup;
3758       return std::make_pair(0, 0);
3759     }
3760 
3761     // Recursion is naturally limited by the size of the operand vector.
3762     auto Op = BitOp3_Op(LHS, Src);
3763     if (Op.first) {
3764       NumOpcodes += Op.first;
3765       LHSBits = Op.second;
3766     }
3767 
3768     Op = BitOp3_Op(RHS, Src);
3769     if (Op.first) {
3770       NumOpcodes += Op.first;
3771       RHSBits = Op.second;
3772     }
3773     break;
3774   }
3775   default:
3776     return std::make_pair(0, 0);
3777   }
3778 
3779   uint8_t TTbl;
3780   switch (In.getOpcode()) {
3781   case ISD::AND:
3782     TTbl = LHSBits & RHSBits;
3783     break;
3784   case ISD::OR:
3785     TTbl = LHSBits | RHSBits;
3786     break;
3787   case ISD::XOR:
3788     TTbl = LHSBits ^ RHSBits;
3789     break;
3790   default:
3791     break;
3792   }
3793 
3794   return std::make_pair(NumOpcodes + 1, TTbl);
3795 }
3796 
3797 bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3798                                       SDValue &Src2, SDValue &Tbl) const {
3799   SmallVector<SDValue, 3> Src;
3800   uint8_t TTbl;
3801   unsigned NumOpcodes;
3802 
3803   std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
3804 
3805   // The Src.empty() case can happen if all operands are zero or all ones.
3806   // Normally it should have been optimized out before reaching this point.
3807   if (NumOpcodes < 2 || Src.empty())
3808     return false;
3809 
3810   // For the uniform case the threshold should be higher to account for moves
3811   // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
3812   // be in SGPRs, with a readfirstlane after.
3813   if (NumOpcodes < 4 && !In->isDivergent())
3814     return false;
3815 
3816   if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
3817     // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3818     // asm more readable. This cannot be modeled with AddedComplexity because
3819     // the selector does not know how many operations we matched.
3820     if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
3821         (In.getOperand(0).getOpcode() == In.getOpcode() ||
3822          In.getOperand(1).getOpcode() == In.getOpcode()))
3823       return false;
3824 
3825     if (In.getOpcode() == ISD::OR &&
3826         (In.getOperand(0).getOpcode() == ISD::AND ||
3827          In.getOperand(1).getOpcode() == ISD::AND))
3828       return false;
3829   }
3830 
3831   // The last operand can be ignored, turning a ternary operation into a
3832   // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
3833   // replace 'c' with 'a' here without changing the answer. In some
3834   // pathological cases it should even be possible to get an operation with a
3835   // single operand if the optimizer does not catch it.
3836   while (Src.size() < 3)
3837     Src.push_back(Src[0]);
3838 
3839   Src0 = Src[0];
3840   Src1 = Src[1];
3841   Src2 = Src[2];
3842 
3843   Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
3844   return true;
3845 }
3846 
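// Return a value usable as the high 16 bits of a packed pair: undef stays
// undef, constants come back pre-shifted left by 16, and an extract of a
// high half yields its source register; otherwise return a null SDValue.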
3847 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3848   if (In.isUndef())
3849     return CurDAG->getUNDEF(MVT::i32);
3850 
3851   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3852     SDLoc SL(In);
3853     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3854   }
3855 
3856   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3857     SDLoc SL(In);
3858     return CurDAG->getConstant(
3859       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3860   }
3861 
3862   SDValue Src;
3863   if (isExtractHiElt(In, Src))
3864     return Src;
3865 
3866   return SDValue();
3867 }
3868 
3869 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
3870   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3871 
3872   const SIRegisterInfo *SIRI =
3873     static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3874   const SIInstrInfo *SII =
3875     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3876 
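  // Heuristically inspect at most 10 uses. The immediate is only worth
  // materializing in a VGPR if some use cannot take an SGPR, even after
  // trying to commute; hitting the scan limit conservatively answers false.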
3877   unsigned Limit = 0;
3878   bool AllUsesAcceptSReg = true;
3879   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3880     Limit < 10 && U != E; ++U, ++Limit) {
3881     const TargetRegisterClass *RC =
3882         getOperandRegClass(U->getUser(), U->getOperandNo());
3883 
3884     // If the register class is unknown, it could be a class that needs to be
3885     // an SGPR, e.g. an inline asm constraint, so conservatively do not treat
3886     // this as a VGPR immediate.
3887     if (!RC || SIRI->isSGPRClass(RC))
3888       return false;
3889 
3890     if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3891       AllUsesAcceptSReg = false;
3892       SDNode *User = U->getUser();
3893       if (User->isMachineOpcode()) {
3894         unsigned Opc = User->getMachineOpcode();
3895         const MCInstrDesc &Desc = SII->get(Opc);
3896         if (Desc.isCommutable()) {
3897           unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
3898           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3899           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3900             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3901             const TargetRegisterClass *CommutedRC =
3902                 getOperandRegClass(U->getUser(), CommutedOpNo);
3903             if (CommutedRC == &AMDGPU::VS_32RegClass ||
3904                 CommutedRC == &AMDGPU::VS_64RegClass)
3905               AllUsesAcceptSReg = true;
3906           }
3907         }
3908       }
3909       // If AllUsesAcceptSReg is still false at this point, we have not managed
3910       // to commute the current user. This means at least one use strictly
3911       // requires a VGPR, so we will not attempt to commute any other user
3912       // instructions.
3913       if (!AllUsesAcceptSReg)
3914         break;
3915     }
3916   }
3917   return !AllUsesAcceptSReg && (Limit < 10);
3918 }
3919 
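// A load is treated as uniform if it is non-divergent (or its MMO is known
// uniform), sufficiently aligned for a scalar access, and either from a
// constant address space or a simple global load that is provably not
// clobbered, when scalar global loads are enabled.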
3920 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3921   const auto *Ld = cast<LoadSDNode>(N);
3922 
3923   const MachineMemOperand *MMO = Ld->getMemOperand();
3924   if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3925     return false;
3926 
3927   return MMO->getSize().hasValue() &&
3928          Ld->getAlign() >=
3929              Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3930                             uint64_t(4))) &&
3931          ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3932            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3933           (Subtarget->getScalarizeGlobalBehavior() &&
3934            Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3935            Ld->isSimple() &&
3936            static_cast<const SITargetLowering *>(getTargetLowering())
3937                ->isMemOpHasNoClobberedMemOperand(N)));
3938 }
3939 
3940 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3941   const AMDGPUTargetLowering& Lowering =
3942     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3943   bool IsModified = false;
3944   do {
3945     IsModified = false;
3946 
3947     // Go over all selected nodes and try to fold them a bit more
3948     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3949     while (Position != CurDAG->allnodes_end()) {
3950       SDNode *Node = &*Position++;
3951       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3952       if (!MachineNode)
3953         continue;
3954 
3955       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3956       if (ResNode != Node) {
3957         if (ResNode)
3958           ReplaceUses(Node, ResNode);
3959         IsModified = true;
3960       }
3961     }
3962     CurDAG->RemoveDeadNodes();
3963   } while (IsModified);
3964 }
3965 
3966 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3967                                                    CodeGenOptLevel OptLevel)
3968     : SelectionDAGISelLegacy(
3969           ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3970 
3971 char AMDGPUDAGToDAGISelLegacy::ID = 0;
3972