xref: /llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp (revision cc97653d534e80745a0cfb0143972e8d4dec9f74)
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "llvm/ADT/APInt.h"
23 #include "llvm/ADT/FloatingPointMode.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/UniformityAnalysis.h"
27 #include "llvm/CodeGen/Analysis.h"
28 #include "llvm/CodeGen/ByteProvider.h"
29 #include "llvm/CodeGen/FunctionLoweringInfo.h"
30 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
31 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
32 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
33 #include "llvm/CodeGen/MachineFrameInfo.h"
34 #include "llvm/CodeGen/MachineFunction.h"
35 #include "llvm/CodeGen/MachineLoopInfo.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/IRBuilder.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/IntrinsicsAMDGPU.h"
40 #include "llvm/IR/IntrinsicsR600.h"
41 #include "llvm/IR/MDBuilder.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/KnownBits.h"
44 #include "llvm/Support/ModRef.h"
45 #include "llvm/Transforms/Utils/LowerAtomic.h"
46 #include <optional>
47 
48 using namespace llvm;
49 
50 #define DEBUG_TYPE "si-lower"
51 
52 STATISTIC(NumTailCalls, "Number of tail calls");
53 
54 static cl::opt<bool>
55     DisableLoopAlignment("amdgpu-disable-loop-alignment",
56                          cl::desc("Do not align and prefetch loops"),
57                          cl::init(false));
58 
59 static cl::opt<bool> UseDivergentRegisterIndexing(
60     "amdgpu-use-divergent-register-indexing", cl::Hidden,
61     cl::desc("Use indirect register addressing for divergent indexes"),
62     cl::init(false));
63 
64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66   return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67 }
68 
69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71   return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72 }
73 
74 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78       return AMDGPU::SGPR0 + Reg;
79     }
80   }
81   llvm_unreachable("Cannot allocate sgpr");
82 }
83 
84 SITargetLowering::SITargetLowering(const TargetMachine &TM,
85                                    const GCNSubtarget &STI)
86     : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
87   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
88   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
89 
90   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
91   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
92 
93   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
94 
95   const SIRegisterInfo *TRI = STI.getRegisterInfo();
96   const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
97 
98   addRegisterClass(MVT::f64, V64RegClass);
99   addRegisterClass(MVT::v2f32, V64RegClass);
100   addRegisterClass(MVT::Untyped, V64RegClass);
101 
102   addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
103   addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
104 
105   addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
106   addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
107 
108   addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
109   addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
110 
111   addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
112   addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
113 
114   addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
115   addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
116 
117   addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
118   addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
119 
120   addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
121   addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
122 
123   addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
124   addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
125 
126   addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
127   addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
128 
129   addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
130   addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
131 
132   addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
133   addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
134 
135   addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
136   addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
137 
138   addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
139   addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
140 
141   addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
142   addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
143 
144   addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
145   addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
146 
147   addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
148   addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
149 
150   if (Subtarget->has16BitInsts()) {
151     if (Subtarget->useRealTrue16Insts()) {
152       addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
153       addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
154       addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
155     } else {
156       addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
157       addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
158       addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
159     }
160 
161     // Unless there are also VOP3P operations, not operations are really legal.
162     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
163     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
164     addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
165     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
166     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
167     addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
168     addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
169     addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
170     addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
171     addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
172     addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
173     addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
174     addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
175     addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
176     addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
177   }
178 
179   addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
180   addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
181 
182   computeRegisterProperties(Subtarget->getRegisterInfo());
183 
184   // The boolean content concept here is too inflexible. Compares only ever
185   // really produce a 1-bit result. Any copy/extend from these will turn into a
186   // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
187   // it's what most targets use.
188   setBooleanContents(ZeroOrOneBooleanContent);
189   setBooleanVectorContents(ZeroOrOneBooleanContent);
190 
191   // We need to custom lower vector stores from local memory
192   setOperationAction(ISD::LOAD,
193                      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194                       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195                       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196                       MVT::i1, MVT::v32i32},
197                      Custom);
198 
199   setOperationAction(ISD::STORE,
200                      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201                       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202                       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203                       MVT::i1, MVT::v32i32},
204                      Custom);
205 
206   if (isTypeLegal(MVT::bf16)) {
207     for (unsigned Opc :
208          {ISD::FADD,     ISD::FSUB,       ISD::FMUL,    ISD::FDIV,
209           ISD::FREM,     ISD::FMA,        ISD::FMINNUM, ISD::FMAXNUM,
210           ISD::FMINIMUM, ISD::FMAXIMUM,   ISD::FSQRT,   ISD::FCBRT,
211           ISD::FSIN,     ISD::FCOS,       ISD::FPOW,    ISD::FPOWI,
212           ISD::FLDEXP,   ISD::FFREXP,     ISD::FLOG,    ISD::FLOG2,
213           ISD::FLOG10,   ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10,
214           ISD::FCEIL,    ISD::FTRUNC,     ISD::FRINT,   ISD::FNEARBYINT,
215           ISD::FROUND,   ISD::FROUNDEVEN, ISD::FFLOOR,  ISD::FCANONICALIZE,
216           ISD::SETCC}) {
217       // FIXME: The promoted to type shouldn't need to be explicit
218       setOperationAction(Opc, MVT::bf16, Promote);
219       AddPromotedToType(Opc, MVT::bf16, MVT::f32);
220     }
221 
222     setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
223 
224     setOperationAction(ISD::SELECT, MVT::bf16, Promote);
225     AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
226 
227     setOperationAction(ISD::FABS, MVT::bf16, Legal);
228     setOperationAction(ISD::FNEG, MVT::bf16, Legal);
229     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);
230 
231     // We only need to custom lower because we can't specify an action for bf16
232     // sources.
233     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
234     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
235   }
236 
237   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
238   setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
239   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
240   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
241   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
242   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
243   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
244   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
245   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
246   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
247   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
248   setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
249   setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
250   setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
251   setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
252   setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
253 
254   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
255   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
256   setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
257   setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
258   setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
259   setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
260   setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
261 
262   setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
263 
264   setOperationAction(ISD::SELECT, MVT::i1, Promote);
265   setOperationAction(ISD::SELECT, MVT::i64, Custom);
266   setOperationAction(ISD::SELECT, MVT::f64, Promote);
267   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
268 
269   setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
270 
271   setOperationAction(ISD::SELECT_CC,
272                      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
273 
274   setOperationAction(ISD::SETCC, MVT::i1, Promote);
275   setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
276   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
277 
278   setOperationAction(ISD::TRUNCATE,
279                      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
280                       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
281                       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
282                      Expand);
283   setOperationAction(ISD::FP_ROUND,
284                      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
285                       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
286                       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
287                      Expand);
288 
289   setOperationAction(ISD::SIGN_EXTEND_INREG,
290                      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
291                       MVT::v3i16, MVT::v4i16, MVT::Other},
292                      Custom);
293 
294   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
295   setOperationAction(ISD::BR_CC,
296                      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
297 
298   setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
299 
300   setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
301 
302   setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
303                      Expand);
304 
305 #if 0
306   setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
307 #endif
308 
309   // We only support LOAD/STORE and vector manipulation ops for vectors
310   // with > 4 elements.
311   for (MVT VT :
312        {MVT::v8i32,   MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
313         MVT::v10f32,  MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
314         MVT::v16i32,  MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
315         MVT::v4f16,   MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
316         MVT::v6f32,   MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
317         MVT::v8i16,   MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
318         MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
319         MVT::v32i16,  MVT::v32f16, MVT::v32bf16}) {
320     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
321       switch (Op) {
322       case ISD::LOAD:
323       case ISD::STORE:
324       case ISD::BUILD_VECTOR:
325       case ISD::BITCAST:
326       case ISD::UNDEF:
327       case ISD::EXTRACT_VECTOR_ELT:
328       case ISD::INSERT_VECTOR_ELT:
329       case ISD::SCALAR_TO_VECTOR:
330       case ISD::IS_FPCLASS:
331         break;
332       case ISD::EXTRACT_SUBVECTOR:
333       case ISD::INSERT_SUBVECTOR:
334       case ISD::CONCAT_VECTORS:
335         setOperationAction(Op, VT, Custom);
336         break;
337       default:
338         setOperationAction(Op, VT, Expand);
339         break;
340       }
341     }
342   }
343 
344   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
345 
346   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
347   // is expanded to avoid having two separate loops in case the index is a VGPR.
348 
349   // Most operations are naturally 32-bit vector operations. We only support
350   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
351   for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
352     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
353     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
354 
355     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
356     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
357 
358     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
359     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
360 
361     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
362     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
363   }
364 
365   for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
366     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
367     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
368 
369     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
370     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
371 
372     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
373     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
374 
375     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
376     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
377   }
378 
379   for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
380     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
381     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
382 
383     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
384     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
385 
386     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
387     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
388 
389     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
390     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
391   }
392 
393   for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
394     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
395     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
396 
397     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
398     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
399 
400     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
401     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
402 
403     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
404     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
405   }
406 
407   for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
408     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
409     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
410 
411     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
412     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
413 
414     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
415     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
416 
417     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
418     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
419   }
420 
421   setOperationAction(ISD::VECTOR_SHUFFLE,
422                      {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
423                       MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
424                      Custom);
425 
426   if (Subtarget->hasPkMovB32()) {
427     // TODO: 16-bit element vectors should be legal with even aligned elements.
428     // TODO: Can be legal with wider source types than the result with
429     // subregister extracts.
430     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
431   }
432 
433   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
434                      Custom);
435 
436   // Avoid stack access for these.
437   // TODO: Generalize to more vector types.
438   setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
439                      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
440                       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
441                      Custom);
442 
443   // Deal with vec3 vector operations when widened to vec4.
444   setOperationAction(ISD::INSERT_SUBVECTOR,
445                      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
446 
447   // Deal with vec5/6/7 vector operations when widened to vec8.
448   setOperationAction(ISD::INSERT_SUBVECTOR,
449                      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
450                       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
451                       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
452                       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
453                      Custom);
454 
455   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
456   // and output demarshalling
457   setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
458 
459   // We can't return success/failure, only the old value,
460   // let LLVM add the comparison
461   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
462                      Expand);
463 
464   setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
465 
466   setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
467 
468   // FIXME: This should be narrowed to i32, but that only happens if i64 is
469   // illegal.
470   // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
471   setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
472 
473   // On SI this is s_memtime and s_memrealtime on VI.
474   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
475 
476   if (Subtarget->hasSMemRealTime() ||
477       Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
478     setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
479   setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
480 
481   if (Subtarget->has16BitInsts()) {
482     setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
483     setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
484   } else {
485     setOperationAction(ISD::FSQRT, MVT::f16, Custom);
486   }
487 
488   if (Subtarget->hasMadMacF32Insts())
489     setOperationAction(ISD::FMAD, MVT::f32, Legal);
490 
491   if (!Subtarget->hasBFI())
492     // fcopysign can be done in a single instruction with BFI.
493     setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
494 
495   if (!Subtarget->hasBCNT(32))
496     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
497 
498   if (!Subtarget->hasBCNT(64))
499     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
500 
501   if (Subtarget->hasFFBH())
502     setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
503 
504   if (Subtarget->hasFFBL())
505     setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
506 
507   // We only really have 32-bit BFE instructions (and 16-bit on VI).
508   //
509   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
510   // effort to match them now. We want this to be false for i64 cases when the
511   // extraction isn't restricted to the upper or lower half. Ideally we would
512   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
513   // span the midpoint are probably relatively rare, so don't worry about them
514   // for now.
515   if (Subtarget->hasBFE())
516     setHasExtractBitsInsn(true);
517 
518   // Clamp modifier on add/sub
519   if (Subtarget->hasIntClamp())
520     setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
521 
522   if (Subtarget->hasAddNoCarry())
523     setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
524                        Legal);
525 
526   setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
527                      Custom);
528 
529   // These are really only legal for ieee_mode functions. We should be avoiding
530   // them for functions that don't have ieee_mode enabled, so just say they are
531   // legal.
532   setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
533                      {MVT::f32, MVT::f64}, Legal);
534 
535   if (Subtarget->haveRoundOpsF64())
536     setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
537                        Legal);
538   else
539     setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
540                        MVT::f64, Custom);
541 
542   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
543   setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
544                      Legal);
545   setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
546 
547   setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
548   setOperationAction(ISD::FDIV, MVT::f64, Custom);
549 
550   setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
551   setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
552 
553   // Custom lower these because we can't specify a rule based on an illegal
554   // source bf16.
555   setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
556   setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
557 
558   if (Subtarget->has16BitInsts()) {
559     setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
560                         ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
561                        MVT::i16, Legal);
562 
563     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
564 
565     setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
566                        MVT::i16, Expand);
567 
568     setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
569                         ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
570                         ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
571                         ISD::CTPOP},
572                        MVT::i16, Promote);
573 
574     setOperationAction(ISD::LOAD, MVT::i16, Custom);
575 
576     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
577 
578     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
579     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
580     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
581     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
582 
583     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
584     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
585     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
586 
587     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
588 
589     // F16 - Constant Actions.
590     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
591     setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
592 
593     // F16 - Load/Store Actions.
594     setOperationAction(ISD::LOAD, MVT::f16, Promote);
595     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
596     setOperationAction(ISD::STORE, MVT::f16, Promote);
597     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
598 
599     // BF16 - Load/Store Actions.
600     setOperationAction(ISD::LOAD, MVT::bf16, Promote);
601     AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
602     setOperationAction(ISD::STORE, MVT::bf16, Promote);
603     AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
604 
605     // F16 - VOP1 Actions.
606     setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
607                         ISD::FSIN, ISD::FROUND},
608                        MVT::f16, Custom);
609 
610     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
611     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
612 
613     // F16 - VOP2 Actions.
614     setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
615                        Expand);
616     setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
617     setOperationAction(ISD::FFREXP, MVT::f16, Custom);
618     setOperationAction(ISD::FDIV, MVT::f16, Custom);
619 
620     // F16 - VOP3 Actions.
621     setOperationAction(ISD::FMA, MVT::f16, Legal);
622     if (STI.hasMadF16())
623       setOperationAction(ISD::FMAD, MVT::f16, Legal);
624 
625     for (MVT VT :
626          {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
627           MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
628           MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
629       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
630         switch (Op) {
631         case ISD::LOAD:
632         case ISD::STORE:
633         case ISD::BUILD_VECTOR:
634         case ISD::BITCAST:
635         case ISD::UNDEF:
636         case ISD::EXTRACT_VECTOR_ELT:
637         case ISD::INSERT_VECTOR_ELT:
638         case ISD::INSERT_SUBVECTOR:
639         case ISD::SCALAR_TO_VECTOR:
640         case ISD::IS_FPCLASS:
641           break;
642         case ISD::EXTRACT_SUBVECTOR:
643         case ISD::CONCAT_VECTORS:
644           setOperationAction(Op, VT, Custom);
645           break;
646         default:
647           setOperationAction(Op, VT, Expand);
648           break;
649         }
650       }
651     }
652 
653     // v_perm_b32 can handle either of these.
654     setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
655     setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
656 
657     // XXX - Do these do anything? Vector constants turn into build_vector.
658     setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
659 
660     setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
661                        Legal);
662 
663     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
664     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
665     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
666     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
667 
668     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
669     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
670     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
671     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
672 
673     setOperationAction(ISD::AND, MVT::v2i16, Promote);
674     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
675     setOperationAction(ISD::OR, MVT::v2i16, Promote);
676     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
677     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
678     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
679 
680     setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
681     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
682     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
683     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
684     setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
685     AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
686 
687     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
688     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
689     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
690     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
691     setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
692     AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
693 
694     setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
695     AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
696     setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
697     AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
698     setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
699     AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
700 
701     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
702     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
703     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
704     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
705 
706     setOperationAction(ISD::STORE, MVT::v8i16, Promote);
707     AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
708     setOperationAction(ISD::STORE, MVT::v8f16, Promote);
709     AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
710     setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
711     AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
712 
713     setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
714     AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
715     setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
716     AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
717     setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
718     AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
719 
720     setOperationAction(ISD::STORE, MVT::v16i16, Promote);
721     AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
722     setOperationAction(ISD::STORE, MVT::v16f16, Promote);
723     AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
724     setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
725     AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
726 
727     setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
728     AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
729     setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
730     AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
731     setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
732     AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
733 
734     setOperationAction(ISD::STORE, MVT::v32i16, Promote);
735     AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
736     setOperationAction(ISD::STORE, MVT::v32f16, Promote);
737     AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
738     setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
739     AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
740 
741     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
742                        MVT::v2i32, Expand);
743     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
744 
745     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
746                        MVT::v4i32, Expand);
747 
748     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
749                        MVT::v8i32, Expand);
750 
751     setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
752                        Subtarget->hasVOP3PInsts() ? Legal : Custom);
753 
754     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
755     // This isn't really legal, but this avoids the legalizer unrolling it (and
756     // allows matching fneg (fabs x) patterns)
757     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
758 
759     setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
760     setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
761 
762     setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
763                         ISD::FMAXIMUMNUM},
764                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
765                        Custom);
766 
767     setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
768                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
769                        Expand);
770 
771     for (MVT Vec16 :
772          {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
773           MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
774       setOperationAction(
775           {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
776           Vec16, Custom);
777       setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
778     }
779   }
780 
781   if (Subtarget->hasVOP3PInsts()) {
782     setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
783                         ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
784                         ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
785                        MVT::v2i16, Legal);
786 
787     setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
788                         ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
789                        MVT::v2f16, Legal);
790 
791     setOperationAction(ISD::EXTRACT_VECTOR_ELT,
792                        {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
793 
794     setOperationAction(ISD::VECTOR_SHUFFLE,
795                        {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
796                         MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
797                         MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
798                        Custom);
799 
800     for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
801       // Split vector operations.
802       setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
803                           ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
804                           ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
805                           ISD::SSUBSAT},
806                          VT, Custom);
807 
808     for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
809       // Split vector operations.
810       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
811                          VT, Custom);
812 
813     setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
814                        Custom);
815 
816     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
817     setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
818                        Custom);
819 
820     if (Subtarget->hasPackedFP32Ops()) {
821       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
822                          MVT::v2f32, Legal);
823       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
824                          {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
825                          Custom);
826     }
827   }
828 
829   setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
830 
831   if (Subtarget->has16BitInsts()) {
832     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
833     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
834     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
835     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
836   } else {
837     // Legalization hack.
838     setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
839 
840     setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
841   }
842 
843   setOperationAction(ISD::SELECT,
844                      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
845                       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
846                       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
847                       MVT::v32f16, MVT::v32bf16},
848                      Custom);
849 
850   setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
851 
852   if (Subtarget->hasScalarSMulU64())
853     setOperationAction(ISD::MUL, MVT::i64, Custom);
854 
855   if (Subtarget->hasMad64_32())
856     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
857 
858   if (Subtarget->hasPrefetch())
859     setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
860 
861   if (Subtarget->hasIEEEMinMax()) {
862     setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
863                        {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
864     setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
865                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
866                        Custom);
867   } else {
868     // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
869     if (Subtarget->hasMinimum3Maximum3F32())
870       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
871 
872     if (Subtarget->hasMinimum3Maximum3PKF16())
873       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
874   }
875 
876   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
877                      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
878                       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
879                       MVT::i8},
880                      Custom);
881 
882   setOperationAction(ISD::INTRINSIC_W_CHAIN,
883                      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
884                       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
885                       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
886                       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
887                      Custom);
888 
889   setOperationAction(ISD::INTRINSIC_VOID,
890                      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
891                       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
892                       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
893                       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
894                      Custom);
895 
896   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
897   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
898   setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
899   setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
900   setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
901 
902   // TODO: Could move this to custom lowering, could benefit from combines on
903   // extract of relevant bits.
904   setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
905 
906   setOperationAction(ISD::MUL, MVT::i1, Promote);
907 
908   if (Subtarget->hasBF16ConversionInsts()) {
909     setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
910     setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
911     setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
912   }
913 
914   if (Subtarget->hasCvtPkF16F32Inst()) {
915     setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal);
916   }
917 
918   setTargetDAGCombine({ISD::ADD,
919                        ISD::UADDO_CARRY,
920                        ISD::SUB,
921                        ISD::USUBO_CARRY,
922                        ISD::MUL,
923                        ISD::FADD,
924                        ISD::FSUB,
925                        ISD::FDIV,
926                        ISD::FMUL,
927                        ISD::FMINNUM,
928                        ISD::FMAXNUM,
929                        ISD::FMINNUM_IEEE,
930                        ISD::FMAXNUM_IEEE,
931                        ISD::FMINIMUM,
932                        ISD::FMAXIMUM,
933                        ISD::FMA,
934                        ISD::SMIN,
935                        ISD::SMAX,
936                        ISD::UMIN,
937                        ISD::UMAX,
938                        ISD::SETCC,
939                        ISD::SELECT,
940                        ISD::SMIN,
941                        ISD::SMAX,
942                        ISD::UMIN,
943                        ISD::UMAX,
944                        ISD::AND,
945                        ISD::OR,
946                        ISD::XOR,
947                        ISD::SHL,
948                        ISD::SRL,
949                        ISD::SRA,
950                        ISD::FSHR,
951                        ISD::SINT_TO_FP,
952                        ISD::UINT_TO_FP,
953                        ISD::FCANONICALIZE,
954                        ISD::SCALAR_TO_VECTOR,
955                        ISD::ZERO_EXTEND,
956                        ISD::SIGN_EXTEND_INREG,
957                        ISD::EXTRACT_VECTOR_ELT,
958                        ISD::INSERT_VECTOR_ELT,
959                        ISD::FCOPYSIGN});
960 
961   if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
962     setTargetDAGCombine(ISD::FP_ROUND);
963 
964   // All memory operations. Some folding on the pointer operand is done to help
965   // matching the constant offsets in the addressing modes.
966   setTargetDAGCombine({ISD::LOAD,
967                        ISD::STORE,
968                        ISD::ATOMIC_LOAD,
969                        ISD::ATOMIC_STORE,
970                        ISD::ATOMIC_CMP_SWAP,
971                        ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
972                        ISD::ATOMIC_SWAP,
973                        ISD::ATOMIC_LOAD_ADD,
974                        ISD::ATOMIC_LOAD_SUB,
975                        ISD::ATOMIC_LOAD_AND,
976                        ISD::ATOMIC_LOAD_OR,
977                        ISD::ATOMIC_LOAD_XOR,
978                        ISD::ATOMIC_LOAD_NAND,
979                        ISD::ATOMIC_LOAD_MIN,
980                        ISD::ATOMIC_LOAD_MAX,
981                        ISD::ATOMIC_LOAD_UMIN,
982                        ISD::ATOMIC_LOAD_UMAX,
983                        ISD::ATOMIC_LOAD_FADD,
984                        ISD::ATOMIC_LOAD_FMIN,
985                        ISD::ATOMIC_LOAD_FMAX,
986                        ISD::ATOMIC_LOAD_UINC_WRAP,
987                        ISD::ATOMIC_LOAD_UDEC_WRAP,
988                        ISD::INTRINSIC_VOID,
989                        ISD::INTRINSIC_W_CHAIN});
990 
991   // FIXME: In other contexts we pretend this is a per-function property.
992   setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
993 
994   setSchedulingPreference(Sched::RegPressure);
995 }
996 
997 const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
998 
999 ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1000   static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1001   return RCRegs;
1002 }
1003 
1004 //===----------------------------------------------------------------------===//
1005 // TargetLowering queries
1006 //===----------------------------------------------------------------------===//
1007 
1008 // v_mad_mix* support a conversion from f16 to f32.
1009 //
1010 // There is only one special case when denormals are enabled we don't currently,
1011 // where this is OK to use.
1012 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1013                                        EVT DestVT, EVT SrcVT) const {
1014   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1015           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1016          DestVT.getScalarType() == MVT::f32 &&
1017          SrcVT.getScalarType() == MVT::f16 &&
1018          // TODO: This probably only requires no input flushing?
1019          denormalModeIsFlushAllF32(DAG.getMachineFunction());
1020 }
1021 
1022 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1023                                        LLT DestTy, LLT SrcTy) const {
1024   return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1025           (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1026          DestTy.getScalarSizeInBits() == 32 &&
1027          SrcTy.getScalarSizeInBits() == 16 &&
1028          // TODO: This probably only requires no input flushing?
1029          denormalModeIsFlushAllF32(*MI.getMF());
1030 }
1031 
1032 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1033   // SI has some legal vector types, but no legal vector operations. Say no
1034   // shuffles are legal in order to prefer scalarizing some vector operations.
1035   return false;
1036 }
1037 
1038 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1039                                                     CallingConv::ID CC,
1040                                                     EVT VT) const {
1041   if (CC == CallingConv::AMDGPU_KERNEL)
1042     return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1043 
1044   if (VT.isVector()) {
1045     EVT ScalarVT = VT.getScalarType();
1046     unsigned Size = ScalarVT.getSizeInBits();
1047     if (Size == 16) {
1048       if (Subtarget->has16BitInsts()) {
1049         if (VT.isInteger())
1050           return MVT::v2i16;
1051         return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1052       }
1053       return VT.isInteger() ? MVT::i32 : MVT::f32;
1054     }
1055 
1056     if (Size < 16)
1057       return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1058     return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1059   }
1060 
1061   if (VT.getSizeInBits() > 32)
1062     return MVT::i32;
1063 
1064   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1065 }
1066 
1067 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1068                                                          CallingConv::ID CC,
1069                                                          EVT VT) const {
1070   if (CC == CallingConv::AMDGPU_KERNEL)
1071     return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1072 
1073   if (VT.isVector()) {
1074     unsigned NumElts = VT.getVectorNumElements();
1075     EVT ScalarVT = VT.getScalarType();
1076     unsigned Size = ScalarVT.getSizeInBits();
1077 
1078     // FIXME: Should probably promote 8-bit vectors to i16.
1079     if (Size == 16 && Subtarget->has16BitInsts())
1080       return (NumElts + 1) / 2;
1081 
1082     if (Size <= 32)
1083       return NumElts;
1084 
1085     if (Size > 32)
1086       return NumElts * ((Size + 31) / 32);
1087   } else if (VT.getSizeInBits() > 32)
1088     return (VT.getSizeInBits() + 31) / 32;
1089 
1090   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1091 }
1092 
1093 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1094     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1095     unsigned &NumIntermediates, MVT &RegisterVT) const {
1096   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1097     unsigned NumElts = VT.getVectorNumElements();
1098     EVT ScalarVT = VT.getScalarType();
1099     unsigned Size = ScalarVT.getSizeInBits();
1100     // FIXME: We should fix the ABI to be the same on targets without 16-bit
1101     // support, but unless we can properly handle 3-vectors, it will be still be
1102     // inconsistent.
1103     if (Size == 16 && Subtarget->has16BitInsts()) {
1104       if (ScalarVT == MVT::bf16) {
1105         RegisterVT = MVT::i32;
1106         IntermediateVT = MVT::v2bf16;
1107       } else {
1108         RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1109         IntermediateVT = RegisterVT;
1110       }
1111       NumIntermediates = (NumElts + 1) / 2;
1112       return NumIntermediates;
1113     }
1114 
1115     if (Size == 32) {
1116       RegisterVT = ScalarVT.getSimpleVT();
1117       IntermediateVT = RegisterVT;
1118       NumIntermediates = NumElts;
1119       return NumIntermediates;
1120     }
1121 
1122     if (Size < 16 && Subtarget->has16BitInsts()) {
1123       // FIXME: Should probably form v2i16 pieces
1124       RegisterVT = MVT::i16;
1125       IntermediateVT = ScalarVT;
1126       NumIntermediates = NumElts;
1127       return NumIntermediates;
1128     }
1129 
1130     if (Size != 16 && Size <= 32) {
1131       RegisterVT = MVT::i32;
1132       IntermediateVT = ScalarVT;
1133       NumIntermediates = NumElts;
1134       return NumIntermediates;
1135     }
1136 
1137     if (Size > 32) {
1138       RegisterVT = MVT::i32;
1139       IntermediateVT = RegisterVT;
1140       NumIntermediates = NumElts * ((Size + 31) / 32);
1141       return NumIntermediates;
1142     }
1143   }
1144 
1145   return TargetLowering::getVectorTypeBreakdownForCallingConv(
1146       Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1147 }
1148 
1149 static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1150                                  const DataLayout &DL, Type *Ty,
1151                                  unsigned MaxNumLanes) {
1152   assert(MaxNumLanes != 0);
1153 
1154   LLVMContext &Ctx = Ty->getContext();
1155   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1156     unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1157     return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1158                             NumElts);
1159   }
1160 
1161   return TLI.getValueType(DL, Ty);
1162 }
1163 
1164 // Peek through TFE struct returns to only use the data size.
1165 static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1166                                    const DataLayout &DL, Type *Ty,
1167                                    unsigned MaxNumLanes) {
1168   auto *ST = dyn_cast<StructType>(Ty);
1169   if (!ST)
1170     return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1171 
1172   // TFE intrinsics return an aggregate type.
1173   assert(ST->getNumContainedTypes() == 2 &&
1174          ST->getContainedType(1)->isIntegerTy(32));
1175   return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1176 }
1177 
1178 /// Map address space 7 to MVT::v5i32 because that's its in-memory
1179 /// representation. This return value is vector-typed because there is no
1180 /// MVT::i160 and it is not clear if one can be added. While this could
1181 /// cause issues during codegen, these address space 7 pointers will be
1182 /// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1183 /// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1184 /// modeling, to work.
1185 MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1186   if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1187     return MVT::v5i32;
1188   if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1189       DL.getPointerSizeInBits(AS) == 192)
1190     return MVT::v6i32;
1191   return AMDGPUTargetLowering::getPointerTy(DL, AS);
1192 }
1193 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1194 /// v8i32 when padding is added.
1195 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1196 /// also v8i32 with padding.
1197 MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1198   if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1199        DL.getPointerSizeInBits(AS) == 160) ||
1200       (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1201        DL.getPointerSizeInBits(AS) == 192))
1202     return MVT::v8i32;
1203   return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1204 }
1205 
1206 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1207                                           const CallInst &CI,
1208                                           MachineFunction &MF,
1209                                           unsigned IntrID) const {
1210   Info.flags = MachineMemOperand::MONone;
1211   if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1212     Info.flags |= MachineMemOperand::MOInvariant;
1213   if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1214     Info.flags |= MachineMemOperand::MONonTemporal;
1215   Info.flags |= getTargetMMOFlags(CI);
1216 
1217   if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1218           AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1219     AttributeList Attr =
1220         Intrinsic::getAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
1221     MemoryEffects ME = Attr.getMemoryEffects();
1222     if (ME.doesNotAccessMemory())
1223       return false;
1224 
1225     // TODO: Should images get their own address space?
1226     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1227 
1228     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1229     if (RsrcIntr->IsImage) {
1230       const AMDGPU::ImageDimIntrinsicInfo *Intr =
1231           AMDGPU::getImageDimIntrinsicInfo(IntrID);
1232       BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1233       Info.align.reset();
1234     }
1235 
1236     Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1237     if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1238       if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1239         // We conservatively set the memory operand of a buffer intrinsic to the
1240         // base resource pointer, so that we can access alias information about
1241         // those pointers. Cases like "this points at the same value
1242         // but with a different offset" are handled in
1243         // areMemAccessesTriviallyDisjoint.
1244         Info.ptrVal = RsrcArg;
1245     }
1246 
1247     bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1248     if (!IsSPrefetch) {
1249       auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1250       if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1251         Info.flags |= MachineMemOperand::MOVolatile;
1252     }
1253 
1254     Info.flags |= MachineMemOperand::MODereferenceable;
1255     if (ME.onlyReadsMemory()) {
1256       if (RsrcIntr->IsImage) {
1257         unsigned MaxNumLanes = 4;
1258 
1259         if (!BaseOpcode->Gather4) {
1260           // If this isn't a gather, we may have excess loaded elements in the
1261           // IR type. Check the dmask for the real number of elements loaded.
1262           unsigned DMask =
1263               cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1264           MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1265         }
1266 
1267         Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1268                                              CI.getType(), MaxNumLanes);
1269       } else {
1270         Info.memVT =
1271             memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1272                                     std::numeric_limits<unsigned>::max());
1273       }
1274 
1275       // FIXME: What does alignment mean for an image?
1276       Info.opc = ISD::INTRINSIC_W_CHAIN;
1277       Info.flags |= MachineMemOperand::MOLoad;
1278     } else if (ME.onlyWritesMemory()) {
1279       Info.opc = ISD::INTRINSIC_VOID;
1280 
1281       Type *DataTy = CI.getArgOperand(0)->getType();
1282       if (RsrcIntr->IsImage) {
1283         unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1284         unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1285         Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1286                                            DMaskLanes);
1287       } else
1288         Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1289 
1290       Info.flags |= MachineMemOperand::MOStore;
1291     } else {
1292       // Atomic, NoReturn Sampler or prefetch
1293       Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1294                                           : ISD::INTRINSIC_W_CHAIN;
1295       Info.flags |=
1296           MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1297 
1298       if (!IsSPrefetch)
1299         Info.flags |= MachineMemOperand::MOStore;
1300 
1301       switch (IntrID) {
1302       default:
1303         if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1304           // Fake memory access type for no return sampler intrinsics
1305           Info.memVT = MVT::i32;
1306         } else {
1307           // XXX - Should this be volatile without known ordering?
1308           Info.flags |= MachineMemOperand::MOVolatile;
1309           Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1310         }
1311         break;
1312       case Intrinsic::amdgcn_raw_buffer_load_lds:
1313       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1314       case Intrinsic::amdgcn_struct_buffer_load_lds:
1315       case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1316         unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1317         Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1318         Info.ptrVal = CI.getArgOperand(1);
1319         return true;
1320       }
1321       case Intrinsic::amdgcn_raw_atomic_buffer_load:
1322       case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1323       case Intrinsic::amdgcn_struct_atomic_buffer_load:
1324       case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1325         Info.memVT =
1326             memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1327                                     std::numeric_limits<unsigned>::max());
1328         Info.flags &= ~MachineMemOperand::MOStore;
1329         return true;
1330       }
1331       }
1332     }
1333     return true;
1334   }
1335 
1336   switch (IntrID) {
1337   case Intrinsic::amdgcn_ds_ordered_add:
1338   case Intrinsic::amdgcn_ds_ordered_swap: {
1339     Info.opc = ISD::INTRINSIC_W_CHAIN;
1340     Info.memVT = MVT::getVT(CI.getType());
1341     Info.ptrVal = CI.getOperand(0);
1342     Info.align.reset();
1343     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1344 
1345     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1346     if (!Vol->isZero())
1347       Info.flags |= MachineMemOperand::MOVolatile;
1348 
1349     return true;
1350   }
1351   case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1352   case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1353     Info.opc = ISD::INTRINSIC_W_CHAIN;
1354     Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1355     Info.ptrVal = nullptr;
1356     Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1357     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1358     return true;
1359   }
1360   case Intrinsic::amdgcn_ds_append:
1361   case Intrinsic::amdgcn_ds_consume: {
1362     Info.opc = ISD::INTRINSIC_W_CHAIN;
1363     Info.memVT = MVT::getVT(CI.getType());
1364     Info.ptrVal = CI.getOperand(0);
1365     Info.align.reset();
1366     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1367 
1368     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1369     if (!Vol->isZero())
1370       Info.flags |= MachineMemOperand::MOVolatile;
1371 
1372     return true;
1373   }
1374   case Intrinsic::amdgcn_global_atomic_csub: {
1375     Info.opc = ISD::INTRINSIC_W_CHAIN;
1376     Info.memVT = MVT::getVT(CI.getType());
1377     Info.ptrVal = CI.getOperand(0);
1378     Info.align.reset();
1379     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1380                   MachineMemOperand::MOVolatile;
1381     return true;
1382   }
1383   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1384     Info.opc = ISD::INTRINSIC_W_CHAIN;
1385     Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1386 
1387     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1388     Info.align.reset();
1389     Info.flags |=
1390         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1391     return true;
1392   }
1393   case Intrinsic::amdgcn_global_atomic_fmin_num:
1394   case Intrinsic::amdgcn_global_atomic_fmax_num:
1395   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1396   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1397   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1398   case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1399     Info.opc = ISD::INTRINSIC_W_CHAIN;
1400     Info.memVT = MVT::getVT(CI.getType());
1401     Info.ptrVal = CI.getOperand(0);
1402     Info.align.reset();
1403     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1404                   MachineMemOperand::MODereferenceable |
1405                   MachineMemOperand::MOVolatile;
1406     return true;
1407   }
1408   case Intrinsic::amdgcn_global_load_tr_b64:
1409   case Intrinsic::amdgcn_global_load_tr_b128:
1410   case Intrinsic::amdgcn_ds_read_tr4_b64:
1411   case Intrinsic::amdgcn_ds_read_tr6_b96:
1412   case Intrinsic::amdgcn_ds_read_tr8_b64:
1413   case Intrinsic::amdgcn_ds_read_tr16_b64: {
1414     Info.opc = ISD::INTRINSIC_W_CHAIN;
1415     Info.memVT = MVT::getVT(CI.getType());
1416     Info.ptrVal = CI.getOperand(0);
1417     Info.align.reset();
1418     Info.flags |= MachineMemOperand::MOLoad;
1419     return true;
1420   }
1421   case Intrinsic::amdgcn_ds_gws_init:
1422   case Intrinsic::amdgcn_ds_gws_barrier:
1423   case Intrinsic::amdgcn_ds_gws_sema_v:
1424   case Intrinsic::amdgcn_ds_gws_sema_br:
1425   case Intrinsic::amdgcn_ds_gws_sema_p:
1426   case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1427     Info.opc = ISD::INTRINSIC_VOID;
1428 
1429     const GCNTargetMachine &TM =
1430         static_cast<const GCNTargetMachine &>(getTargetMachine());
1431 
1432     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1433     Info.ptrVal = MFI->getGWSPSV(TM);
1434 
1435     // This is an abstract access, but we need to specify a type and size.
1436     Info.memVT = MVT::i32;
1437     Info.size = 4;
1438     Info.align = Align(4);
1439 
1440     if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1441       Info.flags |= MachineMemOperand::MOLoad;
1442     else
1443       Info.flags |= MachineMemOperand::MOStore;
1444     return true;
1445   }
1446   case Intrinsic::amdgcn_global_load_lds: {
1447     Info.opc = ISD::INTRINSIC_VOID;
1448     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1449     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1450     Info.ptrVal = CI.getArgOperand(1);
1451     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1452     return true;
1453   }
1454   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1455     Info.opc = ISD::INTRINSIC_W_CHAIN;
1456 
1457     const GCNTargetMachine &TM =
1458         static_cast<const GCNTargetMachine &>(getTargetMachine());
1459 
1460     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1461     Info.ptrVal = MFI->getGWSPSV(TM);
1462 
1463     // This is an abstract access, but we need to specify a type and size.
1464     Info.memVT = MVT::i32;
1465     Info.size = 4;
1466     Info.align = Align(4);
1467 
1468     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1469     return true;
1470   }
1471   case Intrinsic::amdgcn_s_prefetch_data: {
1472     Info.opc = ISD::INTRINSIC_VOID;
1473     Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1474     Info.ptrVal = CI.getArgOperand(0);
1475     Info.flags |= MachineMemOperand::MOLoad;
1476     return true;
1477   }
1478   default:
1479     return false;
1480   }
1481 }
1482 
1483 void SITargetLowering::CollectTargetIntrinsicOperands(
1484     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1485   switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1486   case Intrinsic::amdgcn_addrspacecast_nonnull: {
1487     // The DAG's ValueType loses the addrspaces.
1488     // Add them as 2 extra Constant operands "from" and "to".
1489     unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1490     unsigned DstAS = I.getType()->getPointerAddressSpace();
1491     Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1492     Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1493     break;
1494   }
1495   default:
1496     break;
1497   }
1498 }
1499 
1500 bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1501                                             SmallVectorImpl<Value *> &Ops,
1502                                             Type *&AccessTy) const {
1503   Value *Ptr = nullptr;
1504   switch (II->getIntrinsicID()) {
1505   case Intrinsic::amdgcn_atomic_cond_sub_u32:
1506   case Intrinsic::amdgcn_ds_append:
1507   case Intrinsic::amdgcn_ds_consume:
1508   case Intrinsic::amdgcn_ds_read_tr4_b64:
1509   case Intrinsic::amdgcn_ds_read_tr6_b96:
1510   case Intrinsic::amdgcn_ds_read_tr8_b64:
1511   case Intrinsic::amdgcn_ds_read_tr16_b64:
1512   case Intrinsic::amdgcn_ds_ordered_add:
1513   case Intrinsic::amdgcn_ds_ordered_swap:
1514   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1515   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1516   case Intrinsic::amdgcn_global_atomic_csub:
1517   case Intrinsic::amdgcn_global_atomic_fmax_num:
1518   case Intrinsic::amdgcn_global_atomic_fmin_num:
1519   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1520   case Intrinsic::amdgcn_global_load_tr_b64:
1521   case Intrinsic::amdgcn_global_load_tr_b128:
1522     Ptr = II->getArgOperand(0);
1523     break;
1524   case Intrinsic::amdgcn_global_load_lds:
1525     Ptr = II->getArgOperand(1);
1526     break;
1527   default:
1528     return false;
1529   }
1530   AccessTy = II->getType();
1531   Ops.push_back(Ptr);
1532   return true;
1533 }
1534 
1535 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1536                                                  unsigned AddrSpace) const {
1537   if (!Subtarget->hasFlatInstOffsets()) {
1538     // Flat instructions do not have offsets, and only have the register
1539     // address.
1540     return AM.BaseOffs == 0 && AM.Scale == 0;
1541   }
1542 
1543   decltype(SIInstrFlags::FLAT) FlatVariant =
1544       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
1545       : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1546                                                : SIInstrFlags::FLAT;
1547 
1548   return AM.Scale == 0 &&
1549          (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1550                                   AM.BaseOffs, AddrSpace, FlatVariant));
1551 }
1552 
1553 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1554   if (Subtarget->hasFlatGlobalInsts())
1555     return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1556 
1557   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1558     // Assume the we will use FLAT for all global memory accesses
1559     // on VI.
1560     // FIXME: This assumption is currently wrong.  On VI we still use
1561     // MUBUF instructions for the r + i addressing mode.  As currently
1562     // implemented, the MUBUF instructions only work on buffer < 4GB.
1563     // It may be possible to support > 4GB buffers with MUBUF instructions,
1564     // by setting the stride value in the resource descriptor which would
1565     // increase the size limit to (stride * 4GB).  However, this is risky,
1566     // because it has never been validated.
1567     return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1568   }
1569 
1570   return isLegalMUBUFAddressingMode(AM);
1571 }
1572 
1573 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1574   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1575   // additionally can do r + r + i with addr64. 32-bit has more addressing
1576   // mode options. Depending on the resource constant, it can also do
1577   // (i64 r0) + (i32 r1) * (i14 i).
1578   //
1579   // Private arrays end up using a scratch buffer most of the time, so also
1580   // assume those use MUBUF instructions. Scratch loads / stores are currently
1581   // implemented as mubuf instructions with offen bit set, so slightly
1582   // different than the normal addr64.
1583   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1584   if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1585     return false;
1586 
1587   // FIXME: Since we can split immediate into soffset and immediate offset,
1588   // would it make sense to allow any immediate?
1589 
1590   switch (AM.Scale) {
1591   case 0: // r + i or just i, depending on HasBaseReg.
1592     return true;
1593   case 1:
1594     return true; // We have r + r or r + i.
1595   case 2:
1596     if (AM.HasBaseReg) {
1597       // Reject 2 * r + r.
1598       return false;
1599     }
1600 
1601     // Allow 2 * r as r + r
1602     // Or  2 * r + i is allowed as r + r + i.
1603     return true;
1604   default: // Don't allow n * r
1605     return false;
1606   }
1607 }
1608 
1609 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1610                                              const AddrMode &AM, Type *Ty,
1611                                              unsigned AS,
1612                                              Instruction *I) const {
1613   // No global is ever allowed as a base.
1614   if (AM.BaseGV)
1615     return false;
1616 
1617   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1618     return isLegalGlobalAddressingMode(AM);
1619 
1620   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1621       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1622       AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1623       AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1624     // If the offset isn't a multiple of 4, it probably isn't going to be
1625     // correctly aligned.
1626     // FIXME: Can we get the real alignment here?
1627     if (AM.BaseOffs % 4 != 0)
1628       return isLegalMUBUFAddressingMode(AM);
1629 
1630     if (!Subtarget->hasScalarSubwordLoads()) {
1631       // There are no SMRD extloads, so if we have to do a small type access we
1632       // will use a MUBUF load.
1633       // FIXME?: We also need to do this if unaligned, but we don't know the
1634       // alignment here.
1635       if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1636         return isLegalGlobalAddressingMode(AM);
1637     }
1638 
1639     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1640       // SMRD instructions have an 8-bit, dword offset on SI.
1641       if (!isUInt<8>(AM.BaseOffs / 4))
1642         return false;
1643     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1644       // On CI+, this can also be a 32-bit literal constant offset. If it fits
1645       // in 8-bits, it can use a smaller encoding.
1646       if (!isUInt<32>(AM.BaseOffs / 4))
1647         return false;
1648     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1649       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1650       if (!isUInt<20>(AM.BaseOffs))
1651         return false;
1652     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1653       // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1654       // for S_BUFFER_* instructions).
1655       if (!isInt<21>(AM.BaseOffs))
1656         return false;
1657     } else {
1658       // On GFX12, all offsets are signed 24-bit in bytes.
1659       if (!isInt<24>(AM.BaseOffs))
1660         return false;
1661     }
1662 
1663     if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1664          AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1665         AM.BaseOffs < 0) {
1666       // Scalar (non-buffer) loads can only use a negative offset if
1667       // soffset+offset is non-negative. Since the compiler can only prove that
1668       // in a few special cases, it is safer to claim that negative offsets are
1669       // not supported.
1670       return false;
1671     }
1672 
1673     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1674       return true;
1675 
1676     if (AM.Scale == 1 && AM.HasBaseReg)
1677       return true;
1678 
1679     return false;
1680   }
1681 
1682   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1683     return Subtarget->enableFlatScratch()
1684                ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1685                : isLegalMUBUFAddressingMode(AM);
1686 
1687   if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1688       (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1689     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1690     // field.
1691     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1692     // an 8-bit dword offset but we don't know the alignment here.
1693     if (!isUInt<16>(AM.BaseOffs))
1694       return false;
1695 
1696     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1697       return true;
1698 
1699     if (AM.Scale == 1 && AM.HasBaseReg)
1700       return true;
1701 
1702     return false;
1703   }
1704 
1705   if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1706     // For an unknown address space, this usually means that this is for some
1707     // reason being used for pure arithmetic, and not based on some addressing
1708     // computation. We don't have instructions that compute pointers with any
1709     // addressing modes, so treat them as having no offset like flat
1710     // instructions.
1711     return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1712   }
1713 
1714   // Assume a user alias of global for unknown address spaces.
1715   return isLegalGlobalAddressingMode(AM);
1716 }
1717 
1718 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1719                                         const MachineFunction &MF) const {
1720   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1721     return (MemVT.getSizeInBits() <= 4 * 32);
1722   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1723     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1724     return (MemVT.getSizeInBits() <= MaxPrivateBits);
1725   }
1726   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1727     return (MemVT.getSizeInBits() <= 2 * 32);
1728   return true;
1729 }
1730 
1731 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1732     unsigned Size, unsigned AddrSpace, Align Alignment,
1733     MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1734   if (IsFast)
1735     *IsFast = 0;
1736 
1737   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1738       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1739     // Check if alignment requirements for ds_read/write instructions are
1740     // disabled.
1741     if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1742       return false;
1743 
1744     Align RequiredAlignment(
1745         PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1746     if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1747         Alignment < RequiredAlignment)
1748       return false;
1749 
1750     // Either, the alignment requirements are "enabled", or there is an
1751     // unaligned LDS access related hardware bug though alignment requirements
1752     // are "disabled". In either case, we need to check for proper alignment
1753     // requirements.
1754     //
1755     switch (Size) {
1756     case 64:
1757       // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1758       // address is negative, then the instruction is incorrectly treated as
1759       // out-of-bounds even if base + offsets is in bounds. Split vectorized
1760       // loads here to avoid emitting ds_read2_b32. We may re-combine the
1761       // load later in the SILoadStoreOptimizer.
1762       if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1763         return false;
1764 
1765       // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1766       // can do a 4 byte aligned, 8 byte access in a single operation using
1767       // ds_read2/write2_b32 with adjacent offsets.
1768       RequiredAlignment = Align(4);
1769 
1770       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1771         // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1772         // ds_write2_b32 depending on the alignment. In either case with either
1773         // alignment there is no faster way of doing this.
1774 
1775         // The numbers returned here and below are not additive, it is a 'speed
1776         // rank'. They are just meant to be compared to decide if a certain way
1777         // of lowering an operation is faster than another. For that purpose
1778         // naturally aligned operation gets it bitsize to indicate that "it
1779         // operates with a speed comparable to N-bit wide load". With the full
1780         // alignment ds128 is slower than ds96 for example. If underaligned it
1781         // is comparable to a speed of a single dword access, which would then
1782         // mean 32 < 128 and it is faster to issue a wide load regardless.
1783         // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
1784         // wider load which will not be aligned anymore the latter is slower.
1785         if (IsFast)
1786           *IsFast = (Alignment >= RequiredAlignment) ? 64
1787                     : (Alignment < Align(4))         ? 32
1788                                                      : 1;
1789         return true;
1790       }
1791 
1792       break;
1793     case 96:
1794       if (!Subtarget->hasDS96AndDS128())
1795         return false;
1796 
1797       // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1798       // gfx8 and older.
1799 
1800       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1801         // Naturally aligned access is fastest. However, also report it is Fast
1802         // if memory is aligned less than DWORD. A narrow load or store will be
1803         // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1804         // be more of them, so overall we will pay less penalty issuing a single
1805         // instruction.
1806 
1807         // See comment on the values above.
1808         if (IsFast)
1809           *IsFast = (Alignment >= RequiredAlignment) ? 96
1810                     : (Alignment < Align(4))         ? 32
1811                                                      : 1;
1812         return true;
1813       }
1814 
1815       break;
1816     case 128:
1817       if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1818         return false;
1819 
1820       // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1821       // gfx8 and older, but  we can do a 8 byte aligned, 16 byte access in a
1822       // single operation using ds_read2/write2_b64.
1823       RequiredAlignment = Align(8);
1824 
1825       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1826         // Naturally aligned access is fastest. However, also report it is Fast
1827         // if memory is aligned less than DWORD. A narrow load or store will be
1828         // be equally slow as a single ds_read_b128/ds_write_b128, but there
1829         // will be more of them, so overall we will pay less penalty issuing a
1830         // single instruction.
1831 
1832         // See comment on the values above.
1833         if (IsFast)
1834           *IsFast = (Alignment >= RequiredAlignment) ? 128
1835                     : (Alignment < Align(4))         ? 32
1836                                                      : 1;
1837         return true;
1838       }
1839 
1840       break;
1841     default:
1842       if (Size > 32)
1843         return false;
1844 
1845       break;
1846     }
1847 
1848     // See comment on the values above.
1849     // Note that we have a single-dword or sub-dword here, so if underaligned
1850     // it is a slowest possible access, hence returned value is 0.
1851     if (IsFast)
1852       *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1853 
1854     return Alignment >= RequiredAlignment ||
1855            Subtarget->hasUnalignedDSAccessEnabled();
1856   }
1857 
1858   // FIXME: We have to be conservative here and assume that flat operations
1859   // will access scratch.  If we had access to the IR function, then we
1860   // could determine if any private memory was used in the function.
1861   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1862       AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1863     bool AlignedBy4 = Alignment >= Align(4);
1864     if (IsFast)
1865       *IsFast = AlignedBy4;
1866 
1867     return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1868   }
1869 
1870   // So long as they are correct, wide global memory operations perform better
1871   // than multiple smaller memory ops -- even when misaligned
1872   if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1873     if (IsFast)
1874       *IsFast = Size;
1875 
1876     return Alignment >= Align(4) ||
1877            Subtarget->hasUnalignedBufferAccessEnabled();
1878   }
1879 
1880   // Smaller than dword value must be aligned.
1881   if (Size < 32)
1882     return false;
1883 
1884   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1885   // byte-address are ignored, thus forcing Dword alignment.
1886   // This applies to private, global, and constant memory.
1887   if (IsFast)
1888     *IsFast = 1;
1889 
1890   return Size >= 32 && Alignment >= Align(4);
1891 }
1892 
1893 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1894     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1895     unsigned *IsFast) const {
1896   return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1897                                             Alignment, Flags, IsFast);
1898 }
1899 
1900 EVT SITargetLowering::getOptimalMemOpType(
1901     const MemOp &Op, const AttributeList &FuncAttributes) const {
1902   // FIXME: Should account for address space here.
1903 
1904   // The default fallback uses the private pointer size as a guess for a type to
1905   // use. Make sure we switch these to 64-bit accesses.
1906 
1907   if (Op.size() >= 16 &&
1908       Op.isDstAligned(Align(4))) // XXX: Should only do for global
1909     return MVT::v4i32;
1910 
1911   if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1912     return MVT::v2i32;
1913 
1914   // Use the default.
1915   return MVT::Other;
1916 }
1917 
1918 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1919   const MemSDNode *MemNode = cast<MemSDNode>(N);
1920   return MemNode->getMemOperand()->getFlags() & MONoClobber;
1921 }
1922 
1923 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1924   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1925          AS == AMDGPUAS::PRIVATE_ADDRESS;
1926 }
1927 
1928 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1929                                            unsigned DestAS) const {
1930   // Flat -> private/local is a simple truncate.
1931   // Flat -> global is no-op
1932   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1933     return true;
1934 
1935   const GCNTargetMachine &TM =
1936       static_cast<const GCNTargetMachine &>(getTargetMachine());
1937   return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1938 }
1939 
1940 TargetLoweringBase::LegalizeTypeAction
1941 SITargetLowering::getPreferredVectorAction(MVT VT) const {
1942   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1943       VT.getScalarType().bitsLE(MVT::i16))
1944     return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1945   return TargetLoweringBase::getPreferredVectorAction(VT);
1946 }
1947 
1948 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1949                                                          Type *Ty) const {
1950   // FIXME: Could be smarter if called for vector constants.
1951   return true;
1952 }
1953 
1954 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1955                                                unsigned Index) const {
1956   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
1957     return false;
1958 
1959   // TODO: Add more cases that are cheap.
1960   return Index == 0;
1961 }
1962 
1963 bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
1964   // TODO: This should be more aggressive, particular for 16-bit element
1965   // vectors. However there are some mixed improvements and regressions.
1966   EVT EltTy = VT.getVectorElementType();
1967   return EltTy.getSizeInBits() % 32 == 0;
1968 }
1969 
1970 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1971   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1972     switch (Op) {
1973     case ISD::LOAD:
1974     case ISD::STORE:
1975       return true;
1976     default:
1977       return false;
1978     }
1979   }
1980 
1981   // SimplifySetCC uses this function to determine whether or not it should
1982   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
1983   if (VT == MVT::i1 && Op == ISD::SETCC)
1984     return false;
1985 
1986   return TargetLowering::isTypeDesirableForOp(Op, VT);
1987 }
1988 
1989 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1990                                                    const SDLoc &SL,
1991                                                    SDValue Chain,
1992                                                    uint64_t Offset) const {
1993   const DataLayout &DL = DAG.getDataLayout();
1994   MachineFunction &MF = DAG.getMachineFunction();
1995   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1996   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1997 
1998   auto [InputPtrReg, RC, ArgTy] =
1999       Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2000 
2001   // We may not have the kernarg segment argument if we have no kernel
2002   // arguments.
2003   if (!InputPtrReg)
2004     return DAG.getConstant(Offset, SL, PtrVT);
2005 
2006   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2007   SDValue BasePtr = DAG.getCopyFromReg(
2008       Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2009 
2010   return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2011 }
2012 
2013 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2014                                             const SDLoc &SL) const {
2015   uint64_t Offset =
2016       getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT);
2017   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2018 }
2019 
2020 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2021                                          const SDLoc &SL) const {
2022 
2023   Function &F = DAG.getMachineFunction().getFunction();
2024   std::optional<uint32_t> KnownSize =
2025       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2026   if (KnownSize.has_value())
2027     return DAG.getConstant(*KnownSize, SL, MVT::i32);
2028   return SDValue();
2029 }
2030 
2031 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2032                                          const SDLoc &SL, SDValue Val,
2033                                          bool Signed,
2034                                          const ISD::InputArg *Arg) const {
2035   // First, if it is a widened vector, narrow it.
2036   if (VT.isVector() &&
2037       VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2038     EVT NarrowedVT =
2039         EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2040                          VT.getVectorNumElements());
2041     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2042                       DAG.getConstant(0, SL, MVT::i32));
2043   }
2044 
2045   // Then convert the vector elements or scalar value.
2046   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2047     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2048     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2049   }
2050 
2051   if (MemVT.isFloatingPoint())
2052     Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2053   else if (Signed)
2054     Val = DAG.getSExtOrTrunc(Val, SL, VT);
2055   else
2056     Val = DAG.getZExtOrTrunc(Val, SL, VT);
2057 
2058   return Val;
2059 }
2060 
2061 SDValue SITargetLowering::lowerKernargMemParameter(
2062     SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2063     uint64_t Offset, Align Alignment, bool Signed,
2064     const ISD::InputArg *Arg) const {
2065   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2066 
2067   // Try to avoid using an extload by loading earlier than the argument address,
2068   // and extracting the relevant bits. The load should hopefully be merged with
2069   // the previous argument.
2070   if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2071     // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2072     int64_t AlignDownOffset = alignDown(Offset, 4);
2073     int64_t OffsetDiff = Offset - AlignDownOffset;
2074 
2075     EVT IntVT = MemVT.changeTypeToInteger();
2076 
2077     // TODO: If we passed in the base kernel offset we could have a better
2078     // alignment than 4, but we don't really need it.
2079     SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2080     SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2081                                MachineMemOperand::MODereferenceable |
2082                                    MachineMemOperand::MOInvariant);
2083 
2084     SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2085     SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2086 
2087     SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2088     ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2089     ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2090 
2091     return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2092   }
2093 
2094   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2095   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2096                              MachineMemOperand::MODereferenceable |
2097                                  MachineMemOperand::MOInvariant);
2098 
2099   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2100   return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2101 }
2102 
2103 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2104                                               CCValAssign &VA, const SDLoc &SL,
2105                                               SDValue Chain,
2106                                               const ISD::InputArg &Arg) const {
2107   MachineFunction &MF = DAG.getMachineFunction();
2108   MachineFrameInfo &MFI = MF.getFrameInfo();
2109 
2110   if (Arg.Flags.isByVal()) {
2111     unsigned Size = Arg.Flags.getByValSize();
2112     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2113     return DAG.getFrameIndex(FrameIdx, MVT::i32);
2114   }
2115 
2116   unsigned ArgOffset = VA.getLocMemOffset();
2117   unsigned ArgSize = VA.getValVT().getStoreSize();
2118 
2119   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2120 
2121   // Create load nodes to retrieve arguments from the stack.
2122   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2123   SDValue ArgValue;
2124 
2125   // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2126   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2127   MVT MemVT = VA.getValVT();
2128 
2129   switch (VA.getLocInfo()) {
2130   default:
2131     break;
2132   case CCValAssign::BCvt:
2133     MemVT = VA.getLocVT();
2134     break;
2135   case CCValAssign::SExt:
2136     ExtType = ISD::SEXTLOAD;
2137     break;
2138   case CCValAssign::ZExt:
2139     ExtType = ISD::ZEXTLOAD;
2140     break;
2141   case CCValAssign::AExt:
2142     ExtType = ISD::EXTLOAD;
2143     break;
2144   }
2145 
2146   ArgValue = DAG.getExtLoad(
2147       ExtType, SL, VA.getLocVT(), Chain, FIN,
2148       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2149   return ArgValue;
2150 }
2151 
2152 SDValue SITargetLowering::getPreloadedValue(
2153     SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2154     AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2155   const ArgDescriptor *Reg = nullptr;
2156   const TargetRegisterClass *RC;
2157   LLT Ty;
2158 
2159   CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2160   const ArgDescriptor WorkGroupIDX =
2161       ArgDescriptor::createRegister(AMDGPU::TTMP9);
2162   // If GridZ is not programmed in an entry function then the hardware will set
2163   // it to all zeros, so there is no need to mask the GridY value in the low
2164   // order bits.
2165   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2166       AMDGPU::TTMP7,
2167       AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2168   const ArgDescriptor WorkGroupIDZ =
2169       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2170   if (Subtarget->hasArchitectedSGPRs() &&
2171       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2172     switch (PVID) {
2173     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2174       Reg = &WorkGroupIDX;
2175       RC = &AMDGPU::SReg_32RegClass;
2176       Ty = LLT::scalar(32);
2177       break;
2178     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2179       Reg = &WorkGroupIDY;
2180       RC = &AMDGPU::SReg_32RegClass;
2181       Ty = LLT::scalar(32);
2182       break;
2183     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2184       Reg = &WorkGroupIDZ;
2185       RC = &AMDGPU::SReg_32RegClass;
2186       Ty = LLT::scalar(32);
2187       break;
2188     default:
2189       break;
2190     }
2191   }
2192 
2193   if (!Reg)
2194     std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2195   if (!Reg) {
2196     if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2197       // It's possible for a kernarg intrinsic call to appear in a kernel with
2198       // no allocated segment, in which case we do not add the user sgpr
2199       // argument, so just return null.
2200       return DAG.getConstant(0, SDLoc(), VT);
2201     }
2202 
2203     // It's undefined behavior if a function marked with the amdgpu-no-*
2204     // attributes uses the corresponding intrinsic.
2205     return DAG.getUNDEF(VT);
2206   }
2207 
2208   return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2209 }
2210 
2211 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2212                                CallingConv::ID CallConv,
2213                                ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2214                                FunctionType *FType,
2215                                SIMachineFunctionInfo *Info) {
2216   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2217     const ISD::InputArg *Arg = &Ins[I];
2218 
2219     assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2220            "vector type argument should have been split");
2221 
2222     // First check if it's a PS input addr.
2223     if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2224         PSInputNum <= 15) {
2225       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2226 
2227       // Inconveniently only the first part of the split is marked as isSplit,
2228       // so skip to the end. We only want to increment PSInputNum once for the
2229       // entire split argument.
2230       if (Arg->Flags.isSplit()) {
2231         while (!Arg->Flags.isSplitEnd()) {
2232           assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2233                  "unexpected vector split in ps argument type");
2234           if (!SkipArg)
2235             Splits.push_back(*Arg);
2236           Arg = &Ins[++I];
2237         }
2238       }
2239 
2240       if (SkipArg) {
2241         // We can safely skip PS inputs.
2242         Skipped.set(Arg->getOrigArgIndex());
2243         ++PSInputNum;
2244         continue;
2245       }
2246 
2247       Info->markPSInputAllocated(PSInputNum);
2248       if (Arg->Used)
2249         Info->markPSInputEnabled(PSInputNum);
2250 
2251       ++PSInputNum;
2252     }
2253 
2254     Splits.push_back(*Arg);
2255   }
2256 }
2257 
2258 // Allocate special inputs passed in VGPRs.
2259 void SITargetLowering::allocateSpecialEntryInputVGPRs(
2260     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2261     SIMachineFunctionInfo &Info) const {
2262   const LLT S32 = LLT::scalar(32);
2263   MachineRegisterInfo &MRI = MF.getRegInfo();
2264 
2265   if (Info.hasWorkItemIDX()) {
2266     Register Reg = AMDGPU::VGPR0;
2267     MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2268 
2269     CCInfo.AllocateReg(Reg);
2270     unsigned Mask =
2271         (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2272     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2273   }
2274 
2275   if (Info.hasWorkItemIDY()) {
2276     assert(Info.hasWorkItemIDX());
2277     if (Subtarget->hasPackedTID()) {
2278       Info.setWorkItemIDY(
2279           ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2280     } else {
2281       unsigned Reg = AMDGPU::VGPR1;
2282       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2283 
2284       CCInfo.AllocateReg(Reg);
2285       Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2286     }
2287   }
2288 
2289   if (Info.hasWorkItemIDZ()) {
2290     assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2291     if (Subtarget->hasPackedTID()) {
2292       Info.setWorkItemIDZ(
2293           ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2294     } else {
2295       unsigned Reg = AMDGPU::VGPR2;
2296       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2297 
2298       CCInfo.AllocateReg(Reg);
2299       Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2300     }
2301   }
2302 }
2303 
2304 // Try to allocate a VGPR at the end of the argument list, or if no argument
2305 // VGPRs are left allocating a stack slot.
2306 // If \p Mask is is given it indicates bitfield position in the register.
2307 // If \p Arg is given use it with new ]p Mask instead of allocating new.
2308 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2309                                          ArgDescriptor Arg = ArgDescriptor()) {
2310   if (Arg.isSet())
2311     return ArgDescriptor::createArg(Arg, Mask);
2312 
2313   ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2314   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2315   if (RegIdx == ArgVGPRs.size()) {
2316     // Spill to stack required.
2317     int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2318 
2319     return ArgDescriptor::createStack(Offset, Mask);
2320   }
2321 
2322   unsigned Reg = ArgVGPRs[RegIdx];
2323   Reg = CCInfo.AllocateReg(Reg);
2324   assert(Reg != AMDGPU::NoRegister);
2325 
2326   MachineFunction &MF = CCInfo.getMachineFunction();
2327   Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2328   MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2329   return ArgDescriptor::createRegister(Reg, Mask);
2330 }
2331 
2332 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2333                                              const TargetRegisterClass *RC,
2334                                              unsigned NumArgRegs) {
2335   ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2336   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2337   if (RegIdx == ArgSGPRs.size())
2338     report_fatal_error("ran out of SGPRs for arguments");
2339 
2340   unsigned Reg = ArgSGPRs[RegIdx];
2341   Reg = CCInfo.AllocateReg(Reg);
2342   assert(Reg != AMDGPU::NoRegister);
2343 
2344   MachineFunction &MF = CCInfo.getMachineFunction();
2345   MF.addLiveIn(Reg, RC);
2346   return ArgDescriptor::createRegister(Reg);
2347 }
2348 
2349 // If this has a fixed position, we still should allocate the register in the
2350 // CCInfo state. Technically we could get away with this for values passed
2351 // outside of the normal argument range.
2352 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2353                                        const TargetRegisterClass *RC,
2354                                        MCRegister Reg) {
2355   Reg = CCInfo.AllocateReg(Reg);
2356   assert(Reg != AMDGPU::NoRegister);
2357   MachineFunction &MF = CCInfo.getMachineFunction();
2358   MF.addLiveIn(Reg, RC);
2359 }
2360 
2361 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2362   if (Arg) {
2363     allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2364                                Arg.getRegister());
2365   } else
2366     Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2367 }
2368 
2369 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2370   if (Arg) {
2371     allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2372                                Arg.getRegister());
2373   } else
2374     Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2375 }
2376 
2377 /// Allocate implicit function VGPR arguments at the end of allocated user
2378 /// arguments.
2379 void SITargetLowering::allocateSpecialInputVGPRs(
2380     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2381     SIMachineFunctionInfo &Info) const {
2382   const unsigned Mask = 0x3ff;
2383   ArgDescriptor Arg;
2384 
2385   if (Info.hasWorkItemIDX()) {
2386     Arg = allocateVGPR32Input(CCInfo, Mask);
2387     Info.setWorkItemIDX(Arg);
2388   }
2389 
2390   if (Info.hasWorkItemIDY()) {
2391     Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2392     Info.setWorkItemIDY(Arg);
2393   }
2394 
2395   if (Info.hasWorkItemIDZ())
2396     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2397 }
2398 
2399 /// Allocate implicit function VGPR arguments in fixed registers.
2400 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2401     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2402     SIMachineFunctionInfo &Info) const {
2403   Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2404   if (!Reg)
2405     report_fatal_error("failed to allocated VGPR for implicit arguments");
2406 
2407   const unsigned Mask = 0x3ff;
2408   Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2409   Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2410   Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2411 }
2412 
2413 void SITargetLowering::allocateSpecialInputSGPRs(
2414     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2415     SIMachineFunctionInfo &Info) const {
2416   auto &ArgInfo = Info.getArgInfo();
2417   const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2418 
2419   // TODO: Unify handling with private memory pointers.
2420   if (UserSGPRInfo.hasDispatchPtr())
2421     allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2422 
2423   if (UserSGPRInfo.hasQueuePtr())
2424     allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2425 
2426   // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2427   // constant offset from the kernarg segment.
2428   if (Info.hasImplicitArgPtr())
2429     allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2430 
2431   if (UserSGPRInfo.hasDispatchID())
2432     allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2433 
2434   // flat_scratch_init is not applicable for non-kernel functions.
2435 
2436   if (Info.hasWorkGroupIDX())
2437     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2438 
2439   if (Info.hasWorkGroupIDY())
2440     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2441 
2442   if (Info.hasWorkGroupIDZ())
2443     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2444 
2445   if (Info.hasLDSKernelId())
2446     allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2447 }
2448 
2449 // Allocate special inputs passed in user SGPRs.
2450 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2451                                             MachineFunction &MF,
2452                                             const SIRegisterInfo &TRI,
2453                                             SIMachineFunctionInfo &Info) const {
2454   const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2455   if (UserSGPRInfo.hasImplicitBufferPtr()) {
2456     Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2457     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2458     CCInfo.AllocateReg(ImplicitBufferPtrReg);
2459   }
2460 
2461   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2462   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2463     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2464     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2465     CCInfo.AllocateReg(PrivateSegmentBufferReg);
2466   }
2467 
2468   if (UserSGPRInfo.hasDispatchPtr()) {
2469     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2470     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2471     CCInfo.AllocateReg(DispatchPtrReg);
2472   }
2473 
2474   if (UserSGPRInfo.hasQueuePtr()) {
2475     Register QueuePtrReg = Info.addQueuePtr(TRI);
2476     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2477     CCInfo.AllocateReg(QueuePtrReg);
2478   }
2479 
2480   if (UserSGPRInfo.hasKernargSegmentPtr()) {
2481     MachineRegisterInfo &MRI = MF.getRegInfo();
2482     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2483     CCInfo.AllocateReg(InputPtrReg);
2484 
2485     Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2486     MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2487   }
2488 
2489   if (UserSGPRInfo.hasDispatchID()) {
2490     Register DispatchIDReg = Info.addDispatchID(TRI);
2491     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2492     CCInfo.AllocateReg(DispatchIDReg);
2493   }
2494 
2495   if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2496     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2497     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2498     CCInfo.AllocateReg(FlatScratchInitReg);
2499   }
2500 
2501   if (UserSGPRInfo.hasPrivateSegmentSize()) {
2502     Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2503     MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2504     CCInfo.AllocateReg(PrivateSegmentSizeReg);
2505   }
2506 
2507   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2508   // these from the dispatch pointer.
2509 }
2510 
2511 // Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
2512 // sequential starting from the first argument.
2513 void SITargetLowering::allocatePreloadKernArgSGPRs(
2514     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2515     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2516     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2517   Function &F = MF.getFunction();
2518   unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2519   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2520   bool InPreloadSequence = true;
2521   unsigned InIdx = 0;
2522   bool AlignedForImplictArgs = false;
2523   unsigned ImplicitArgOffset = 0;
2524   for (auto &Arg : F.args()) {
2525     if (!InPreloadSequence || !Arg.hasInRegAttr())
2526       break;
2527 
2528     unsigned ArgIdx = Arg.getArgNo();
2529     // Don't preload non-original args or parts not in the current preload
2530     // sequence.
2531     if (InIdx < Ins.size() &&
2532         (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2533       break;
2534 
2535     for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2536            Ins[InIdx].getOrigArgIndex() == ArgIdx;
2537          InIdx++) {
2538       assert(ArgLocs[ArgIdx].isMemLoc());
2539       auto &ArgLoc = ArgLocs[InIdx];
2540       const Align KernelArgBaseAlign = Align(16);
2541       unsigned ArgOffset = ArgLoc.getLocMemOffset();
2542       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2543       unsigned NumAllocSGPRs =
2544           alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2545 
2546       // Fix alignment for hidden arguments.
2547       if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2548         if (!AlignedForImplictArgs) {
2549           ImplicitArgOffset =
2550               alignTo(LastExplicitArgOffset,
2551                       Subtarget->getAlignmentForImplicitArgPtr()) -
2552               LastExplicitArgOffset;
2553           AlignedForImplictArgs = true;
2554         }
2555         ArgOffset += ImplicitArgOffset;
2556       }
2557 
2558       // Arg is preloaded into the previous SGPR.
2559       if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2560         assert(InIdx >= 1 && "No previous SGPR");
2561         Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2562             Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2563         continue;
2564       }
2565 
2566       unsigned Padding = ArgOffset - LastExplicitArgOffset;
2567       unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2568       // Check for free user SGPRs for preloading.
2569       if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2570         InPreloadSequence = false;
2571         break;
2572       }
2573 
2574       // Preload this argument.
2575       const TargetRegisterClass *RC =
2576           TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2577       SmallVectorImpl<MCRegister> *PreloadRegs =
2578           Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2579 
2580       if (PreloadRegs->size() > 1)
2581         RC = &AMDGPU::SGPR_32RegClass;
2582       for (auto &Reg : *PreloadRegs) {
2583         assert(Reg);
2584         MF.addLiveIn(Reg, RC);
2585         CCInfo.AllocateReg(Reg);
2586       }
2587 
2588       LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2589     }
2590   }
2591 }
2592 
2593 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2594                                            const SIRegisterInfo &TRI,
2595                                            SIMachineFunctionInfo &Info) const {
2596   // Always allocate this last since it is a synthetic preload.
2597   if (Info.hasLDSKernelId()) {
2598     Register Reg = Info.addLDSKernelId();
2599     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2600     CCInfo.AllocateReg(Reg);
2601   }
2602 }
2603 
2604 // Allocate special input registers that are initialized per-wave.
2605 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2606                                            SIMachineFunctionInfo &Info,
2607                                            CallingConv::ID CallConv,
2608                                            bool IsShader) const {
2609   bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2610   if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2611     // Note: user SGPRs are handled by the front-end for graphics shaders
2612     // Pad up the used user SGPRs with dead inputs.
2613 
2614     // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2615     // before enabling architected SGPRs for workgroup IDs.
2616     assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2617 
2618     unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2619     // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2620     // rely on it to reach 16 since if we end up having no stack usage, it will
2621     // not really be added.
2622     unsigned NumRequiredSystemSGPRs =
2623         Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2624         Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2625     for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2626       Register Reg = Info.addReservedUserSGPR();
2627       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2628       CCInfo.AllocateReg(Reg);
2629     }
2630   }
2631 
2632   if (!HasArchitectedSGPRs) {
2633     if (Info.hasWorkGroupIDX()) {
2634       Register Reg = Info.addWorkGroupIDX();
2635       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636       CCInfo.AllocateReg(Reg);
2637     }
2638 
2639     if (Info.hasWorkGroupIDY()) {
2640       Register Reg = Info.addWorkGroupIDY();
2641       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2642       CCInfo.AllocateReg(Reg);
2643     }
2644 
2645     if (Info.hasWorkGroupIDZ()) {
2646       Register Reg = Info.addWorkGroupIDZ();
2647       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2648       CCInfo.AllocateReg(Reg);
2649     }
2650   }
2651 
2652   if (Info.hasWorkGroupInfo()) {
2653     Register Reg = Info.addWorkGroupInfo();
2654     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2655     CCInfo.AllocateReg(Reg);
2656   }
2657 
2658   if (Info.hasPrivateSegmentWaveByteOffset()) {
2659     // Scratch wave offset passed in system SGPR.
2660     unsigned PrivateSegmentWaveByteOffsetReg;
2661 
2662     if (IsShader) {
2663       PrivateSegmentWaveByteOffsetReg =
2664           Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2665 
2666       // This is true if the scratch wave byte offset doesn't have a fixed
2667       // location.
2668       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2669         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2670         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2671       }
2672     } else
2673       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2674 
2675     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2676     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2677   }
2678 
2679   assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2680          Info.getNumPreloadedSGPRs() >= 16);
2681 }
2682 
2683 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2684                                      MachineFunction &MF,
2685                                      const SIRegisterInfo &TRI,
2686                                      SIMachineFunctionInfo &Info) {
2687   // Now that we've figured out where the scratch register inputs are, see if
2688   // should reserve the arguments and use them directly.
2689   MachineFrameInfo &MFI = MF.getFrameInfo();
2690   bool HasStackObjects = MFI.hasStackObjects();
2691   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2692 
2693   // Record that we know we have non-spill stack objects so we don't need to
2694   // check all stack objects later.
2695   if (HasStackObjects)
2696     Info.setHasNonSpillStackObjects(true);
2697 
2698   // Everything live out of a block is spilled with fast regalloc, so it's
2699   // almost certain that spilling will be required.
2700   if (TM.getOptLevel() == CodeGenOptLevel::None)
2701     HasStackObjects = true;
2702 
2703   // For now assume stack access is needed in any callee functions, so we need
2704   // the scratch registers to pass in.
2705   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2706 
2707   if (!ST.enableFlatScratch()) {
2708     if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2709       // If we have stack objects, we unquestionably need the private buffer
2710       // resource. For the Code Object V2 ABI, this will be the first 4 user
2711       // SGPR inputs. We can reserve those and use them directly.
2712 
2713       Register PrivateSegmentBufferReg =
2714           Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2715       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2716     } else {
2717       unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2718       // We tentatively reserve the last registers (skipping the last registers
2719       // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2720       // we'll replace these with the ones immediately after those which were
2721       // really allocated. In the prologue copies will be inserted from the
2722       // argument to these reserved registers.
2723 
2724       // Without HSA, relocations are used for the scratch pointer and the
2725       // buffer resource setup is always inserted in the prologue. Scratch wave
2726       // offset is still in an input SGPR.
2727       Info.setScratchRSrcReg(ReservedBufferReg);
2728     }
2729   }
2730 
2731   MachineRegisterInfo &MRI = MF.getRegInfo();
2732 
2733   // For entry functions we have to set up the stack pointer if we use it,
2734   // whereas non-entry functions get this "for free". This means there is no
2735   // intrinsic advantage to using S32 over S34 in cases where we do not have
2736   // calls but do need a frame pointer (i.e. if we are requested to have one
2737   // because frame pointer elimination is disabled). To keep things simple we
2738   // only ever use S32 as the call ABI stack pointer, and so using it does not
2739   // imply we need a separate frame pointer.
2740   //
2741   // Try to use s32 as the SP, but move it if it would interfere with input
2742   // arguments. This won't work with calls though.
2743   //
2744   // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2745   // registers.
2746   if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2747     Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2748   } else {
2749     assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2750 
2751     if (MFI.hasCalls())
2752       report_fatal_error("call in graphics shader with too many input SGPRs");
2753 
2754     for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2755       if (!MRI.isLiveIn(Reg)) {
2756         Info.setStackPtrOffsetReg(Reg);
2757         break;
2758       }
2759     }
2760 
2761     if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2762       report_fatal_error("failed to find register for SP");
2763   }
2764 
2765   // hasFP should be accurate for entry functions even before the frame is
2766   // finalized, because it does not rely on the known stack size, only
2767   // properties like whether variable sized objects are present.
2768   if (ST.getFrameLowering()->hasFP(MF)) {
2769     Info.setFrameOffsetReg(AMDGPU::SGPR33);
2770   }
2771 }
2772 
2773 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2774   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2775   return !Info->isEntryFunction();
2776 }
2777 
2778 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
2779 
2780 void SITargetLowering::insertCopiesSplitCSR(
2781     MachineBasicBlock *Entry,
2782     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2783   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2784 
2785   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2786   if (!IStart)
2787     return;
2788 
2789   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2790   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2791   MachineBasicBlock::iterator MBBI = Entry->begin();
2792   for (const MCPhysReg *I = IStart; *I; ++I) {
2793     const TargetRegisterClass *RC = nullptr;
2794     if (AMDGPU::SReg_64RegClass.contains(*I))
2795       RC = &AMDGPU::SGPR_64RegClass;
2796     else if (AMDGPU::SReg_32RegClass.contains(*I))
2797       RC = &AMDGPU::SGPR_32RegClass;
2798     else
2799       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2800 
2801     Register NewVR = MRI->createVirtualRegister(RC);
2802     // Create copy from CSR to a virtual register.
2803     Entry->addLiveIn(*I);
2804     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2805         .addReg(*I);
2806 
2807     // Insert the copy-back instructions right before the terminator.
2808     for (auto *Exit : Exits)
2809       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2810               TII->get(TargetOpcode::COPY), *I)
2811           .addReg(NewVR);
2812   }
2813 }
2814 
2815 SDValue SITargetLowering::LowerFormalArguments(
2816     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2817     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2818     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2819   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2820 
2821   MachineFunction &MF = DAG.getMachineFunction();
2822   const Function &Fn = MF.getFunction();
2823   FunctionType *FType = MF.getFunction().getFunctionType();
2824   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2825 
2826   if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2827     DiagnosticInfoUnsupported NoGraphicsHSA(
2828         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2829     DAG.getContext()->diagnose(NoGraphicsHSA);
2830     return DAG.getEntryNode();
2831   }
2832 
2833   SmallVector<ISD::InputArg, 16> Splits;
2834   SmallVector<CCValAssign, 16> ArgLocs;
2835   BitVector Skipped(Ins.size());
2836   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2837                  *DAG.getContext());
2838 
2839   bool IsGraphics = AMDGPU::isGraphics(CallConv);
2840   bool IsKernel = AMDGPU::isKernel(CallConv);
2841   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2842 
2843   if (IsGraphics) {
2844     const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2845     assert(!UserSGPRInfo.hasDispatchPtr() &&
2846            !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2847            !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2848            !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2849     (void)UserSGPRInfo;
2850     if (!Subtarget->enableFlatScratch())
2851       assert(!UserSGPRInfo.hasFlatScratchInit());
2852     if ((CallConv != CallingConv::AMDGPU_CS &&
2853          CallConv != CallingConv::AMDGPU_Gfx) ||
2854         !Subtarget->hasArchitectedSGPRs())
2855       assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2856              !Info->hasWorkGroupIDZ());
2857   }
2858 
2859   if (CallConv == CallingConv::AMDGPU_PS) {
2860     processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2861 
2862     // At least one interpolation mode must be enabled or else the GPU will
2863     // hang.
2864     //
2865     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2866     // set PSInputAddr, the user wants to enable some bits after the compilation
2867     // based on run-time states. Since we can't know what the final PSInputEna
2868     // will look like, so we shouldn't do anything here and the user should take
2869     // responsibility for the correct programming.
2870     //
2871     // Otherwise, the following restrictions apply:
2872     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2873     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2874     //   enabled too.
2875     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2876         ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2877       CCInfo.AllocateReg(AMDGPU::VGPR0);
2878       CCInfo.AllocateReg(AMDGPU::VGPR1);
2879       Info->markPSInputAllocated(0);
2880       Info->markPSInputEnabled(0);
2881     }
2882     if (Subtarget->isAmdPalOS()) {
2883       // For isAmdPalOS, the user does not enable some bits after compilation
2884       // based on run-time states; the register values being generated here are
2885       // the final ones set in hardware. Therefore we need to apply the
2886       // workaround to PSInputAddr and PSInputEnable together.  (The case where
2887       // a bit is set in PSInputAddr but not PSInputEnable is where the
2888       // frontend set up an input arg for a particular interpolation mode, but
2889       // nothing uses that input arg. Really we should have an earlier pass
2890       // that removes such an arg.)
2891       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2892       if ((PsInputBits & 0x7F) == 0 ||
2893           ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2894         Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2895     }
2896   } else if (IsKernel) {
2897     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2898   } else {
2899     Splits.append(Ins.begin(), Ins.end());
2900   }
2901 
2902   if (IsKernel)
2903     analyzeFormalArgumentsCompute(CCInfo, Ins);
2904 
2905   if (IsEntryFunc) {
2906     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2907     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2908     if (IsKernel && Subtarget->hasKernargPreload())
2909       allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2910 
2911     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2912   } else if (!IsGraphics) {
2913     // For the fixed ABI, pass workitem IDs in the last argument register.
2914     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2915 
2916     // FIXME: Sink this into allocateSpecialInputSGPRs
2917     if (!Subtarget->enableFlatScratch())
2918       CCInfo.AllocateReg(Info->getScratchRSrcReg());
2919 
2920     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2921   }
2922 
2923   if (!IsKernel) {
2924     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2925     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2926   }
2927 
2928   SmallVector<SDValue, 16> Chains;
2929 
2930   // FIXME: This is the minimum kernel argument alignment. We should improve
2931   // this to the maximum alignment of the arguments.
2932   //
2933   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2934   // kern arg offset.
2935   const Align KernelArgBaseAlign = Align(16);
2936 
2937   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2938     const ISD::InputArg &Arg = Ins[i];
2939     if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2940       InVals.push_back(DAG.getUNDEF(Arg.VT));
2941       continue;
2942     }
2943 
2944     CCValAssign &VA = ArgLocs[ArgIdx++];
2945     MVT VT = VA.getLocVT();
2946 
2947     if (IsEntryFunc && VA.isMemLoc()) {
2948       VT = Ins[i].VT;
2949       EVT MemVT = VA.getLocVT();
2950 
2951       const uint64_t Offset = VA.getLocMemOffset();
2952       Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2953 
2954       if (Arg.Flags.isByRef()) {
2955         SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2956 
2957         const GCNTargetMachine &TM =
2958             static_cast<const GCNTargetMachine &>(getTargetMachine());
2959         if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2960                                     Arg.Flags.getPointerAddrSpace())) {
2961           Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2962                                      Arg.Flags.getPointerAddrSpace());
2963         }
2964 
2965         InVals.push_back(Ptr);
2966         continue;
2967       }
2968 
2969       SDValue NewArg;
2970       if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2971         if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2972           // In this case the argument is packed into the previous preload SGPR.
2973           int64_t AlignDownOffset = alignDown(Offset, 4);
2974           int64_t OffsetDiff = Offset - AlignDownOffset;
2975           EVT IntVT = MemVT.changeTypeToInteger();
2976 
2977           const SIMachineFunctionInfo *Info =
2978               MF.getInfo<SIMachineFunctionInfo>();
2979           MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2980           Register Reg =
2981               Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2982 
2983           assert(Reg);
2984           Register VReg = MRI.getLiveInVirtReg(Reg);
2985           SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2986 
2987           SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2988           SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2989 
2990           SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2991           ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2992           NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2993                                   Ins[i].Flags.isSExt(), &Ins[i]);
2994 
2995           NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2996         } else {
2997           const SIMachineFunctionInfo *Info =
2998               MF.getInfo<SIMachineFunctionInfo>();
2999           MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3000           const SmallVectorImpl<MCRegister> &PreloadRegs =
3001               Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3002 
3003           SDValue Copy;
3004           if (PreloadRegs.size() == 1) {
3005             Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3006             const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3007             NewArg = DAG.getCopyFromReg(
3008                 Chain, DL, VReg,
3009                 EVT::getIntegerVT(*DAG.getContext(),
3010                                   TRI->getRegSizeInBits(*RC)));
3011 
3012           } else {
3013             // If the kernarg alignment does not match the alignment of the SGPR
3014             // tuple RC that can accommodate this argument, it will be built up
3015             // via copies from from the individual SGPRs that the argument was
3016             // preloaded to.
3017             SmallVector<SDValue, 4> Elts;
3018             for (auto Reg : PreloadRegs) {
3019               Register VReg = MRI.getLiveInVirtReg(Reg);
3020               Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3021               Elts.push_back(Copy);
3022             }
3023             NewArg =
3024                 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3025                                                     PreloadRegs.size()),
3026                                    DL, Elts);
3027           }
3028 
3029           // If the argument was preloaded to multiple consecutive 32-bit
3030           // registers because of misalignment between addressable SGPR tuples
3031           // and the argument size, we can still assume that because of kernarg
3032           // segment alignment restrictions that NewArg's size is the same as
3033           // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3034           // truncate since we cannot preload to less than a single SGPR and the
3035           // MemVT may be smaller.
3036           EVT MemVTInt =
3037               EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3038           if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3039             NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3040 
3041           NewArg = DAG.getBitcast(MemVT, NewArg);
3042           NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3043                                   Ins[i].Flags.isSExt(), &Ins[i]);
3044           NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3045         }
3046       } else {
3047         // Hidden arguments that are in the kernel signature must be preloaded
3048         // to user SGPRs. Print a diagnostic error if a hidden argument is in
3049         // the argument list and is not preloaded.
3050         if (Arg.isOrigArg()) {
3051           Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3052           if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3053             DiagnosticInfoUnsupported NonPreloadHiddenArg(
3054                 *OrigArg->getParent(),
3055                 "hidden argument in kernel signature was not preloaded",
3056                 DL.getDebugLoc());
3057             DAG.getContext()->diagnose(NonPreloadHiddenArg);
3058           }
3059         }
3060 
3061         NewArg =
3062             lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3063                                      Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3064       }
3065       Chains.push_back(NewArg.getValue(1));
3066 
3067       auto *ParamTy =
3068           dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3069       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3070           ParamTy &&
3071           (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3072            ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3073         // On SI local pointers are just offsets into LDS, so they are always
3074         // less than 16-bits.  On CI and newer they could potentially be
3075         // real pointers, so we can't guarantee their size.
3076         NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3077                              DAG.getValueType(MVT::i16));
3078       }
3079 
3080       InVals.push_back(NewArg);
3081       continue;
3082     }
3083     if (!IsEntryFunc && VA.isMemLoc()) {
3084       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3085       InVals.push_back(Val);
3086       if (!Arg.Flags.isByVal())
3087         Chains.push_back(Val.getValue(1));
3088       continue;
3089     }
3090 
3091     assert(VA.isRegLoc() && "Parameter must be in a register!");
3092 
3093     Register Reg = VA.getLocReg();
3094     const TargetRegisterClass *RC = nullptr;
3095     if (AMDGPU::VGPR_32RegClass.contains(Reg))
3096       RC = &AMDGPU::VGPR_32RegClass;
3097     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3098       RC = &AMDGPU::SGPR_32RegClass;
3099     else
3100       llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3101     EVT ValVT = VA.getValVT();
3102 
3103     Reg = MF.addLiveIn(Reg, RC);
3104     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3105 
3106     if (Arg.Flags.isSRet()) {
3107       // The return object should be reasonably addressable.
3108 
3109       // FIXME: This helps when the return is a real sret. If it is a
3110       // automatically inserted sret (i.e. CanLowerReturn returns false), an
3111       // extra copy is inserted in SelectionDAGBuilder which obscures this.
3112       unsigned NumBits =
3113           32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3114       Val = DAG.getNode(
3115           ISD::AssertZext, DL, VT, Val,
3116           DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3117     }
3118 
3119     // If this is an 8 or 16-bit value, it is really passed promoted
3120     // to 32 bits. Insert an assert[sz]ext to capture this, then
3121     // truncate to the right size.
3122     switch (VA.getLocInfo()) {
3123     case CCValAssign::Full:
3124       break;
3125     case CCValAssign::BCvt:
3126       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3127       break;
3128     case CCValAssign::SExt:
3129       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3130       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3131       break;
3132     case CCValAssign::ZExt:
3133       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3134       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3135       break;
3136     case CCValAssign::AExt:
3137       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3138       break;
3139     default:
3140       llvm_unreachable("Unknown loc info!");
3141     }
3142 
3143     InVals.push_back(Val);
3144   }
3145 
3146   // Start adding system SGPRs.
3147   if (IsEntryFunc)
3148     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3149 
3150   // DAG.getPass() returns nullptr when using new pass manager.
3151   // TODO: Use DAG.getMFAM() to access analysis result.
3152   if (DAG.getPass()) {
3153     auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3154     ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3155   }
3156 
3157   unsigned StackArgSize = CCInfo.getStackSize();
3158   Info->setBytesInStackArgArea(StackArgSize);
3159 
3160   return Chains.empty() ? Chain
3161                         : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3162 }
3163 
3164 // TODO: If return values can't fit in registers, we should return as many as
3165 // possible in registers before passing on stack.
3166 bool SITargetLowering::CanLowerReturn(
3167     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3168     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3169     const Type *RetTy) const {
3170   // Replacing returns with sret/stack usage doesn't make sense for shaders.
3171   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3172   // for shaders. Vector types should be explicitly handled by CC.
3173   if (AMDGPU::isEntryFunctionCC(CallConv))
3174     return true;
3175 
3176   SmallVector<CCValAssign, 16> RVLocs;
3177   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3178   if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3179     return false;
3180 
3181   // We must use the stack if return would require unavailable registers.
3182   unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3183   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3184   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3185     if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3186       return false;
3187 
3188   return true;
3189 }
3190 
3191 SDValue
3192 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3193                               bool isVarArg,
3194                               const SmallVectorImpl<ISD::OutputArg> &Outs,
3195                               const SmallVectorImpl<SDValue> &OutVals,
3196                               const SDLoc &DL, SelectionDAG &DAG) const {
3197   MachineFunction &MF = DAG.getMachineFunction();
3198   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3199 
3200   if (AMDGPU::isKernel(CallConv)) {
3201     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3202                                              OutVals, DL, DAG);
3203   }
3204 
3205   bool IsShader = AMDGPU::isShader(CallConv);
3206 
3207   Info->setIfReturnsVoid(Outs.empty());
3208   bool IsWaveEnd = Info->returnsVoid() && IsShader;
3209 
3210   // CCValAssign - represent the assignment of the return value to a location.
3211   SmallVector<CCValAssign, 48> RVLocs;
3212   SmallVector<ISD::OutputArg, 48> Splits;
3213 
3214   // CCState - Info about the registers and stack slots.
3215   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3216                  *DAG.getContext());
3217 
3218   // Analyze outgoing return values.
3219   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3220 
3221   SDValue Glue;
3222   SmallVector<SDValue, 48> RetOps;
3223   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3224 
3225   // Copy the result values into the output registers.
3226   for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3227        ++I, ++RealRVLocIdx) {
3228     CCValAssign &VA = RVLocs[I];
3229     assert(VA.isRegLoc() && "Can only return in registers!");
3230     // TODO: Partially return in registers if return values don't fit.
3231     SDValue Arg = OutVals[RealRVLocIdx];
3232 
3233     // Copied from other backends.
3234     switch (VA.getLocInfo()) {
3235     case CCValAssign::Full:
3236       break;
3237     case CCValAssign::BCvt:
3238       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3239       break;
3240     case CCValAssign::SExt:
3241       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3242       break;
3243     case CCValAssign::ZExt:
3244       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3245       break;
3246     case CCValAssign::AExt:
3247       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3248       break;
3249     default:
3250       llvm_unreachable("Unknown loc info!");
3251     }
3252 
3253     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3254     Glue = Chain.getValue(1);
3255     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3256   }
3257 
3258   // FIXME: Does sret work properly?
3259   if (!Info->isEntryFunction()) {
3260     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3261     const MCPhysReg *I =
3262         TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3263     if (I) {
3264       for (; *I; ++I) {
3265         if (AMDGPU::SReg_64RegClass.contains(*I))
3266           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3267         else if (AMDGPU::SReg_32RegClass.contains(*I))
3268           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3269         else
3270           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3271       }
3272     }
3273   }
3274 
3275   // Update chain and glue.
3276   RetOps[0] = Chain;
3277   if (Glue.getNode())
3278     RetOps.push_back(Glue);
3279 
3280   unsigned Opc = AMDGPUISD::ENDPGM;
3281   if (!IsWaveEnd)
3282     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3283   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3284 }
3285 
3286 SDValue SITargetLowering::LowerCallResult(
3287     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3288     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3289     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3290     SDValue ThisVal) const {
3291   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3292 
3293   // Assign locations to each value returned by this call.
3294   SmallVector<CCValAssign, 16> RVLocs;
3295   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3296                  *DAG.getContext());
3297   CCInfo.AnalyzeCallResult(Ins, RetCC);
3298 
3299   // Copy all of the result registers out of their specified physreg.
3300   for (CCValAssign VA : RVLocs) {
3301     SDValue Val;
3302 
3303     if (VA.isRegLoc()) {
3304       Val =
3305           DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3306       Chain = Val.getValue(1);
3307       InGlue = Val.getValue(2);
3308     } else if (VA.isMemLoc()) {
3309       report_fatal_error("TODO: return values in memory");
3310     } else
3311       llvm_unreachable("unknown argument location type");
3312 
3313     switch (VA.getLocInfo()) {
3314     case CCValAssign::Full:
3315       break;
3316     case CCValAssign::BCvt:
3317       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3318       break;
3319     case CCValAssign::ZExt:
3320       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3321                         DAG.getValueType(VA.getValVT()));
3322       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3323       break;
3324     case CCValAssign::SExt:
3325       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3326                         DAG.getValueType(VA.getValVT()));
3327       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3328       break;
3329     case CCValAssign::AExt:
3330       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3331       break;
3332     default:
3333       llvm_unreachable("Unknown loc info!");
3334     }
3335 
3336     InVals.push_back(Val);
3337   }
3338 
3339   return Chain;
3340 }
3341 
3342 // Add code to pass special inputs required depending on used features separate
3343 // from the explicit user arguments present in the IR.
3344 void SITargetLowering::passSpecialInputs(
3345     CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3346     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3347     SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3348   // If we don't have a call site, this was a call inserted by
3349   // legalization. These can never use special inputs.
3350   if (!CLI.CB)
3351     return;
3352 
3353   SelectionDAG &DAG = CLI.DAG;
3354   const SDLoc &DL = CLI.DL;
3355   const Function &F = DAG.getMachineFunction().getFunction();
3356 
3357   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3358   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3359 
3360   const AMDGPUFunctionArgInfo *CalleeArgInfo =
3361       &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3362   if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3363     // DAG.getPass() returns nullptr when using new pass manager.
3364     // TODO: Use DAG.getMFAM() to access analysis result.
3365     if (DAG.getPass()) {
3366       auto &ArgUsageInfo =
3367           DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3368       CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3369     }
3370   }
3371 
3372   // TODO: Unify with private memory register handling. This is complicated by
3373   // the fact that at least in kernels, the input argument is not necessarily
3374   // in the same location as the input.
3375   // clang-format off
3376   static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3377                               StringLiteral> ImplicitAttrs[] = {
3378      {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3379      {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3380      {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3381      {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3382      {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3383      {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3384      {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3385      {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3386    };
3387   // clang-format on
3388 
3389   for (auto [InputID, Attr] : ImplicitAttrs) {
3390     // If the callee does not use the attribute value, skip copying the value.
3391     if (CLI.CB->hasFnAttr(Attr))
3392       continue;
3393 
3394     const auto [OutgoingArg, ArgRC, ArgTy] =
3395         CalleeArgInfo->getPreloadedValue(InputID);
3396     if (!OutgoingArg)
3397       continue;
3398 
3399     const auto [IncomingArg, IncomingArgRC, Ty] =
3400         CallerArgInfo.getPreloadedValue(InputID);
3401     assert(IncomingArgRC == ArgRC);
3402 
3403     // All special arguments are ints for now.
3404     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3405     SDValue InputReg;
3406 
3407     if (IncomingArg) {
3408       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3409     } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3410       // The implicit arg ptr is special because it doesn't have a corresponding
3411       // input for kernels, and is computed from the kernarg segment pointer.
3412       InputReg = getImplicitArgPtr(DAG, DL);
3413     } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3414       std::optional<uint32_t> Id =
3415           AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3416       if (Id.has_value()) {
3417         InputReg = DAG.getConstant(*Id, DL, ArgVT);
3418       } else {
3419         InputReg = DAG.getUNDEF(ArgVT);
3420       }
3421     } else {
3422       // We may have proven the input wasn't needed, although the ABI is
3423       // requiring it. We just need to allocate the register appropriately.
3424       InputReg = DAG.getUNDEF(ArgVT);
3425     }
3426 
3427     if (OutgoingArg->isRegister()) {
3428       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3429       if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3430         report_fatal_error("failed to allocate implicit input argument");
3431     } else {
3432       unsigned SpecialArgOffset =
3433           CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3434       SDValue ArgStore =
3435           storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3436       MemOpChains.push_back(ArgStore);
3437     }
3438   }
3439 
3440   // Pack workitem IDs into a single register or pass it as is if already
3441   // packed.
3442 
3443   auto [OutgoingArg, ArgRC, Ty] =
3444       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3445   if (!OutgoingArg)
3446     std::tie(OutgoingArg, ArgRC, Ty) =
3447         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3448   if (!OutgoingArg)
3449     std::tie(OutgoingArg, ArgRC, Ty) =
3450         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3451   if (!OutgoingArg)
3452     return;
3453 
3454   const ArgDescriptor *IncomingArgX = std::get<0>(
3455       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3456   const ArgDescriptor *IncomingArgY = std::get<0>(
3457       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3458   const ArgDescriptor *IncomingArgZ = std::get<0>(
3459       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3460 
3461   SDValue InputReg;
3462   SDLoc SL;
3463 
3464   const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3465   const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3466   const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3467 
3468   // If incoming ids are not packed we need to pack them.
3469   if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3470       NeedWorkItemIDX) {
3471     if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3472       InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3473     } else {
3474       InputReg = DAG.getConstant(0, DL, MVT::i32);
3475     }
3476   }
3477 
3478   if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3479       NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3480     SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3481     Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3482                     DAG.getShiftAmountConstant(10, MVT::i32, SL));
3483     InputReg = InputReg.getNode()
3484                    ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3485                    : Y;
3486   }
3487 
3488   if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3489       NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3490     SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3491     Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3492                     DAG.getShiftAmountConstant(20, MVT::i32, SL));
3493     InputReg = InputReg.getNode()
3494                    ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3495                    : Z;
3496   }
3497 
3498   if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3499     if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3500       // We're in a situation where the outgoing function requires the workitem
3501       // ID, but the calling function does not have it (e.g a graphics function
3502       // calling a C calling convention function). This is illegal, but we need
3503       // to produce something.
3504       InputReg = DAG.getUNDEF(MVT::i32);
3505     } else {
3506       // Workitem ids are already packed, any of present incoming arguments
3507       // will carry all required fields.
3508       ArgDescriptor IncomingArg =
3509           ArgDescriptor::createArg(IncomingArgX   ? *IncomingArgX
3510                                    : IncomingArgY ? *IncomingArgY
3511                                                   : *IncomingArgZ,
3512                                    ~0u);
3513       InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3514     }
3515   }
3516 
3517   if (OutgoingArg->isRegister()) {
3518     if (InputReg)
3519       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3520 
3521     CCInfo.AllocateReg(OutgoingArg->getRegister());
3522   } else {
3523     unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3524     if (InputReg) {
3525       SDValue ArgStore =
3526           storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3527       MemOpChains.push_back(ArgStore);
3528     }
3529   }
3530 }
3531 
3532 static bool canGuaranteeTCO(CallingConv::ID CC) {
3533   return CC == CallingConv::Fast;
3534 }
3535 
3536 /// Return true if we might ever do TCO for calls with this calling convention.
3537 static bool mayTailCallThisCC(CallingConv::ID CC) {
3538   switch (CC) {
3539   case CallingConv::C:
3540   case CallingConv::AMDGPU_Gfx:
3541     return true;
3542   default:
3543     return canGuaranteeTCO(CC);
3544   }
3545 }
3546 
3547 bool SITargetLowering::isEligibleForTailCallOptimization(
3548     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3549     const SmallVectorImpl<ISD::OutputArg> &Outs,
3550     const SmallVectorImpl<SDValue> &OutVals,
3551     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3552   if (AMDGPU::isChainCC(CalleeCC))
3553     return true;
3554 
3555   if (!mayTailCallThisCC(CalleeCC))
3556     return false;
3557 
3558   // For a divergent call target, we need to do a waterfall loop over the
3559   // possible callees which precludes us from using a simple jump.
3560   if (Callee->isDivergent())
3561     return false;
3562 
3563   MachineFunction &MF = DAG.getMachineFunction();
3564   const Function &CallerF = MF.getFunction();
3565   CallingConv::ID CallerCC = CallerF.getCallingConv();
3566   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3567   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3568 
3569   // Kernels aren't callable, and don't have a live in return address so it
3570   // doesn't make sense to do a tail call with entry functions.
3571   if (!CallerPreserved)
3572     return false;
3573 
3574   bool CCMatch = CallerCC == CalleeCC;
3575 
3576   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3577     if (canGuaranteeTCO(CalleeCC) && CCMatch)
3578       return true;
3579     return false;
3580   }
3581 
3582   // TODO: Can we handle var args?
3583   if (IsVarArg)
3584     return false;
3585 
3586   for (const Argument &Arg : CallerF.args()) {
3587     if (Arg.hasByValAttr())
3588       return false;
3589   }
3590 
3591   LLVMContext &Ctx = *DAG.getContext();
3592 
3593   // Check that the call results are passed in the same way.
3594   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3595                                   CCAssignFnForCall(CalleeCC, IsVarArg),
3596                                   CCAssignFnForCall(CallerCC, IsVarArg)))
3597     return false;
3598 
3599   // The callee has to preserve all registers the caller needs to preserve.
3600   if (!CCMatch) {
3601     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3602     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3603       return false;
3604   }
3605 
3606   // Nothing more to check if the callee is taking no arguments.
3607   if (Outs.empty())
3608     return true;
3609 
3610   SmallVector<CCValAssign, 16> ArgLocs;
3611   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3612 
3613   // FIXME: We are not allocating special input registers, so we will be
3614   // deciding based on incorrect register assignments.
3615   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3616 
3617   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3618   // If the stack arguments for this call do not fit into our own save area then
3619   // the call cannot be made tail.
3620   // TODO: Is this really necessary?
3621   if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3622     return false;
3623 
3624   for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3625     // FIXME: What about inreg arguments that end up passed in memory?
3626     if (!CCVA.isRegLoc())
3627       continue;
3628 
3629     // If we are passing an argument in an SGPR, and the value is divergent,
3630     // this call requires a waterfall loop.
3631     if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3632       LLVM_DEBUG(
3633           dbgs() << "Cannot tail call due to divergent outgoing argument in "
3634                  << printReg(CCVA.getLocReg(), TRI) << '\n');
3635       return false;
3636     }
3637   }
3638 
3639   const MachineRegisterInfo &MRI = MF.getRegInfo();
3640   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3641 }
3642 
3643 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3644   if (!CI->isTailCall())
3645     return false;
3646 
3647   const Function *ParentFn = CI->getParent()->getParent();
3648   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3649     return false;
3650   return true;
3651 }
3652 
3653 // The wave scratch offset register is used as the global base pointer.
3654 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3655                                     SmallVectorImpl<SDValue> &InVals) const {
3656   CallingConv::ID CallConv = CLI.CallConv;
3657   bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3658 
3659   SelectionDAG &DAG = CLI.DAG;
3660 
3661   TargetLowering::ArgListEntry RequestedExec;
3662   if (IsChainCallConv) {
3663     // The last argument should be the value that we need to put in EXEC.
3664     // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3665     // don't treat it like the rest of the arguments.
3666     RequestedExec = CLI.Args.back();
3667     assert(RequestedExec.Node && "No node for EXEC");
3668 
3669     if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3670       return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3671 
3672     assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3673     CLI.Outs.pop_back();
3674     CLI.OutVals.pop_back();
3675 
3676     if (RequestedExec.Ty->isIntegerTy(64)) {
3677       assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3678       CLI.Outs.pop_back();
3679       CLI.OutVals.pop_back();
3680     }
3681 
3682     assert(CLI.Outs.back().OrigArgIndex != 2 &&
3683            "Haven't popped all the pieces of the EXEC mask");
3684   }
3685 
3686   const SDLoc &DL = CLI.DL;
3687   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3688   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3689   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3690   SDValue Chain = CLI.Chain;
3691   SDValue Callee = CLI.Callee;
3692   bool &IsTailCall = CLI.IsTailCall;
3693   bool IsVarArg = CLI.IsVarArg;
3694   bool IsSibCall = false;
3695   MachineFunction &MF = DAG.getMachineFunction();
3696 
3697   if (Callee.isUndef() || isNullConstant(Callee)) {
3698     if (!CLI.IsTailCall) {
3699       for (ISD::InputArg &Arg : CLI.Ins)
3700         InVals.push_back(DAG.getUNDEF(Arg.VT));
3701     }
3702 
3703     return Chain;
3704   }
3705 
3706   if (IsVarArg) {
3707     return lowerUnhandledCall(CLI, InVals,
3708                               "unsupported call to variadic function ");
3709   }
3710 
3711   if (!CLI.CB)
3712     report_fatal_error("unsupported libcall legalization");
3713 
3714   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3715     return lowerUnhandledCall(CLI, InVals,
3716                               "unsupported required tail call to function ");
3717   }
3718 
3719   if (IsTailCall) {
3720     IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3721                                                    Outs, OutVals, Ins, DAG);
3722     if (!IsTailCall &&
3723         ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3724       report_fatal_error("failed to perform tail call elimination on a call "
3725                          "site marked musttail or on llvm.amdgcn.cs.chain");
3726     }
3727 
3728     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3729 
3730     // A sibling call is one where we're under the usual C ABI and not planning
3731     // to change that but can still do a tail call:
3732     if (!TailCallOpt && IsTailCall)
3733       IsSibCall = true;
3734 
3735     if (IsTailCall)
3736       ++NumTailCalls;
3737   }
3738 
3739   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3740   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3741   SmallVector<SDValue, 8> MemOpChains;
3742 
3743   // Analyze operands of the call, assigning locations to each operand.
3744   SmallVector<CCValAssign, 16> ArgLocs;
3745   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3746   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3747 
3748   if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3749     // With a fixed ABI, allocate fixed registers before user arguments.
3750     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3751   }
3752 
3753   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3754 
3755   // Get a count of how many bytes are to be pushed on the stack.
3756   unsigned NumBytes = CCInfo.getStackSize();
3757 
3758   if (IsSibCall) {
3759     // Since we're not changing the ABI to make this a tail call, the memory
3760     // operands are already available in the caller's incoming argument space.
3761     NumBytes = 0;
3762   }
3763 
3764   // FPDiff is the byte offset of the call's argument area from the callee's.
3765   // Stores to callee stack arguments will be placed in FixedStackSlots offset
3766   // by this amount for a tail call. In a sibling call it must be 0 because the
3767   // caller will deallocate the entire stack and the callee still expects its
3768   // arguments to begin at SP+0. Completely unused for non-tail calls.
3769   int32_t FPDiff = 0;
3770   MachineFrameInfo &MFI = MF.getFrameInfo();
3771   auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3772 
3773   // Adjust the stack pointer for the new arguments...
3774   // These operations are automatically eliminated by the prolog/epilog pass
3775   if (!IsSibCall)
3776     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3777 
3778   if (!IsSibCall || IsChainCallConv) {
3779     if (!Subtarget->enableFlatScratch()) {
3780       SmallVector<SDValue, 4> CopyFromChains;
3781 
3782       // In the HSA case, this should be an identity copy.
3783       SDValue ScratchRSrcReg =
3784           DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3785       RegsToPass.emplace_back(IsChainCallConv
3786                                   ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3787                                   : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3788                               ScratchRSrcReg);
3789       CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3790       Chain = DAG.getTokenFactor(DL, CopyFromChains);
3791     }
3792   }
3793 
3794   const unsigned NumSpecialInputs = RegsToPass.size();
3795 
3796   MVT PtrVT = MVT::i32;
3797 
3798   // Walk the register/memloc assignments, inserting copies/loads.
3799   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3800     CCValAssign &VA = ArgLocs[i];
3801     SDValue Arg = OutVals[i];
3802 
3803     // Promote the value if needed.
3804     switch (VA.getLocInfo()) {
3805     case CCValAssign::Full:
3806       break;
3807     case CCValAssign::BCvt:
3808       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3809       break;
3810     case CCValAssign::ZExt:
3811       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3812       break;
3813     case CCValAssign::SExt:
3814       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3815       break;
3816     case CCValAssign::AExt:
3817       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3818       break;
3819     case CCValAssign::FPExt:
3820       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3821       break;
3822     default:
3823       llvm_unreachable("Unknown loc info!");
3824     }
3825 
3826     if (VA.isRegLoc()) {
3827       RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3828     } else {
3829       assert(VA.isMemLoc());
3830 
3831       SDValue DstAddr;
3832       MachinePointerInfo DstInfo;
3833 
3834       unsigned LocMemOffset = VA.getLocMemOffset();
3835       int32_t Offset = LocMemOffset;
3836 
3837       SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3838       MaybeAlign Alignment;
3839 
3840       if (IsTailCall) {
3841         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3842         unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3843                                           : VA.getValVT().getStoreSize();
3844 
3845         // FIXME: We can have better than the minimum byval required alignment.
3846         Alignment =
3847             Flags.isByVal()
3848                 ? Flags.getNonZeroByValAlign()
3849                 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3850 
3851         Offset = Offset + FPDiff;
3852         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3853 
3854         DstAddr = DAG.getFrameIndex(FI, PtrVT);
3855         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3856 
3857         // Make sure any stack arguments overlapping with where we're storing
3858         // are loaded before this eventual operation. Otherwise they'll be
3859         // clobbered.
3860 
3861         // FIXME: Why is this really necessary? This seems to just result in a
3862         // lot of code to copy the stack and write them back to the same
3863         // locations, which are supposed to be immutable?
3864         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3865       } else {
3866         // Stores to the argument stack area are relative to the stack pointer.
3867         SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3868                                         MVT::i32);
3869         DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3870         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3871         Alignment =
3872             commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3873       }
3874 
3875       if (Outs[i].Flags.isByVal()) {
3876         SDValue SizeNode =
3877             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3878         SDValue Cpy =
3879             DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3880                           Outs[i].Flags.getNonZeroByValAlign(),
3881                           /*isVol = */ false, /*AlwaysInline = */ true,
3882                           /*CI=*/nullptr, std::nullopt, DstInfo,
3883                           MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3884 
3885         MemOpChains.push_back(Cpy);
3886       } else {
3887         SDValue Store =
3888             DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3889         MemOpChains.push_back(Store);
3890       }
3891     }
3892   }
3893 
3894   if (!MemOpChains.empty())
3895     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3896 
3897   SDValue ReadFirstLaneID =
3898       DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3899 
3900   SDValue TokenGlue;
3901   if (CLI.ConvergenceControlToken) {
3902     TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3903                             CLI.ConvergenceControlToken);
3904   }
3905 
3906   // Build a sequence of copy-to-reg nodes chained together with token chain
3907   // and flag operands which copy the outgoing args into the appropriate regs.
3908   SDValue InGlue;
3909 
3910   unsigned ArgIdx = 0;
3911   for (auto [Reg, Val] : RegsToPass) {
3912     if (ArgIdx++ >= NumSpecialInputs &&
3913         (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3914       // For chain calls, the inreg arguments are required to be
3915       // uniform. Speculatively Insert a readfirstlane in case we cannot prove
3916       // they are uniform.
3917       //
3918       // For other calls, if an inreg arguments is known to be uniform,
3919       // speculatively insert a readfirstlane in case it is in a VGPR.
3920       //
3921       // FIXME: We need to execute this in a waterfall loop if it is a divergent
3922       // value, so let that continue to produce invalid code.
3923 
3924       SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3925       if (TokenGlue)
3926         ReadfirstlaneArgs.push_back(TokenGlue);
3927       Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3928                         ReadfirstlaneArgs);
3929     }
3930 
3931     Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3932     InGlue = Chain.getValue(1);
3933   }
3934 
3935   // We don't usually want to end the call-sequence here because we would tidy
3936   // the frame up *after* the call, however in the ABI-changing tail-call case
3937   // we've carefully laid out the parameters so that when sp is reset they'll be
3938   // in the correct location.
3939   if (IsTailCall && !IsSibCall) {
3940     Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3941     InGlue = Chain.getValue(1);
3942   }
3943 
3944   std::vector<SDValue> Ops({Chain});
3945 
3946   // Add a redundant copy of the callee global which will not be legalized, as
3947   // we need direct access to the callee later.
3948   if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3949     const GlobalValue *GV = GSD->getGlobal();
3950     Ops.push_back(Callee);
3951     Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3952   } else {
3953     if (IsTailCall) {
3954       // isEligibleForTailCallOptimization considered whether the call target is
3955       // divergent, but we may still end up with a uniform value in a VGPR.
3956       // Insert a readfirstlane just in case.
3957       SDValue ReadFirstLaneID =
3958           DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3959 
3960       SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3961       if (TokenGlue)
3962         ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3963       Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3964                            ReadfirstlaneArgs);
3965     }
3966 
3967     Ops.push_back(Callee);
3968     Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3969   }
3970 
3971   if (IsTailCall) {
3972     // Each tail call may have to adjust the stack by a different amount, so
3973     // this information must travel along with the operation for eventual
3974     // consumption by emitEpilogue.
3975     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3976   }
3977 
3978   if (IsChainCallConv)
3979     Ops.push_back(RequestedExec.Node);
3980 
3981   // Add argument registers to the end of the list so that they are known live
3982   // into the call.
3983   for (auto &[Reg, Val] : RegsToPass)
3984     Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3985 
3986   // Add a register mask operand representing the call-preserved registers.
3987   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3988   assert(Mask && "Missing call preserved mask for calling convention");
3989   Ops.push_back(DAG.getRegisterMask(Mask));
3990 
3991   if (SDValue Token = CLI.ConvergenceControlToken) {
3992     SmallVector<SDValue, 2> GlueOps;
3993     GlueOps.push_back(Token);
3994     if (InGlue)
3995       GlueOps.push_back(InGlue);
3996 
3997     InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3998                                         MVT::Glue, GlueOps),
3999                      0);
4000   }
4001 
4002   if (InGlue)
4003     Ops.push_back(InGlue);
4004 
4005   // If we're doing a tall call, use a TC_RETURN here rather than an
4006   // actual call instruction.
4007   if (IsTailCall) {
4008     MFI.setHasTailCall();
4009     unsigned OPC = AMDGPUISD::TC_RETURN;
4010     switch (CallConv) {
4011     case CallingConv::AMDGPU_Gfx:
4012       OPC = AMDGPUISD::TC_RETURN_GFX;
4013       break;
4014     case CallingConv::AMDGPU_CS_Chain:
4015     case CallingConv::AMDGPU_CS_ChainPreserve:
4016       OPC = AMDGPUISD::TC_RETURN_CHAIN;
4017       break;
4018     }
4019 
4020     return DAG.getNode(OPC, DL, MVT::Other, Ops);
4021   }
4022 
4023   // Returns a chain and a flag for retval copy to use.
4024   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4025   Chain = Call.getValue(0);
4026   InGlue = Call.getValue(1);
4027 
4028   uint64_t CalleePopBytes = NumBytes;
4029   Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4030   if (!Ins.empty())
4031     InGlue = Chain.getValue(1);
4032 
4033   // Handle result values, copying them out of physregs into vregs that we
4034   // return.
4035   return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4036                          InVals, /*IsThisReturn=*/false, SDValue());
4037 }
4038 
4039 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4040 // except for:
4041 // 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4042 // 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
4043 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4044                                                   SelectionDAG &DAG) const {
4045   const MachineFunction &MF = DAG.getMachineFunction();
4046   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4047 
4048   SDLoc dl(Op);
4049   EVT VT = Op.getValueType();
4050   SDValue Chain = Op.getOperand(0);
4051   Register SPReg = Info->getStackPtrOffsetReg();
4052 
4053   // Chain the dynamic stack allocation so that it doesn't modify the stack
4054   // pointer when other instructions are using the stack.
4055   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4056 
4057   SDValue Size = Op.getOperand(1);
4058   SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4059   Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4060 
4061   const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4062   assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4063          "Stack grows upwards for AMDGPU");
4064 
4065   Chain = BaseAddr.getValue(1);
4066   Align StackAlign = TFL->getStackAlign();
4067   if (Alignment > StackAlign) {
4068     uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4069                                << Subtarget->getWavefrontSizeLog2();
4070     uint64_t StackAlignMask = ScaledAlignment - 1;
4071     SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4072                                   DAG.getConstant(StackAlignMask, dl, VT));
4073     BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4074                            DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4075   }
4076 
4077   assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4078   SDValue NewSP;
4079   if (isa<ConstantSDNode>(Size)) {
4080     // For constant sized alloca, scale alloca size by wave-size
4081     SDValue ScaledSize = DAG.getNode(
4082         ISD::SHL, dl, VT, Size,
4083         DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4084     NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4085   } else {
4086     // For dynamic sized alloca, perform wave-wide reduction to get max of
4087     // alloca size(divergent) and then scale it by wave-size
4088     SDValue WaveReduction =
4089         DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4090     Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4091                        Size, DAG.getConstant(0, dl, MVT::i32));
4092     SDValue ScaledSize = DAG.getNode(
4093         ISD::SHL, dl, VT, Size,
4094         DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4095     NewSP =
4096         DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4097     SDValue ReadFirstLaneID =
4098         DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4099     NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4100                         NewSP);
4101   }
4102 
4103   Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4104   SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4105 
4106   return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4107 }
4108 
4109 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4110   if (Op.getValueType() != MVT::i32)
4111     return Op; // Defer to cannot select error.
4112 
4113   Register SP = getStackPointerRegisterToSaveRestore();
4114   SDLoc SL(Op);
4115 
4116   SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4117 
4118   // Convert from wave uniform to swizzled vector address. This should protect
4119   // from any edge cases where the stacksave result isn't directly used with
4120   // stackrestore.
4121   SDValue VectorAddress =
4122       DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4123   return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4124 }
4125 
4126 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4127                                             SelectionDAG &DAG) const {
4128   SDLoc SL(Op);
4129   assert(Op.getValueType() == MVT::i32);
4130 
4131   uint32_t BothRoundHwReg =
4132       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4133   SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4134 
4135   SDValue IntrinID =
4136       DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4137   SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4138                                Op.getOperand(0), IntrinID, GetRoundBothImm);
4139 
4140   // There are two rounding modes, one for f32 and one for f64/f16. We only
4141   // report in the standard value range if both are the same.
4142   //
4143   // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4144   // ties away from zero is not supported, and the other values are rotated by
4145   // 1.
4146   //
4147   // If the two rounding modes are not the same, report a target defined value.
4148 
4149   // Mode register rounding mode fields:
4150   //
4151   // [1:0] Single-precision round mode.
4152   // [3:2] Double/Half-precision round mode.
4153   //
4154   // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4155   //
4156   //             Hardware   Spec
4157   // Toward-0        3        0
4158   // Nearest Even    0        1
4159   // +Inf            1        2
4160   // -Inf            2        3
4161   //  NearestAway0  N/A       4
4162   //
4163   // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4164   // table we can index by the raw hardware mode.
4165   //
4166   // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4167 
4168   SDValue BitTable =
4169       DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4170 
4171   SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4172   SDValue RoundModeTimesNumBits =
4173       DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4174 
4175   // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4176   // knew only one mode was demanded.
4177   SDValue TableValue =
4178       DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4179   SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4180 
4181   SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4182   SDValue TableEntry =
4183       DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4184 
4185   // There's a gap in the 4-bit encoded table and actual enum values, so offset
4186   // if it's an extended value.
4187   SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4188   SDValue IsStandardValue =
4189       DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4190   SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4191   SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4192                                TableEntry, EnumOffset);
4193 
4194   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4195 }
4196 
4197 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4198                                             SelectionDAG &DAG) const {
4199   SDLoc SL(Op);
4200 
4201   SDValue NewMode = Op.getOperand(1);
4202   assert(NewMode.getValueType() == MVT::i32);
4203 
4204   // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4205   // hardware MODE.fp_round values.
4206   if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4207     uint32_t ClampedVal = std::min(
4208         static_cast<uint32_t>(ConstMode->getZExtValue()),
4209         static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4210     NewMode = DAG.getConstant(
4211         AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4212   } else {
4213     // If we know the input can only be one of the supported standard modes in
4214     // the range 0-3, we can use a simplified mapping to hardware values.
4215     KnownBits KB = DAG.computeKnownBits(NewMode);
4216     const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4217     // The supported standard values are 0-3. The extended values start at 8. We
4218     // need to offset by 4 if the value is in the extended range.
4219 
4220     if (UseReducedTable) {
4221       // Truncate to the low 32-bits.
4222       SDValue BitTable = DAG.getConstant(
4223           AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4224 
4225       SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4226       SDValue RoundModeTimesNumBits =
4227           DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4228 
4229       NewMode =
4230           DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4231 
4232       // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4233       // the table extracted bits into inline immediates.
4234     } else {
4235       // table_index = umin(value, value - 4)
4236       // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4237       SDValue BitTable =
4238           DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4239 
4240       SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4241       SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4242       SDValue IndexVal =
4243           DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4244 
4245       SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4246       SDValue RoundModeTimesNumBits =
4247           DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4248 
4249       SDValue TableValue =
4250           DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4251       SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4252 
4253       // No need to mask out the high bits since the setreg will ignore them
4254       // anyway.
4255       NewMode = TruncTable;
4256     }
4257 
4258     // Insert a readfirstlane in case the value is a VGPR. We could do this
4259     // earlier and keep more operations scalar, but that interferes with
4260     // combining the source.
4261     SDValue ReadFirstLaneID =
4262         DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4263     NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4264                           ReadFirstLaneID, NewMode);
4265   }
4266 
4267   // N.B. The setreg will be later folded into s_round_mode on supported
4268   // targets.
4269   SDValue IntrinID =
4270       DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4271   uint32_t BothRoundHwReg =
4272       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4273   SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4274 
4275   SDValue SetReg =
4276       DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4277                   IntrinID, RoundBothImm, NewMode);
4278 
4279   return SetReg;
4280 }
4281 
4282 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4283   if (Op->isDivergent())
4284     return SDValue();
4285 
4286   switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4287   case AMDGPUAS::FLAT_ADDRESS:
4288   case AMDGPUAS::GLOBAL_ADDRESS:
4289   case AMDGPUAS::CONSTANT_ADDRESS:
4290   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4291     break;
4292   default:
4293     return SDValue();
4294   }
4295 
4296   return Op;
4297 }
4298 
4299 // Work around DAG legality rules only based on the result type.
4300 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4301   bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4302   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4303   EVT SrcVT = Src.getValueType();
4304 
4305   if (SrcVT.getScalarType() != MVT::bf16)
4306     return Op;
4307 
4308   SDLoc SL(Op);
4309   SDValue BitCast =
4310       DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4311 
4312   EVT DstVT = Op.getValueType();
4313   if (IsStrict)
4314     llvm_unreachable("Need STRICT_BF16_TO_FP");
4315 
4316   return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4317 }
4318 
4319 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4320   SDLoc SL(Op);
4321   if (Op.getValueType() != MVT::i64)
4322     return Op;
4323 
4324   uint32_t ModeHwReg =
4325       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4326   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4327   uint32_t TrapHwReg =
4328       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4329   SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4330 
4331   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4332   SDValue IntrinID =
4333       DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4334   SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4335                                    Op.getOperand(0), IntrinID, ModeHwRegImm);
4336   SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4337                                    Op.getOperand(0), IntrinID, TrapHwRegImm);
4338   SDValue TokenReg =
4339       DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4340                   GetTrapReg.getValue(1));
4341 
4342   SDValue CvtPtr =
4343       DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4344   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4345 
4346   return DAG.getMergeValues({Result, TokenReg}, SL);
4347 }
4348 
4349 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4350   SDLoc SL(Op);
4351   if (Op.getOperand(1).getValueType() != MVT::i64)
4352     return Op;
4353 
4354   SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4355   SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4356                                    DAG.getConstant(0, SL, MVT::i32));
4357   SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4358                                    DAG.getConstant(1, SL, MVT::i32));
4359 
4360   SDValue ReadFirstLaneID =
4361       DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4362   NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4363                            ReadFirstLaneID, NewModeReg);
4364   NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4365                            ReadFirstLaneID, NewTrapReg);
4366 
4367   unsigned ModeHwReg =
4368       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4369   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4370   unsigned TrapHwReg =
4371       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4372   SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4373 
4374   SDValue IntrinID =
4375       DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4376   SDValue SetModeReg =
4377       DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4378                   IntrinID, ModeHwRegImm, NewModeReg);
4379   SDValue SetTrapReg =
4380       DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4381                   IntrinID, TrapHwRegImm, NewTrapReg);
4382   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4383 }
4384 
4385 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4386                                              const MachineFunction &MF) const {
4387   Register Reg = StringSwitch<Register>(RegName)
4388                      .Case("m0", AMDGPU::M0)
4389                      .Case("exec", AMDGPU::EXEC)
4390                      .Case("exec_lo", AMDGPU::EXEC_LO)
4391                      .Case("exec_hi", AMDGPU::EXEC_HI)
4392                      .Case("flat_scratch", AMDGPU::FLAT_SCR)
4393                      .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4394                      .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4395                      .Default(Register());
4396 
4397   if (Reg == AMDGPU::NoRegister) {
4398     report_fatal_error(
4399         Twine("invalid register name \"" + StringRef(RegName) + "\"."));
4400   }
4401 
4402   if (!Subtarget->hasFlatScrRegister() &&
4403       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4404     report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
4405                              "\" for subtarget."));
4406   }
4407 
4408   switch (Reg) {
4409   case AMDGPU::M0:
4410   case AMDGPU::EXEC_LO:
4411   case AMDGPU::EXEC_HI:
4412   case AMDGPU::FLAT_SCR_LO:
4413   case AMDGPU::FLAT_SCR_HI:
4414     if (VT.getSizeInBits() == 32)
4415       return Reg;
4416     break;
4417   case AMDGPU::EXEC:
4418   case AMDGPU::FLAT_SCR:
4419     if (VT.getSizeInBits() == 64)
4420       return Reg;
4421     break;
4422   default:
4423     llvm_unreachable("missing register type checking");
4424   }
4425 
4426   report_fatal_error(
4427       Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4428 }
4429 
4430 // If kill is not the last instruction, split the block so kill is always a
4431 // proper terminator.
4432 MachineBasicBlock *
4433 SITargetLowering::splitKillBlock(MachineInstr &MI,
4434                                  MachineBasicBlock *BB) const {
4435   MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4436   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4437   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4438   return SplitBB;
4439 }
4440 
4441 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4442 // \p MI will be the only instruction in the loop body block. Otherwise, it will
4443 // be the first instruction in the remainder block.
4444 //
4445 /// \returns { LoopBody, Remainder }
4446 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4447 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4448   MachineFunction *MF = MBB.getParent();
4449   MachineBasicBlock::iterator I(&MI);
4450 
4451   // To insert the loop we need to split the block. Move everything after this
4452   // point to a new block, and insert a new empty block between the two.
4453   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4454   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4455   MachineFunction::iterator MBBI(MBB);
4456   ++MBBI;
4457 
4458   MF->insert(MBBI, LoopBB);
4459   MF->insert(MBBI, RemainderBB);
4460 
4461   LoopBB->addSuccessor(LoopBB);
4462   LoopBB->addSuccessor(RemainderBB);
4463 
4464   // Move the rest of the block into a new block.
4465   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4466 
4467   if (InstInLoop) {
4468     auto Next = std::next(I);
4469 
4470     // Move instruction to loop body.
4471     LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4472 
4473     // Move the rest of the block.
4474     RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4475   } else {
4476     RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4477   }
4478 
4479   MBB.addSuccessor(LoopBB);
4480 
4481   return std::pair(LoopBB, RemainderBB);
4482 }
4483 
4484 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4485 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4486   MachineBasicBlock *MBB = MI.getParent();
4487   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4488   auto I = MI.getIterator();
4489   auto E = std::next(I);
4490 
4491   // clang-format off
4492   BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4493       .addImm(0);
4494   // clang-format on
4495 
4496   MIBundleBuilder Bundler(*MBB, I, E);
4497   finalizeBundle(*MBB, Bundler.begin());
4498 }
4499 
4500 MachineBasicBlock *
4501 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4502                                          MachineBasicBlock *BB) const {
4503   const DebugLoc &DL = MI.getDebugLoc();
4504 
4505   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4506 
4507   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4508 
4509   // Apparently kill flags are only valid if the def is in the same block?
4510   if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4511     Src->setIsKill(false);
4512 
4513   auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4514 
4515   MachineBasicBlock::iterator I = LoopBB->end();
4516 
4517   const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4518       AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4519 
4520   // Clear TRAP_STS.MEM_VIOL
4521   BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4522       .addImm(0)
4523       .addImm(EncodedReg);
4524 
4525   bundleInstWithWaitcnt(MI);
4526 
4527   Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4528 
4529   // Load and check TRAP_STS.MEM_VIOL
4530   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4531       .addImm(EncodedReg);
4532 
4533   // FIXME: Do we need to use an isel pseudo that may clobber scc?
4534   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4535       .addReg(Reg, RegState::Kill)
4536       .addImm(0);
4537   // clang-format off
4538   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4539       .addMBB(LoopBB);
4540   // clang-format on
4541 
4542   return RemainderBB;
4543 }
4544 
4545 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4546 // wavefront. If the value is uniform and just happens to be in a VGPR, this
4547 // will only do one iteration. In the worst case, this will loop 64 times.
4548 //
4549 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4550 static MachineBasicBlock::iterator
4551 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4552                        MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4553                        const DebugLoc &DL, const MachineOperand &Idx,
4554                        unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4555                        unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4556                        Register &SGPRIdxReg) {
4557 
4558   MachineFunction *MF = OrigBB.getParent();
4559   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4560   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4561   MachineBasicBlock::iterator I = LoopBB.begin();
4562 
4563   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4564   Register PhiExec = MRI.createVirtualRegister(BoolRC);
4565   Register NewExec = MRI.createVirtualRegister(BoolRC);
4566   Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4567   Register CondReg = MRI.createVirtualRegister(BoolRC);
4568 
4569   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4570       .addReg(InitReg)
4571       .addMBB(&OrigBB)
4572       .addReg(ResultReg)
4573       .addMBB(&LoopBB);
4574 
4575   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4576       .addReg(InitSaveExecReg)
4577       .addMBB(&OrigBB)
4578       .addReg(NewExec)
4579       .addMBB(&LoopBB);
4580 
4581   // Read the next variant <- also loop target.
4582   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4583       .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4584 
4585   // Compare the just read M0 value to all possible Idx values.
4586   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4587       .addReg(CurrentIdxReg)
4588       .addReg(Idx.getReg(), 0, Idx.getSubReg());
4589 
4590   // Update EXEC, save the original EXEC value to VCC.
4591   BuildMI(LoopBB, I, DL,
4592           TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4593                                  : AMDGPU::S_AND_SAVEEXEC_B64),
4594           NewExec)
4595       .addReg(CondReg, RegState::Kill);
4596 
4597   MRI.setSimpleHint(NewExec, CondReg);
4598 
4599   if (UseGPRIdxMode) {
4600     if (Offset == 0) {
4601       SGPRIdxReg = CurrentIdxReg;
4602     } else {
4603       SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4604       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4605           .addReg(CurrentIdxReg, RegState::Kill)
4606           .addImm(Offset);
4607     }
4608   } else {
4609     // Move index from VCC into M0
4610     if (Offset == 0) {
4611       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4612           .addReg(CurrentIdxReg, RegState::Kill);
4613     } else {
4614       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4615           .addReg(CurrentIdxReg, RegState::Kill)
4616           .addImm(Offset);
4617     }
4618   }
4619 
4620   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4621   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4622   MachineInstr *InsertPt =
4623       BuildMI(LoopBB, I, DL,
4624               TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4625                                      : AMDGPU::S_XOR_B64_term),
4626               Exec)
4627           .addReg(Exec)
4628           .addReg(NewExec);
4629 
4630   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4631   // s_cbranch_scc0?
4632 
4633   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4634   // clang-format off
4635   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4636       .addMBB(&LoopBB);
4637   // clang-format on
4638 
4639   return InsertPt->getIterator();
4640 }
4641 
4642 // This has slightly sub-optimal regalloc when the source vector is killed by
4643 // the read. The register allocator does not understand that the kill is
4644 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4645 // subregister from it, using 1 more VGPR than necessary. This was saved when
4646 // this was expanded after register allocation.
4647 static MachineBasicBlock::iterator
4648 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4649                unsigned InitResultReg, unsigned PhiReg, int Offset,
4650                bool UseGPRIdxMode, Register &SGPRIdxReg) {
4651   MachineFunction *MF = MBB.getParent();
4652   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4653   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4654   MachineRegisterInfo &MRI = MF->getRegInfo();
4655   const DebugLoc &DL = MI.getDebugLoc();
4656   MachineBasicBlock::iterator I(&MI);
4657 
4658   const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4659   Register DstReg = MI.getOperand(0).getReg();
4660   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4661   Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4662   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4663   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4664 
4665   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4666 
4667   // Save the EXEC mask
4668   // clang-format off
4669   BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4670       .addReg(Exec);
4671   // clang-format on
4672 
4673   auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4674 
4675   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4676 
4677   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4678                                       InitResultReg, DstReg, PhiReg, TmpExec,
4679                                       Offset, UseGPRIdxMode, SGPRIdxReg);
4680 
4681   MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4682   MachineFunction::iterator MBBI(LoopBB);
4683   ++MBBI;
4684   MF->insert(MBBI, LandingPad);
4685   LoopBB->removeSuccessor(RemainderBB);
4686   LandingPad->addSuccessor(RemainderBB);
4687   LoopBB->addSuccessor(LandingPad);
4688   MachineBasicBlock::iterator First = LandingPad->begin();
4689   // clang-format off
4690   BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4691       .addReg(SaveExec);
4692   // clang-format on
4693 
4694   return InsPt;
4695 }
4696 
4697 // Returns subreg index, offset
4698 static std::pair<unsigned, int>
4699 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4700                             const TargetRegisterClass *SuperRC, unsigned VecReg,
4701                             int Offset) {
4702   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4703 
4704   // Skip out of bounds offsets, or else we would end up using an undefined
4705   // register.
4706   if (Offset >= NumElts || Offset < 0)
4707     return std::pair(AMDGPU::sub0, Offset);
4708 
4709   return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4710 }
4711 
4712 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4713                                  MachineRegisterInfo &MRI, MachineInstr &MI,
4714                                  int Offset) {
4715   MachineBasicBlock *MBB = MI.getParent();
4716   const DebugLoc &DL = MI.getDebugLoc();
4717   MachineBasicBlock::iterator I(&MI);
4718 
4719   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4720 
4721   assert(Idx->getReg() != AMDGPU::NoRegister);
4722 
4723   if (Offset == 0) {
4724     // clang-format off
4725     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4726         .add(*Idx);
4727     // clang-format on
4728   } else {
4729     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4730         .add(*Idx)
4731         .addImm(Offset);
4732   }
4733 }
4734 
4735 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4736                                    MachineRegisterInfo &MRI, MachineInstr &MI,
4737                                    int Offset) {
4738   MachineBasicBlock *MBB = MI.getParent();
4739   const DebugLoc &DL = MI.getDebugLoc();
4740   MachineBasicBlock::iterator I(&MI);
4741 
4742   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4743 
4744   if (Offset == 0)
4745     return Idx->getReg();
4746 
4747   Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4748   BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4749       .add(*Idx)
4750       .addImm(Offset);
4751   return Tmp;
4752 }
4753 
4754 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4755                                           MachineBasicBlock &MBB,
4756                                           const GCNSubtarget &ST) {
4757   const SIInstrInfo *TII = ST.getInstrInfo();
4758   const SIRegisterInfo &TRI = TII->getRegisterInfo();
4759   MachineFunction *MF = MBB.getParent();
4760   MachineRegisterInfo &MRI = MF->getRegInfo();
4761 
4762   Register Dst = MI.getOperand(0).getReg();
4763   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4764   Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4765   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4766 
4767   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4768   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4769 
4770   unsigned SubReg;
4771   std::tie(SubReg, Offset) =
4772       computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4773 
4774   const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4775 
4776   // Check for a SGPR index.
4777   if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4778     MachineBasicBlock::iterator I(&MI);
4779     const DebugLoc &DL = MI.getDebugLoc();
4780 
4781     if (UseGPRIdxMode) {
4782       // TODO: Look at the uses to avoid the copy. This may require rescheduling
4783       // to avoid interfering with other uses, so probably requires a new
4784       // optimization pass.
4785       Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4786 
4787       const MCInstrDesc &GPRIDXDesc =
4788           TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4789       BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4790           .addReg(SrcReg)
4791           .addReg(Idx)
4792           .addImm(SubReg);
4793     } else {
4794       setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4795 
4796       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4797           .addReg(SrcReg, 0, SubReg)
4798           .addReg(SrcReg, RegState::Implicit);
4799     }
4800 
4801     MI.eraseFromParent();
4802 
4803     return &MBB;
4804   }
4805 
4806   // Control flow needs to be inserted if indexing with a VGPR.
4807   const DebugLoc &DL = MI.getDebugLoc();
4808   MachineBasicBlock::iterator I(&MI);
4809 
4810   Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4811   Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4812 
4813   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4814 
4815   Register SGPRIdxReg;
4816   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4817                               UseGPRIdxMode, SGPRIdxReg);
4818 
4819   MachineBasicBlock *LoopBB = InsPt->getParent();
4820 
4821   if (UseGPRIdxMode) {
4822     const MCInstrDesc &GPRIDXDesc =
4823         TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4824 
4825     BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4826         .addReg(SrcReg)
4827         .addReg(SGPRIdxReg)
4828         .addImm(SubReg);
4829   } else {
4830     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4831         .addReg(SrcReg, 0, SubReg)
4832         .addReg(SrcReg, RegState::Implicit);
4833   }
4834 
4835   MI.eraseFromParent();
4836 
4837   return LoopBB;
4838 }
4839 
4840 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4841                                           MachineBasicBlock &MBB,
4842                                           const GCNSubtarget &ST) {
4843   const SIInstrInfo *TII = ST.getInstrInfo();
4844   const SIRegisterInfo &TRI = TII->getRegisterInfo();
4845   MachineFunction *MF = MBB.getParent();
4846   MachineRegisterInfo &MRI = MF->getRegInfo();
4847 
4848   Register Dst = MI.getOperand(0).getReg();
4849   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4850   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4851   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4852   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4853   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4854   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4855 
4856   // This can be an immediate, but will be folded later.
4857   assert(Val->getReg());
4858 
4859   unsigned SubReg;
4860   std::tie(SubReg, Offset) =
4861       computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4862   const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4863 
4864   if (Idx->getReg() == AMDGPU::NoRegister) {
4865     MachineBasicBlock::iterator I(&MI);
4866     const DebugLoc &DL = MI.getDebugLoc();
4867 
4868     assert(Offset == 0);
4869 
4870     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4871         .add(*SrcVec)
4872         .add(*Val)
4873         .addImm(SubReg);
4874 
4875     MI.eraseFromParent();
4876     return &MBB;
4877   }
4878 
4879   // Check for a SGPR index.
4880   if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4881     MachineBasicBlock::iterator I(&MI);
4882     const DebugLoc &DL = MI.getDebugLoc();
4883 
4884     if (UseGPRIdxMode) {
4885       Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4886 
4887       const MCInstrDesc &GPRIDXDesc =
4888           TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4889       BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4890           .addReg(SrcVec->getReg())
4891           .add(*Val)
4892           .addReg(Idx)
4893           .addImm(SubReg);
4894     } else {
4895       setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4896 
4897       const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4898           TRI.getRegSizeInBits(*VecRC), 32, false);
4899       BuildMI(MBB, I, DL, MovRelDesc, Dst)
4900           .addReg(SrcVec->getReg())
4901           .add(*Val)
4902           .addImm(SubReg);
4903     }
4904     MI.eraseFromParent();
4905     return &MBB;
4906   }
4907 
4908   // Control flow needs to be inserted if indexing with a VGPR.
4909   if (Val->isReg())
4910     MRI.clearKillFlags(Val->getReg());
4911 
4912   const DebugLoc &DL = MI.getDebugLoc();
4913 
4914   Register PhiReg = MRI.createVirtualRegister(VecRC);
4915 
4916   Register SGPRIdxReg;
4917   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4918                               UseGPRIdxMode, SGPRIdxReg);
4919   MachineBasicBlock *LoopBB = InsPt->getParent();
4920 
4921   if (UseGPRIdxMode) {
4922     const MCInstrDesc &GPRIDXDesc =
4923         TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4924 
4925     BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4926         .addReg(PhiReg)
4927         .add(*Val)
4928         .addReg(SGPRIdxReg)
4929         .addImm(SubReg);
4930   } else {
4931     const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4932         TRI.getRegSizeInBits(*VecRC), 32, false);
4933     BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4934         .addReg(PhiReg)
4935         .add(*Val)
4936         .addImm(SubReg);
4937   }
4938 
4939   MI.eraseFromParent();
4940   return LoopBB;
4941 }
4942 
4943 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4944                                           MachineBasicBlock &BB,
4945                                           const GCNSubtarget &ST,
4946                                           unsigned Opc) {
4947   MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4948   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4949   const DebugLoc &DL = MI.getDebugLoc();
4950   const SIInstrInfo *TII = ST.getInstrInfo();
4951 
4952   // Reduction operations depend on whether the input operand is SGPR or VGPR.
4953   Register SrcReg = MI.getOperand(1).getReg();
4954   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4955   Register DstReg = MI.getOperand(0).getReg();
4956   MachineBasicBlock *RetBB = nullptr;
4957   if (isSGPR) {
4958     // These operations with a uniform value i.e. SGPR are idempotent.
4959     // Reduced value will be same as given sgpr.
4960     // clang-format off
4961     BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4962         .addReg(SrcReg);
4963     // clang-format on
4964     RetBB = &BB;
4965   } else {
4966     // TODO: Implement DPP Strategy and switch based on immediate strategy
4967     // operand. For now, for all the cases (default, Iterative and DPP we use
4968     // iterative approach by default.)
4969 
4970     // To reduce the VGPR using iterative approach, we need to iterate
4971     // over all the active lanes. Lowering consists of ComputeLoop,
4972     // which iterate over only active lanes. We use copy of EXEC register
4973     // as induction variable and every active lane modifies it using bitset0
4974     // so that we will get the next active lane for next iteration.
4975     MachineBasicBlock::iterator I = BB.end();
4976     Register SrcReg = MI.getOperand(1).getReg();
4977 
4978     // Create Control flow for loop
4979     // Split MI's Machine Basic block into For loop
4980     auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4981 
4982     // Create virtual registers required for lowering.
4983     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4984     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4985     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4986     Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4987 
4988     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4989     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4990     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4991 
4992     Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4993     Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4994 
4995     bool IsWave32 = ST.isWave32();
4996     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4997     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4998 
4999     // Create initail values of induction variable from Exec, Accumulator and
5000     // insert branch instr to newly created ComputeBlockk
5001     uint32_t InitalValue =
5002         (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5003     auto TmpSReg =
5004         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5005     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5006         .addImm(InitalValue);
5007     // clang-format off
5008     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5009         .addMBB(ComputeLoop);
5010     // clang-format on
5011 
5012     // Start constructing ComputeLoop
5013     I = ComputeLoop->end();
5014     auto Accumulator =
5015         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5016             .addReg(InitalValReg)
5017             .addMBB(&BB);
5018     auto ActiveBits =
5019         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5020             .addReg(TmpSReg->getOperand(0).getReg())
5021             .addMBB(&BB);
5022 
5023     // Perform the computations
5024     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5025     auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5026                    .addReg(ActiveBits->getOperand(0).getReg());
5027     auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5028                              TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5029                          .addReg(SrcReg)
5030                          .addReg(FF1->getOperand(0).getReg());
5031     auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5032                               .addReg(Accumulator->getOperand(0).getReg())
5033                               .addReg(LaneValue->getOperand(0).getReg());
5034 
5035     // Manipulate the iterator to get the next active lane
5036     unsigned BITSETOpc =
5037         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5038     auto NewActiveBits =
5039         BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5040             .addReg(FF1->getOperand(0).getReg())
5041             .addReg(ActiveBits->getOperand(0).getReg());
5042 
5043     // Add phi nodes
5044     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5045         .addMBB(ComputeLoop);
5046     ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5047         .addMBB(ComputeLoop);
5048 
5049     // Creating branching
5050     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5051     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5052         .addReg(NewActiveBits->getOperand(0).getReg())
5053         .addImm(0);
5054     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5055         .addMBB(ComputeLoop);
5056 
5057     RetBB = ComputeEnd;
5058   }
5059   MI.eraseFromParent();
5060   return RetBB;
5061 }
5062 
5063 MachineBasicBlock *
5064 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5065                                               MachineBasicBlock *BB) const {
5066 
5067   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5068   MachineFunction *MF = BB->getParent();
5069   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5070 
5071   switch (MI.getOpcode()) {
5072   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5073     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5074   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5075     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5076   case AMDGPU::S_UADDO_PSEUDO:
5077   case AMDGPU::S_USUBO_PSEUDO: {
5078     const DebugLoc &DL = MI.getDebugLoc();
5079     MachineOperand &Dest0 = MI.getOperand(0);
5080     MachineOperand &Dest1 = MI.getOperand(1);
5081     MachineOperand &Src0 = MI.getOperand(2);
5082     MachineOperand &Src1 = MI.getOperand(3);
5083 
5084     unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5085                        ? AMDGPU::S_ADD_I32
5086                        : AMDGPU::S_SUB_I32;
5087     // clang-format off
5088     BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5089         .add(Src0)
5090         .add(Src1);
5091     // clang-format on
5092 
5093     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5094         .addImm(1)
5095         .addImm(0);
5096 
5097     MI.eraseFromParent();
5098     return BB;
5099   }
5100   case AMDGPU::S_ADD_U64_PSEUDO:
5101   case AMDGPU::S_SUB_U64_PSEUDO: {
5102     // For targets older than GFX12, we emit a sequence of 32-bit operations.
5103     // For GFX12, we emit s_add_u64 and s_sub_u64.
5104     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5105     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5106     const DebugLoc &DL = MI.getDebugLoc();
5107     MachineOperand &Dest = MI.getOperand(0);
5108     MachineOperand &Src0 = MI.getOperand(1);
5109     MachineOperand &Src1 = MI.getOperand(2);
5110     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5111     if (Subtarget->hasScalarAddSub64()) {
5112       unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5113       // clang-format off
5114       BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5115           .add(Src0)
5116           .add(Src1);
5117       // clang-format on
5118     } else {
5119       const SIRegisterInfo *TRI = ST.getRegisterInfo();
5120       const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5121 
5122       Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5123       Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5124 
5125       MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5126           MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5127       MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5128           MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5129 
5130       MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5131           MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5132       MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5133           MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5134 
5135       unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5136       unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5137       BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5138           .add(Src0Sub0)
5139           .add(Src1Sub0);
5140       BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5141           .add(Src0Sub1)
5142           .add(Src1Sub1);
5143       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5144           .addReg(DestSub0)
5145           .addImm(AMDGPU::sub0)
5146           .addReg(DestSub1)
5147           .addImm(AMDGPU::sub1);
5148     }
5149     MI.eraseFromParent();
5150     return BB;
5151   }
5152   case AMDGPU::V_ADD_U64_PSEUDO:
5153   case AMDGPU::V_SUB_U64_PSEUDO: {
5154     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5155     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5156     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5157     const DebugLoc &DL = MI.getDebugLoc();
5158 
5159     bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5160 
5161     MachineOperand &Dest = MI.getOperand(0);
5162     MachineOperand &Src0 = MI.getOperand(1);
5163     MachineOperand &Src1 = MI.getOperand(2);
5164 
5165     if (IsAdd && ST.hasLshlAddB64()) {
5166       auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5167                          Dest.getReg())
5168                      .add(Src0)
5169                      .addImm(0)
5170                      .add(Src1);
5171       TII->legalizeOperands(*Add);
5172       MI.eraseFromParent();
5173       return BB;
5174     }
5175 
5176     const auto *CarryRC = TRI->getWaveMaskRegClass();
5177 
5178     Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5179     Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5180 
5181     Register CarryReg = MRI.createVirtualRegister(CarryRC);
5182     Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5183 
5184     const TargetRegisterClass *Src0RC = Src0.isReg()
5185                                             ? MRI.getRegClass(Src0.getReg())
5186                                             : &AMDGPU::VReg_64RegClass;
5187     const TargetRegisterClass *Src1RC = Src1.isReg()
5188                                             ? MRI.getRegClass(Src1.getReg())
5189                                             : &AMDGPU::VReg_64RegClass;
5190 
5191     const TargetRegisterClass *Src0SubRC =
5192         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5193     const TargetRegisterClass *Src1SubRC =
5194         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5195 
5196     MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5197         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5198     MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5199         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5200 
5201     MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5202         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5203     MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5204         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5205 
5206     unsigned LoOpc =
5207         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5208     MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5209                                .addReg(CarryReg, RegState::Define)
5210                                .add(SrcReg0Sub0)
5211                                .add(SrcReg1Sub0)
5212                                .addImm(0); // clamp bit
5213 
5214     unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5215     MachineInstr *HiHalf =
5216         BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5217             .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5218             .add(SrcReg0Sub1)
5219             .add(SrcReg1Sub1)
5220             .addReg(CarryReg, RegState::Kill)
5221             .addImm(0); // clamp bit
5222 
5223     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5224         .addReg(DestSub0)
5225         .addImm(AMDGPU::sub0)
5226         .addReg(DestSub1)
5227         .addImm(AMDGPU::sub1);
5228     TII->legalizeOperands(*LoHalf);
5229     TII->legalizeOperands(*HiHalf);
5230     MI.eraseFromParent();
5231     return BB;
5232   }
5233   case AMDGPU::S_ADD_CO_PSEUDO:
5234   case AMDGPU::S_SUB_CO_PSEUDO: {
5235     // This pseudo has a chance to be selected
5236     // only from uniform add/subcarry node. All the VGPR operands
5237     // therefore assumed to be splat vectors.
5238     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5239     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5240     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5241     MachineBasicBlock::iterator MII = MI;
5242     const DebugLoc &DL = MI.getDebugLoc();
5243     MachineOperand &Dest = MI.getOperand(0);
5244     MachineOperand &CarryDest = MI.getOperand(1);
5245     MachineOperand &Src0 = MI.getOperand(2);
5246     MachineOperand &Src1 = MI.getOperand(3);
5247     MachineOperand &Src2 = MI.getOperand(4);
5248     unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5249                        ? AMDGPU::S_ADDC_U32
5250                        : AMDGPU::S_SUBB_U32;
5251     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5252       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5253       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5254           .addReg(Src0.getReg());
5255       Src0.setReg(RegOp0);
5256     }
5257     if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5258       Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5259       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5260           .addReg(Src1.getReg());
5261       Src1.setReg(RegOp1);
5262     }
5263     Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5264     if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5265       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5266           .addReg(Src2.getReg());
5267       Src2.setReg(RegOp2);
5268     }
5269 
5270     const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5271     unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5272     assert(WaveSize == 64 || WaveSize == 32);
5273 
5274     if (WaveSize == 64) {
5275       if (ST.hasScalarCompareEq64()) {
5276         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5277             .addReg(Src2.getReg())
5278             .addImm(0);
5279       } else {
5280         const TargetRegisterClass *SubRC =
5281             TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5282         MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5283             MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5284         MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5285             MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5286         Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5287 
5288         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5289             .add(Src2Sub0)
5290             .add(Src2Sub1);
5291 
5292         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5293             .addReg(Src2_32, RegState::Kill)
5294             .addImm(0);
5295       }
5296     } else {
5297       BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5298           .addReg(Src2.getReg())
5299           .addImm(0);
5300     }
5301 
5302     // clang-format off
5303     BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5304         .add(Src0)
5305         .add(Src1);
5306     // clang-format on
5307 
5308     unsigned SelOpc =
5309         (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5310 
5311     BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5312         .addImm(-1)
5313         .addImm(0);
5314 
5315     MI.eraseFromParent();
5316     return BB;
5317   }
5318   case AMDGPU::SI_INIT_M0: {
5319     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5320             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5321         .add(MI.getOperand(0));
5322     MI.eraseFromParent();
5323     return BB;
5324   }
5325   case AMDGPU::GET_GROUPSTATICSIZE: {
5326     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5327            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5328     DebugLoc DL = MI.getDebugLoc();
5329     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5330         .add(MI.getOperand(0))
5331         .addImm(MFI->getLDSSize());
5332     MI.eraseFromParent();
5333     return BB;
5334   }
5335   case AMDGPU::GET_SHADERCYCLESHILO: {
5336     assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5337     MachineRegisterInfo &MRI = MF->getRegInfo();
5338     const DebugLoc &DL = MI.getDebugLoc();
5339     // The algorithm is:
5340     //
5341     // hi1 = getreg(SHADER_CYCLES_HI)
5342     // lo1 = getreg(SHADER_CYCLES_LO)
5343     // hi2 = getreg(SHADER_CYCLES_HI)
5344     //
5345     // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5346     // Otherwise there was overflow and the result is hi2:0. In both cases the
5347     // result should represent the actual time at some point during the sequence
5348     // of three getregs.
5349     using namespace AMDGPU::Hwreg;
5350     Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5351     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5352         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5353     Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5354     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5355         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5356     Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5357     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5358         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5359     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5360         .addReg(RegHi1)
5361         .addReg(RegHi2);
5362     Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5364         .addReg(RegLo1)
5365         .addImm(0);
5366     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5367         .add(MI.getOperand(0))
5368         .addReg(RegLo)
5369         .addImm(AMDGPU::sub0)
5370         .addReg(RegHi2)
5371         .addImm(AMDGPU::sub1);
5372     MI.eraseFromParent();
5373     return BB;
5374   }
5375   case AMDGPU::SI_INDIRECT_SRC_V1:
5376   case AMDGPU::SI_INDIRECT_SRC_V2:
5377   case AMDGPU::SI_INDIRECT_SRC_V4:
5378   case AMDGPU::SI_INDIRECT_SRC_V8:
5379   case AMDGPU::SI_INDIRECT_SRC_V9:
5380   case AMDGPU::SI_INDIRECT_SRC_V10:
5381   case AMDGPU::SI_INDIRECT_SRC_V11:
5382   case AMDGPU::SI_INDIRECT_SRC_V12:
5383   case AMDGPU::SI_INDIRECT_SRC_V16:
5384   case AMDGPU::SI_INDIRECT_SRC_V32:
5385     return emitIndirectSrc(MI, *BB, *getSubtarget());
5386   case AMDGPU::SI_INDIRECT_DST_V1:
5387   case AMDGPU::SI_INDIRECT_DST_V2:
5388   case AMDGPU::SI_INDIRECT_DST_V4:
5389   case AMDGPU::SI_INDIRECT_DST_V8:
5390   case AMDGPU::SI_INDIRECT_DST_V9:
5391   case AMDGPU::SI_INDIRECT_DST_V10:
5392   case AMDGPU::SI_INDIRECT_DST_V11:
5393   case AMDGPU::SI_INDIRECT_DST_V12:
5394   case AMDGPU::SI_INDIRECT_DST_V16:
5395   case AMDGPU::SI_INDIRECT_DST_V32:
5396     return emitIndirectDst(MI, *BB, *getSubtarget());
5397   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5398   case AMDGPU::SI_KILL_I1_PSEUDO:
5399     return splitKillBlock(MI, BB);
5400   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5401     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5402     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5403     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5404 
5405     Register Dst = MI.getOperand(0).getReg();
5406     const MachineOperand &Src0 = MI.getOperand(1);
5407     const MachineOperand &Src1 = MI.getOperand(2);
5408     const DebugLoc &DL = MI.getDebugLoc();
5409     Register SrcCond = MI.getOperand(3).getReg();
5410 
5411     Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5412     Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5413     const auto *CondRC = TRI->getWaveMaskRegClass();
5414     Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5415 
5416     const TargetRegisterClass *Src0RC = Src0.isReg()
5417                                             ? MRI.getRegClass(Src0.getReg())
5418                                             : &AMDGPU::VReg_64RegClass;
5419     const TargetRegisterClass *Src1RC = Src1.isReg()
5420                                             ? MRI.getRegClass(Src1.getReg())
5421                                             : &AMDGPU::VReg_64RegClass;
5422 
5423     const TargetRegisterClass *Src0SubRC =
5424         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5425     const TargetRegisterClass *Src1SubRC =
5426         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5427 
5428     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5429         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5430     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5431         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5432 
5433     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5434         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5435     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5436         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5437 
5438     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5439     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5440         .addImm(0)
5441         .add(Src0Sub0)
5442         .addImm(0)
5443         .add(Src1Sub0)
5444         .addReg(SrcCondCopy);
5445     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5446         .addImm(0)
5447         .add(Src0Sub1)
5448         .addImm(0)
5449         .add(Src1Sub1)
5450         .addReg(SrcCondCopy);
5451 
5452     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5453         .addReg(DstLo)
5454         .addImm(AMDGPU::sub0)
5455         .addReg(DstHi)
5456         .addImm(AMDGPU::sub1);
5457     MI.eraseFromParent();
5458     return BB;
5459   }
5460   case AMDGPU::SI_BR_UNDEF: {
5461     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5462     const DebugLoc &DL = MI.getDebugLoc();
5463     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5464                            .add(MI.getOperand(0));
5465     Br->getOperand(1).setIsUndef(); // read undef SCC
5466     MI.eraseFromParent();
5467     return BB;
5468   }
5469   case AMDGPU::ADJCALLSTACKUP:
5470   case AMDGPU::ADJCALLSTACKDOWN: {
5471     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5472     MachineInstrBuilder MIB(*MF, &MI);
5473     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5474         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5475     return BB;
5476   }
5477   case AMDGPU::SI_CALL_ISEL: {
5478     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5479     const DebugLoc &DL = MI.getDebugLoc();
5480 
5481     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5482 
5483     MachineInstrBuilder MIB;
5484     MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5485 
5486     for (const MachineOperand &MO : MI.operands())
5487       MIB.add(MO);
5488 
5489     MIB.cloneMemRefs(MI);
5490     MI.eraseFromParent();
5491     return BB;
5492   }
5493   case AMDGPU::V_ADD_CO_U32_e32:
5494   case AMDGPU::V_SUB_CO_U32_e32:
5495   case AMDGPU::V_SUBREV_CO_U32_e32: {
5496     // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5497     const DebugLoc &DL = MI.getDebugLoc();
5498     unsigned Opc = MI.getOpcode();
5499 
5500     bool NeedClampOperand = false;
5501     if (TII->pseudoToMCOpcode(Opc) == -1) {
5502       Opc = AMDGPU::getVOPe64(Opc);
5503       NeedClampOperand = true;
5504     }
5505 
5506     auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5507     if (TII->isVOP3(*I)) {
5508       const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5509       const SIRegisterInfo *TRI = ST.getRegisterInfo();
5510       I.addReg(TRI->getVCC(), RegState::Define);
5511     }
5512     I.add(MI.getOperand(1)).add(MI.getOperand(2));
5513     if (NeedClampOperand)
5514       I.addImm(0); // clamp bit for e64 encoding
5515 
5516     TII->legalizeOperands(*I);
5517 
5518     MI.eraseFromParent();
5519     return BB;
5520   }
5521   case AMDGPU::V_ADDC_U32_e32:
5522   case AMDGPU::V_SUBB_U32_e32:
5523   case AMDGPU::V_SUBBREV_U32_e32:
5524     // These instructions have an implicit use of vcc which counts towards the
5525     // constant bus limit.
5526     TII->legalizeOperands(MI);
5527     return BB;
5528   case AMDGPU::DS_GWS_INIT:
5529   case AMDGPU::DS_GWS_SEMA_BR:
5530   case AMDGPU::DS_GWS_BARRIER:
5531     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5532     [[fallthrough]];
5533   case AMDGPU::DS_GWS_SEMA_V:
5534   case AMDGPU::DS_GWS_SEMA_P:
5535   case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5536     // A s_waitcnt 0 is required to be the instruction immediately following.
5537     if (getSubtarget()->hasGWSAutoReplay()) {
5538       bundleInstWithWaitcnt(MI);
5539       return BB;
5540     }
5541 
5542     return emitGWSMemViolTestLoop(MI, BB);
5543   case AMDGPU::S_SETREG_B32: {
5544     // Try to optimize cases that only set the denormal mode or rounding mode.
5545     //
5546     // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5547     // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5548     // instead.
5549     //
5550     // FIXME: This could be predicates on the immediate, but tablegen doesn't
5551     // allow you to have a no side effect instruction in the output of a
5552     // sideeffecting pattern.
5553     auto [ID, Offset, Width] =
5554         AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5555     if (ID != AMDGPU::Hwreg::ID_MODE)
5556       return BB;
5557 
5558     const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5559     const unsigned SetMask = WidthMask << Offset;
5560 
5561     if (getSubtarget()->hasDenormModeInst()) {
5562       unsigned SetDenormOp = 0;
5563       unsigned SetRoundOp = 0;
5564 
5565       // The dedicated instructions can only set the whole denorm or round mode
5566       // at once, not a subset of bits in either.
5567       if (SetMask ==
5568           (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5569         // If this fully sets both the round and denorm mode, emit the two
5570         // dedicated instructions for these.
5571         SetRoundOp = AMDGPU::S_ROUND_MODE;
5572         SetDenormOp = AMDGPU::S_DENORM_MODE;
5573       } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5574         SetRoundOp = AMDGPU::S_ROUND_MODE;
5575       } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5576         SetDenormOp = AMDGPU::S_DENORM_MODE;
5577       }
5578 
5579       if (SetRoundOp || SetDenormOp) {
5580         MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5581         MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5582         if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5583           unsigned ImmVal = Def->getOperand(1).getImm();
5584           if (SetRoundOp) {
5585             BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5586                 .addImm(ImmVal & 0xf);
5587 
5588             // If we also have the denorm mode, get just the denorm mode bits.
5589             ImmVal >>= 4;
5590           }
5591 
5592           if (SetDenormOp) {
5593             BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5594                 .addImm(ImmVal & 0xf);
5595           }
5596 
5597           MI.eraseFromParent();
5598           return BB;
5599         }
5600       }
5601     }
5602 
5603     // If only FP bits are touched, used the no side effects pseudo.
5604     if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5605                     AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5606       MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5607 
5608     return BB;
5609   }
5610   case AMDGPU::S_INVERSE_BALLOT_U32:
5611   case AMDGPU::S_INVERSE_BALLOT_U64:
5612     // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5613     // necessary. After that they are equivalent to a COPY.
5614     MI.setDesc(TII->get(AMDGPU::COPY));
5615     return BB;
5616   case AMDGPU::ENDPGM_TRAP: {
5617     const DebugLoc &DL = MI.getDebugLoc();
5618     if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5619       MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5620       MI.addOperand(MachineOperand::CreateImm(0));
5621       return BB;
5622     }
5623 
5624     // We need a block split to make the real endpgm a terminator. We also don't
5625     // want to break phis in successor blocks, so we can't just delete to the
5626     // end of the block.
5627 
5628     MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5629     MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5630     MF->push_back(TrapBB);
5631     // clang-format off
5632     BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5633         .addImm(0);
5634     BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5635         .addMBB(TrapBB);
5636     // clang-format on
5637 
5638     BB->addSuccessor(TrapBB);
5639     MI.eraseFromParent();
5640     return SplitBB;
5641   }
5642   case AMDGPU::SIMULATED_TRAP: {
5643     assert(Subtarget->hasPrivEnabledTrap2NopBug());
5644     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5645     MachineBasicBlock *SplitBB =
5646         TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5647     MI.eraseFromParent();
5648     return SplitBB;
5649   }
5650   default:
5651     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5652       if (!MI.mayStore())
5653         AddMemOpInit(MI);
5654       return BB;
5655     }
5656     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5657   }
5658 }
5659 
5660 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5661   // This currently forces unfolding various combinations of fsub into fma with
5662   // free fneg'd operands. As long as we have fast FMA (controlled by
5663   // isFMAFasterThanFMulAndFAdd), we should perform these.
5664 
5665   // When fma is quarter rate, for f64 where add / sub are at best half rate,
5666   // most of these combines appear to be cycle neutral but save on instruction
5667   // count / code size.
5668   return true;
5669 }
5670 
5671 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5672 
5673 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5674                                          EVT VT) const {
5675   if (!VT.isVector()) {
5676     return MVT::i1;
5677   }
5678   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5679 }
5680 
5681 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5682   // TODO: Should i16 be used always if legal? For now it would force VALU
5683   // shifts.
5684   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5685 }
5686 
5687 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5688   return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5689              ? Ty.changeElementSize(16)
5690              : Ty.changeElementSize(32);
5691 }
5692 
5693 // Answering this is somewhat tricky and depends on the specific device which
5694 // have different rates for fma or all f64 operations.
5695 //
5696 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5697 // regardless of which device (although the number of cycles differs between
5698 // devices), so it is always profitable for f64.
5699 //
5700 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5701 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
5702 // which we can always do even without fused FP ops since it returns the same
5703 // result as the separate operations and since it is always full
5704 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5705 // however does not support denormals, so we do report fma as faster if we have
5706 // a fast fma device and require denormals.
5707 //
5708 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5709                                                   EVT VT) const {
5710   VT = VT.getScalarType();
5711 
5712   switch (VT.getSimpleVT().SimpleTy) {
5713   case MVT::f32: {
5714     // If mad is not available this depends only on if f32 fma is full rate.
5715     if (!Subtarget->hasMadMacF32Insts())
5716       return Subtarget->hasFastFMAF32();
5717 
5718     // Otherwise f32 mad is always full rate and returns the same result as
5719     // the separate operations so should be preferred over fma.
5720     // However does not support denormals.
5721     if (!denormalModeIsFlushAllF32(MF))
5722       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5723 
5724     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5725     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5726   }
5727   case MVT::f64:
5728     return true;
5729   case MVT::f16:
5730     return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5731   default:
5732     break;
5733   }
5734 
5735   return false;
5736 }
5737 
5738 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5739                                                   LLT Ty) const {
5740   switch (Ty.getScalarSizeInBits()) {
5741   case 16:
5742     return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5743   case 32:
5744     return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5745   case 64:
5746     return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5747   default:
5748     break;
5749   }
5750 
5751   return false;
5752 }
5753 
5754 // Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for
5755 // specific details.
5756 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
5757                                                   Type *Ty) const {
5758   switch (Ty->getScalarSizeInBits()) {
5759   case 16: {
5760     SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget);
5761     return Subtarget->has16BitInsts() &&
5762            Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
5763   }
5764   case 32: {
5765     if (!Subtarget->hasMadMacF32Insts())
5766       return Subtarget->hasFastFMAF32();
5767 
5768     SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget);
5769     if (Mode.FP32Denormals != DenormalMode::getPreserveSign())
5770       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5771 
5772     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5773   }
5774   case 64:
5775     return true;
5776   default:
5777     break;
5778   }
5779 
5780   return false;
5781 }
5782 
5783 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5784   if (!Ty.isScalar())
5785     return false;
5786 
5787   if (Ty.getScalarSizeInBits() == 16)
5788     return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5789   if (Ty.getScalarSizeInBits() == 32)
5790     return Subtarget->hasMadMacF32Insts() &&
5791            denormalModeIsFlushAllF32(*MI.getMF());
5792 
5793   return false;
5794 }
5795 
5796 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5797                                    const SDNode *N) const {
5798   // TODO: Check future ftz flag
5799   // v_mad_f32/v_mac_f32 do not support denormals.
5800   EVT VT = N->getValueType(0);
5801   if (VT == MVT::f32)
5802     return Subtarget->hasMadMacF32Insts() &&
5803            denormalModeIsFlushAllF32(DAG.getMachineFunction());
5804   if (VT == MVT::f16) {
5805     return Subtarget->hasMadF16() &&
5806            denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5807   }
5808 
5809   return false;
5810 }
5811 
5812 //===----------------------------------------------------------------------===//
5813 // Custom DAG Lowering Operations
5814 //===----------------------------------------------------------------------===//
5815 
5816 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5817 // wider vector type is legal.
5818 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5819                                              SelectionDAG &DAG) const {
5820   unsigned Opc = Op.getOpcode();
5821   EVT VT = Op.getValueType();
5822   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5823          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5824          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5825          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5826 
5827   auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
5828 
5829   SDLoc SL(Op);
5830   SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
5831   SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
5832 
5833   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5834 }
5835 
5836 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5837 // wider vector type is legal.
5838 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5839                                               SelectionDAG &DAG) const {
5840   unsigned Opc = Op.getOpcode();
5841   EVT VT = Op.getValueType();
5842   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5843          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5844          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5845          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5846 
5847   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
5848   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5849 
5850   SDLoc SL(Op);
5851 
5852   SDValue OpLo =
5853       DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
5854   SDValue OpHi =
5855       DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
5856 
5857   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5858 }
5859 
5860 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5861                                                SelectionDAG &DAG) const {
5862   unsigned Opc = Op.getOpcode();
5863   EVT VT = Op.getValueType();
5864   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5865          VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5866          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5867          VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5868          VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5869          VT == MVT::v32bf16);
5870 
5871   SDValue Op0 = Op.getOperand(0);
5872   auto [Lo0, Hi0] = Op0.getValueType().isVector()
5873                         ? DAG.SplitVectorOperand(Op.getNode(), 0)
5874                         : std::pair(Op0, Op0);
5875 
5876   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
5877   auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
5878 
5879   SDLoc SL(Op);
5880   auto ResVT = DAG.GetSplitDestVTs(VT);
5881 
5882   SDValue OpLo =
5883       DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
5884   SDValue OpHi =
5885       DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
5886 
5887   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5888 }
5889 
5890 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5891   switch (Op.getOpcode()) {
5892   default:
5893     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5894   case ISD::BRCOND:
5895     return LowerBRCOND(Op, DAG);
5896   case ISD::RETURNADDR:
5897     return LowerRETURNADDR(Op, DAG);
5898   case ISD::LOAD: {
5899     SDValue Result = LowerLOAD(Op, DAG);
5900     assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
5901            "Load should return a value and a chain");
5902     return Result;
5903   }
5904   case ISD::FSQRT: {
5905     EVT VT = Op.getValueType();
5906     if (VT == MVT::f32)
5907       return lowerFSQRTF32(Op, DAG);
5908     if (VT == MVT::f64)
5909       return lowerFSQRTF64(Op, DAG);
5910     return SDValue();
5911   }
5912   case ISD::FSIN:
5913   case ISD::FCOS:
5914     return LowerTrig(Op, DAG);
5915   case ISD::SELECT:
5916     return LowerSELECT(Op, DAG);
5917   case ISD::FDIV:
5918     return LowerFDIV(Op, DAG);
5919   case ISD::FFREXP:
5920     return LowerFFREXP(Op, DAG);
5921   case ISD::ATOMIC_CMP_SWAP:
5922     return LowerATOMIC_CMP_SWAP(Op, DAG);
5923   case ISD::STORE:
5924     return LowerSTORE(Op, DAG);
5925   case ISD::GlobalAddress: {
5926     MachineFunction &MF = DAG.getMachineFunction();
5927     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5928     return LowerGlobalAddress(MFI, Op, DAG);
5929   }
5930   case ISD::INTRINSIC_WO_CHAIN:
5931     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5932   case ISD::INTRINSIC_W_CHAIN:
5933     return LowerINTRINSIC_W_CHAIN(Op, DAG);
5934   case ISD::INTRINSIC_VOID:
5935     return LowerINTRINSIC_VOID(Op, DAG);
5936   case ISD::ADDRSPACECAST:
5937     return lowerADDRSPACECAST(Op, DAG);
5938   case ISD::INSERT_SUBVECTOR:
5939     return lowerINSERT_SUBVECTOR(Op, DAG);
5940   case ISD::INSERT_VECTOR_ELT:
5941     return lowerINSERT_VECTOR_ELT(Op, DAG);
5942   case ISD::EXTRACT_VECTOR_ELT:
5943     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5944   case ISD::VECTOR_SHUFFLE:
5945     return lowerVECTOR_SHUFFLE(Op, DAG);
5946   case ISD::SCALAR_TO_VECTOR:
5947     return lowerSCALAR_TO_VECTOR(Op, DAG);
5948   case ISD::BUILD_VECTOR:
5949     return lowerBUILD_VECTOR(Op, DAG);
5950   case ISD::FP_ROUND:
5951   case ISD::STRICT_FP_ROUND:
5952     return lowerFP_ROUND(Op, DAG);
5953   case ISD::TRAP:
5954     return lowerTRAP(Op, DAG);
5955   case ISD::DEBUGTRAP:
5956     return lowerDEBUGTRAP(Op, DAG);
5957   case ISD::ABS:
5958   case ISD::FABS:
5959   case ISD::FNEG:
5960   case ISD::FCANONICALIZE:
5961   case ISD::BSWAP:
5962     return splitUnaryVectorOp(Op, DAG);
5963   case ISD::FMINNUM:
5964   case ISD::FMAXNUM:
5965     return lowerFMINNUM_FMAXNUM(Op, DAG);
5966   case ISD::FLDEXP:
5967   case ISD::STRICT_FLDEXP:
5968     return lowerFLDEXP(Op, DAG);
5969   case ISD::FMA:
5970     return splitTernaryVectorOp(Op, DAG);
5971   case ISD::FP_TO_SINT:
5972   case ISD::FP_TO_UINT:
5973     return LowerFP_TO_INT(Op, DAG);
5974   case ISD::SHL:
5975   case ISD::SRA:
5976   case ISD::SRL:
5977   case ISD::ADD:
5978   case ISD::SUB:
5979   case ISD::SMIN:
5980   case ISD::SMAX:
5981   case ISD::UMIN:
5982   case ISD::UMAX:
5983   case ISD::FADD:
5984   case ISD::FMUL:
5985   case ISD::FMINNUM_IEEE:
5986   case ISD::FMAXNUM_IEEE:
5987   case ISD::FMINIMUM:
5988   case ISD::FMAXIMUM:
5989   case ISD::FMINIMUMNUM:
5990   case ISD::FMAXIMUMNUM:
5991   case ISD::UADDSAT:
5992   case ISD::USUBSAT:
5993   case ISD::SADDSAT:
5994   case ISD::SSUBSAT:
5995     return splitBinaryVectorOp(Op, DAG);
5996   case ISD::MUL:
5997     return lowerMUL(Op, DAG);
5998   case ISD::SMULO:
5999   case ISD::UMULO:
6000     return lowerXMULO(Op, DAG);
6001   case ISD::SMUL_LOHI:
6002   case ISD::UMUL_LOHI:
6003     return lowerXMUL_LOHI(Op, DAG);
6004   case ISD::DYNAMIC_STACKALLOC:
6005     return LowerDYNAMIC_STACKALLOC(Op, DAG);
6006   case ISD::STACKSAVE:
6007     return LowerSTACKSAVE(Op, DAG);
6008   case ISD::GET_ROUNDING:
6009     return lowerGET_ROUNDING(Op, DAG);
6010   case ISD::SET_ROUNDING:
6011     return lowerSET_ROUNDING(Op, DAG);
6012   case ISD::PREFETCH:
6013     return lowerPREFETCH(Op, DAG);
6014   case ISD::FP_EXTEND:
6015   case ISD::STRICT_FP_EXTEND:
6016     return lowerFP_EXTEND(Op, DAG);
6017   case ISD::GET_FPENV:
6018     return lowerGET_FPENV(Op, DAG);
6019   case ISD::SET_FPENV:
6020     return lowerSET_FPENV(Op, DAG);
6021   }
6022   return SDValue();
6023 }
6024 
6025 // Used for D16: Casts the result of an instruction into the right vector,
6026 // packs values if loads return unpacked values.
6027 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6028                                        const SDLoc &DL, SelectionDAG &DAG,
6029                                        bool Unpacked) {
6030   if (!LoadVT.isVector())
6031     return Result;
6032 
6033   // Cast back to the original packed type or to a larger type that is a
6034   // multiple of 32 bit for D16. Widening the return type is a required for
6035   // legalization.
6036   EVT FittingLoadVT = LoadVT;
6037   if ((LoadVT.getVectorNumElements() % 2) == 1) {
6038     FittingLoadVT =
6039         EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6040                          LoadVT.getVectorNumElements() + 1);
6041   }
6042 
6043   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6044     // Truncate to v2i16/v4i16.
6045     EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6046 
6047     // Workaround legalizer not scalarizing truncate after vector op
6048     // legalization but not creating intermediate vector trunc.
6049     SmallVector<SDValue, 4> Elts;
6050     DAG.ExtractVectorElements(Result, Elts);
6051     for (SDValue &Elt : Elts)
6052       Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6053 
6054     // Pad illegal v1i16/v3fi6 to v4i16
6055     if ((LoadVT.getVectorNumElements() % 2) == 1)
6056       Elts.push_back(DAG.getUNDEF(MVT::i16));
6057 
6058     Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6059 
6060     // Bitcast to original type (v2f16/v4f16).
6061     return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6062   }
6063 
6064   // Cast back to the original packed type.
6065   return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6066 }
6067 
6068 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6069                                               SelectionDAG &DAG,
6070                                               ArrayRef<SDValue> Ops,
6071                                               bool IsIntrinsic) const {
6072   SDLoc DL(M);
6073 
6074   bool Unpacked = Subtarget->hasUnpackedD16VMem();
6075   EVT LoadVT = M->getValueType(0);
6076 
6077   EVT EquivLoadVT = LoadVT;
6078   if (LoadVT.isVector()) {
6079     if (Unpacked) {
6080       EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6081                                      LoadVT.getVectorNumElements());
6082     } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6083       // Widen v3f16 to legal type
6084       EquivLoadVT =
6085           EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6086                            LoadVT.getVectorNumElements() + 1);
6087     }
6088   }
6089 
6090   // Change from v4f16/v2f16 to EquivLoadVT.
6091   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6092 
6093   SDValue Load = DAG.getMemIntrinsicNode(
6094       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6095       M->getMemoryVT(), M->getMemOperand());
6096 
6097   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6098 
6099   return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6100 }
6101 
6102 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6103                                              SelectionDAG &DAG,
6104                                              ArrayRef<SDValue> Ops) const {
6105   SDLoc DL(M);
6106   EVT LoadVT = M->getValueType(0);
6107   EVT EltType = LoadVT.getScalarType();
6108   EVT IntVT = LoadVT.changeTypeToInteger();
6109 
6110   bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6111 
6112   assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6113   bool IsTFE = M->getNumValues() == 3;
6114 
6115   unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6116                                    : AMDGPUISD::BUFFER_LOAD_FORMAT)
6117                  : IsTFE  ? AMDGPUISD::BUFFER_LOAD_TFE
6118                           : AMDGPUISD::BUFFER_LOAD;
6119 
6120   if (IsD16) {
6121     return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6122   }
6123 
6124   // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6125   if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6126     return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6127                                       IsTFE);
6128 
6129   if (isTypeLegal(LoadVT)) {
6130     return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6131                                M->getMemOperand(), DAG);
6132   }
6133 
6134   EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6135   SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6136   SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6137                                         M->getMemOperand(), DAG);
6138   return DAG.getMergeValues(
6139       {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6140       DL);
6141 }
6142 
6143 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6144                                   SelectionDAG &DAG) {
6145   EVT VT = N->getValueType(0);
6146   unsigned CondCode = N->getConstantOperandVal(3);
6147   if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6148     return DAG.getUNDEF(VT);
6149 
6150   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6151 
6152   SDValue LHS = N->getOperand(1);
6153   SDValue RHS = N->getOperand(2);
6154 
6155   SDLoc DL(N);
6156 
6157   EVT CmpVT = LHS.getValueType();
6158   if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6159     unsigned PromoteOp =
6160         ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6161     LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6162     RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6163   }
6164 
6165   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6166 
6167   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6168   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6169 
6170   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6171                               DAG.getCondCode(CCOpcode));
6172   if (VT.bitsEq(CCVT))
6173     return SetCC;
6174   return DAG.getZExtOrTrunc(SetCC, DL, VT);
6175 }
6176 
6177 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6178                                   SelectionDAG &DAG) {
6179   EVT VT = N->getValueType(0);
6180 
6181   unsigned CondCode = N->getConstantOperandVal(3);
6182   if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6183     return DAG.getUNDEF(VT);
6184 
6185   SDValue Src0 = N->getOperand(1);
6186   SDValue Src1 = N->getOperand(2);
6187   EVT CmpVT = Src0.getValueType();
6188   SDLoc SL(N);
6189 
6190   if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6191     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6192     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6193   }
6194 
6195   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6196   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6197   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6198   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6199   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6200                               DAG.getCondCode(CCOpcode));
6201   if (VT.bitsEq(CCVT))
6202     return SetCC;
6203   return DAG.getZExtOrTrunc(SetCC, SL, VT);
6204 }
6205 
6206 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6207                                     SelectionDAG &DAG) {
6208   EVT VT = N->getValueType(0);
6209   SDValue Src = N->getOperand(1);
6210   SDLoc SL(N);
6211 
6212   if (Src.getOpcode() == ISD::SETCC) {
6213     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6214     return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6215                        Src.getOperand(1), Src.getOperand(2));
6216   }
6217   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6218     // (ballot 0) -> 0
6219     if (Arg->isZero())
6220       return DAG.getConstant(0, SL, VT);
6221 
6222     // (ballot 1) -> EXEC/EXEC_LO
6223     if (Arg->isOne()) {
6224       Register Exec;
6225       if (VT.getScalarSizeInBits() == 32)
6226         Exec = AMDGPU::EXEC_LO;
6227       else if (VT.getScalarSizeInBits() == 64)
6228         Exec = AMDGPU::EXEC;
6229       else
6230         return SDValue();
6231 
6232       return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6233     }
6234   }
6235 
6236   // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6237   // ISD::SETNE)
6238   return DAG.getNode(
6239       AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6240       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6241 }
6242 
6243 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6244                            SelectionDAG &DAG) {
6245   EVT VT = N->getValueType(0);
6246   unsigned ValSize = VT.getSizeInBits();
6247   unsigned IID = N->getConstantOperandVal(0);
6248   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6249                       IID == Intrinsic::amdgcn_permlanex16;
6250   bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6251                        IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6252   SDLoc SL(N);
6253   MVT IntVT = MVT::getIntegerVT(ValSize);
6254   const GCNSubtarget *ST = TLI.getSubtarget();
6255   unsigned SplitSize = 32;
6256   if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6257       ST->hasDPALU_DPP() &&
6258       AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6259     SplitSize = 64;
6260 
6261   auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6262                                           SDValue Src2, MVT ValT) -> SDValue {
6263     SmallVector<SDValue, 8> Operands;
6264     switch (IID) {
6265     case Intrinsic::amdgcn_permlane16:
6266     case Intrinsic::amdgcn_permlanex16:
6267     case Intrinsic::amdgcn_update_dpp:
6268       Operands.push_back(N->getOperand(6));
6269       Operands.push_back(N->getOperand(5));
6270       Operands.push_back(N->getOperand(4));
6271       [[fallthrough]];
6272     case Intrinsic::amdgcn_writelane:
6273       Operands.push_back(Src2);
6274       [[fallthrough]];
6275     case Intrinsic::amdgcn_readlane:
6276     case Intrinsic::amdgcn_set_inactive:
6277     case Intrinsic::amdgcn_set_inactive_chain_arg:
6278     case Intrinsic::amdgcn_mov_dpp8:
6279       Operands.push_back(Src1);
6280       [[fallthrough]];
6281     case Intrinsic::amdgcn_readfirstlane:
6282     case Intrinsic::amdgcn_permlane64:
6283       Operands.push_back(Src0);
6284       break;
6285     default:
6286       llvm_unreachable("unhandled lane op");
6287     }
6288 
6289     Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6290     std::reverse(Operands.begin(), Operands.end());
6291 
6292     if (SDNode *GL = N->getGluedNode()) {
6293       assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6294       GL = GL->getOperand(0).getNode();
6295       Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6296                                      SDValue(GL, 0)));
6297     }
6298 
6299     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6300   };
6301 
6302   SDValue Src0 = N->getOperand(1);
6303   SDValue Src1, Src2;
6304   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6305       IID == Intrinsic::amdgcn_mov_dpp8 ||
6306       IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6307     Src1 = N->getOperand(2);
6308     if (IID == Intrinsic::amdgcn_writelane ||
6309         IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6310       Src2 = N->getOperand(3);
6311   }
6312 
6313   if (ValSize == SplitSize) {
6314     // Already legal
6315     return SDValue();
6316   }
6317 
6318   if (ValSize < 32) {
6319     bool IsFloat = VT.isFloatingPoint();
6320     Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6321                                 SL, MVT::i32);
6322 
6323     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6324       Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6325                                   SL, MVT::i32);
6326     }
6327 
6328     if (IID == Intrinsic::amdgcn_writelane) {
6329       Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6330                                   SL, MVT::i32);
6331     }
6332 
6333     SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6334     SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6335     return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6336   }
6337 
6338   if (ValSize % SplitSize != 0)
6339     return SDValue();
6340 
6341   auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6342     EVT VT = N->getValueType(0);
6343     unsigned NE = VT.getVectorNumElements();
6344     EVT EltVT = VT.getVectorElementType();
6345     SmallVector<SDValue, 8> Scalars;
6346     unsigned NumOperands = N->getNumOperands();
6347     SmallVector<SDValue, 4> Operands(NumOperands);
6348     SDNode *GL = N->getGluedNode();
6349 
6350     // only handle convergencectrl_glue
6351     assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6352 
6353     for (unsigned i = 0; i != NE; ++i) {
6354       for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6355            ++j) {
6356         SDValue Operand = N->getOperand(j);
6357         EVT OperandVT = Operand.getValueType();
6358         if (OperandVT.isVector()) {
6359           // A vector operand; extract a single element.
6360           EVT OperandEltVT = OperandVT.getVectorElementType();
6361           Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6362                                     Operand, DAG.getVectorIdxConstant(i, SL));
6363         } else {
6364           // A scalar operand; just use it as is.
6365           Operands[j] = Operand;
6366         }
6367       }
6368 
6369       if (GL)
6370         Operands[NumOperands - 1] =
6371             DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6372                         SDValue(GL->getOperand(0).getNode(), 0));
6373 
6374       Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6375     }
6376 
6377     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6378     return DAG.getBuildVector(VecVT, SL, Scalars);
6379   };
6380 
6381   if (VT.isVector()) {
6382     switch (MVT::SimpleValueType EltTy =
6383                 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6384     case MVT::i32:
6385     case MVT::f32:
6386       if (SplitSize == 32) {
6387         SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6388         return unrollLaneOp(LaneOp.getNode());
6389       }
6390       [[fallthrough]];
6391     case MVT::i16:
6392     case MVT::f16:
6393     case MVT::bf16: {
6394       unsigned SubVecNumElt =
6395           SplitSize / VT.getVectorElementType().getSizeInBits();
6396       MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6397       SmallVector<SDValue, 4> Pieces;
6398       SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6399       for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6400         Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6401                                  DAG.getConstant(EltIdx, SL, MVT::i32));
6402 
6403         if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6404             IsPermLane16)
6405           Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6406                                    DAG.getConstant(EltIdx, SL, MVT::i32));
6407 
6408         if (IID == Intrinsic::amdgcn_writelane)
6409           Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6410                                    DAG.getConstant(EltIdx, SL, MVT::i32));
6411 
6412         Pieces.push_back(
6413             IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6414                 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6415                 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6416         EltIdx += SubVecNumElt;
6417       }
6418       return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6419     }
6420     default:
6421       // Handle all other cases by bitcasting to i32 vectors
6422       break;
6423     }
6424   }
6425 
6426   MVT VecVT =
6427       MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6428   Src0 = DAG.getBitcast(VecVT, Src0);
6429 
6430   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6431     Src1 = DAG.getBitcast(VecVT, Src1);
6432 
6433   if (IID == Intrinsic::amdgcn_writelane)
6434     Src2 = DAG.getBitcast(VecVT, Src2);
6435 
6436   SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6437   SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6438   return DAG.getBitcast(VT, UnrolledLaneOp);
6439 }
6440 
6441 void SITargetLowering::ReplaceNodeResults(SDNode *N,
6442                                           SmallVectorImpl<SDValue> &Results,
6443                                           SelectionDAG &DAG) const {
6444   switch (N->getOpcode()) {
6445   case ISD::INSERT_VECTOR_ELT: {
6446     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6447       Results.push_back(Res);
6448     return;
6449   }
6450   case ISD::EXTRACT_VECTOR_ELT: {
6451     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6452       Results.push_back(Res);
6453     return;
6454   }
6455   case ISD::INTRINSIC_WO_CHAIN: {
6456     unsigned IID = N->getConstantOperandVal(0);
6457     switch (IID) {
6458     case Intrinsic::amdgcn_make_buffer_rsrc:
6459       Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6460       return;
6461     case Intrinsic::amdgcn_cvt_pkrtz: {
6462       SDValue Src0 = N->getOperand(1);
6463       SDValue Src1 = N->getOperand(2);
6464       SDLoc SL(N);
6465       SDValue Cvt =
6466           DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6467       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6468       return;
6469     }
6470     case Intrinsic::amdgcn_cvt_pknorm_i16:
6471     case Intrinsic::amdgcn_cvt_pknorm_u16:
6472     case Intrinsic::amdgcn_cvt_pk_i16:
6473     case Intrinsic::amdgcn_cvt_pk_u16: {
6474       SDValue Src0 = N->getOperand(1);
6475       SDValue Src1 = N->getOperand(2);
6476       SDLoc SL(N);
6477       unsigned Opcode;
6478 
6479       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6480         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6481       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6482         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6483       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6484         Opcode = AMDGPUISD::CVT_PK_I16_I32;
6485       else
6486         Opcode = AMDGPUISD::CVT_PK_U16_U32;
6487 
6488       EVT VT = N->getValueType(0);
6489       if (isTypeLegal(VT))
6490         Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6491       else {
6492         SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6493         Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6494       }
6495       return;
6496     }
6497     case Intrinsic::amdgcn_s_buffer_load: {
6498       // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6499       // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6500       // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6501       // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6502       // s_buffer_load_i8.
6503       if (!Subtarget->hasScalarSubwordLoads())
6504         return;
6505       SDValue Op = SDValue(N, 0);
6506       SDValue Rsrc = Op.getOperand(1);
6507       SDValue Offset = Op.getOperand(2);
6508       SDValue CachePolicy = Op.getOperand(3);
6509       EVT VT = Op.getValueType();
6510       assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6511       SDLoc DL(Op);
6512       MachineFunction &MF = DAG.getMachineFunction();
6513       const DataLayout &DataLayout = DAG.getDataLayout();
6514       Align Alignment =
6515           DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6516       MachineMemOperand *MMO = MF.getMachineMemOperand(
6517           MachinePointerInfo(),
6518           MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6519               MachineMemOperand::MOInvariant,
6520           VT.getStoreSize(), Alignment);
6521       SDValue LoadVal;
6522       if (!Offset->isDivergent()) {
6523         SDValue Ops[] = {Rsrc, // source register
6524                          Offset, CachePolicy};
6525         SDValue BufferLoad =
6526             DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6527                                     DAG.getVTList(MVT::i32), Ops, VT, MMO);
6528         LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6529       } else {
6530         SDValue Ops[] = {
6531             DAG.getEntryNode(),                    // Chain
6532             Rsrc,                                  // rsrc
6533             DAG.getConstant(0, DL, MVT::i32),      // vindex
6534             {},                                    // voffset
6535             {},                                    // soffset
6536             {},                                    // offset
6537             CachePolicy,                           // cachepolicy
6538             DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6539         };
6540         setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6541         LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6542       }
6543       Results.push_back(LoadVal);
6544       return;
6545     }
6546     }
6547     break;
6548   }
6549   case ISD::INTRINSIC_W_CHAIN: {
6550     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6551       if (Res.getOpcode() == ISD::MERGE_VALUES) {
6552         // FIXME: Hacky
6553         for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6554           Results.push_back(Res.getOperand(I));
6555         }
6556       } else {
6557         Results.push_back(Res);
6558         Results.push_back(Res.getValue(1));
6559       }
6560       return;
6561     }
6562 
6563     break;
6564   }
6565   case ISD::SELECT: {
6566     SDLoc SL(N);
6567     EVT VT = N->getValueType(0);
6568     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6569     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6570     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6571 
6572     EVT SelectVT = NewVT;
6573     if (NewVT.bitsLT(MVT::i32)) {
6574       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6575       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6576       SelectVT = MVT::i32;
6577     }
6578 
6579     SDValue NewSelect =
6580         DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6581 
6582     if (NewVT != SelectVT)
6583       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6584     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6585     return;
6586   }
6587   case ISD::FNEG: {
6588     if (N->getValueType(0) != MVT::v2f16)
6589       break;
6590 
6591     SDLoc SL(N);
6592     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6593 
6594     SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6595                              DAG.getConstant(0x80008000, SL, MVT::i32));
6596     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6597     return;
6598   }
6599   case ISD::FABS: {
6600     if (N->getValueType(0) != MVT::v2f16)
6601       break;
6602 
6603     SDLoc SL(N);
6604     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6605 
6606     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6607                              DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6608     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6609     return;
6610   }
6611   case ISD::FSQRT: {
6612     if (N->getValueType(0) != MVT::f16)
6613       break;
6614     Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6615     break;
6616   }
6617   default:
6618     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6619     break;
6620   }
6621 }
6622 
6623 /// Helper function for LowerBRCOND
6624 static SDNode *findUser(SDValue Value, unsigned Opcode) {
6625 
6626   for (SDUse &U : Value->uses()) {
6627     if (U.get() != Value)
6628       continue;
6629 
6630     if (U.getUser()->getOpcode() == Opcode)
6631       return U.getUser();
6632   }
6633   return nullptr;
6634 }
6635 
6636 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6637   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6638     switch (Intr->getConstantOperandVal(1)) {
6639     case Intrinsic::amdgcn_if:
6640       return AMDGPUISD::IF;
6641     case Intrinsic::amdgcn_else:
6642       return AMDGPUISD::ELSE;
6643     case Intrinsic::amdgcn_loop:
6644       return AMDGPUISD::LOOP;
6645     case Intrinsic::amdgcn_end_cf:
6646       llvm_unreachable("should not occur");
6647     default:
6648       return 0;
6649     }
6650   }
6651 
6652   // break, if_break, else_break are all only used as inputs to loop, not
6653   // directly as branch conditions.
6654   return 0;
6655 }
6656 
6657 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6658   const Triple &TT = getTargetMachine().getTargetTriple();
6659   return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6660           GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6661          AMDGPU::shouldEmitConstantsToTextSection(TT);
6662 }
6663 
6664 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6665   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6666     return false;
6667 
6668   // FIXME: Either avoid relying on address space here or change the default
6669   // address space for functions to avoid the explicit check.
6670   return (GV->getValueType()->isFunctionTy() ||
6671           !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6672          !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6673 }
6674 
6675 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6676   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6677 }
6678 
6679 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6680   if (!GV->hasExternalLinkage())
6681     return true;
6682 
6683   const auto OS = getTargetMachine().getTargetTriple().getOS();
6684   return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6685 }
6686 
6687 /// This transforms the control flow intrinsics to get the branch destination as
6688 /// last parameter, also switches branch target with BR if the need arise
6689 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6690   SDLoc DL(BRCOND);
6691 
6692   SDNode *Intr = BRCOND.getOperand(1).getNode();
6693   SDValue Target = BRCOND.getOperand(2);
6694   SDNode *BR = nullptr;
6695   SDNode *SetCC = nullptr;
6696 
6697   if (Intr->getOpcode() == ISD::SETCC) {
6698     // As long as we negate the condition everything is fine
6699     SetCC = Intr;
6700     Intr = SetCC->getOperand(0).getNode();
6701 
6702   } else {
6703     // Get the target from BR if we don't negate the condition
6704     BR = findUser(BRCOND, ISD::BR);
6705     assert(BR && "brcond missing unconditional branch user");
6706     Target = BR->getOperand(1);
6707   }
6708 
6709   unsigned CFNode = isCFIntrinsic(Intr);
6710   if (CFNode == 0) {
6711     // This is a uniform branch so we don't need to legalize.
6712     return BRCOND;
6713   }
6714 
6715   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6716                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6717 
6718   assert(!SetCC ||
6719          (SetCC->getConstantOperandVal(1) == 1 &&
6720           cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6721               ISD::SETNE));
6722 
6723   // operands of the new intrinsic call
6724   SmallVector<SDValue, 4> Ops;
6725   if (HaveChain)
6726     Ops.push_back(BRCOND.getOperand(0));
6727 
6728   Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6729   Ops.push_back(Target);
6730 
6731   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6732 
6733   // build the new intrinsic call
6734   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6735 
6736   if (!HaveChain) {
6737     SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6738 
6739     Result = DAG.getMergeValues(Ops, DL).getNode();
6740   }
6741 
6742   if (BR) {
6743     // Give the branch instruction our target
6744     SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6745     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6746     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6747   }
6748 
6749   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6750 
6751   // Copy the intrinsic results to registers
6752   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6753     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6754     if (!CopyToReg)
6755       continue;
6756 
6757     Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6758                              SDValue(Result, i - 1), SDValue());
6759 
6760     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6761   }
6762 
6763   // Remove the old intrinsic from the chain
6764   DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6765                                 Intr->getOperand(0));
6766 
6767   return Chain;
6768 }
6769 
6770 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6771   MVT VT = Op.getSimpleValueType();
6772   SDLoc DL(Op);
6773   // Checking the depth
6774   if (Op.getConstantOperandVal(0) != 0)
6775     return DAG.getConstant(0, DL, VT);
6776 
6777   MachineFunction &MF = DAG.getMachineFunction();
6778   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6779   // Check for kernel and shader functions
6780   if (Info->isEntryFunction())
6781     return DAG.getConstant(0, DL, VT);
6782 
6783   MachineFrameInfo &MFI = MF.getFrameInfo();
6784   // There is a call to @llvm.returnaddress in this function
6785   MFI.setReturnAddressIsTaken(true);
6786 
6787   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6788   // Get the return address reg and mark it as an implicit live-in
6789   Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6790                               getRegClassFor(VT, Op.getNode()->isDivergent()));
6791 
6792   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6793 }
6794 
6795 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
6796                                             const SDLoc &DL, EVT VT) const {
6797   return Op.getValueType().bitsLE(VT)
6798              ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
6799              : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6800                            DAG.getTargetConstant(0, DL, MVT::i32));
6801 }
6802 
6803 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6804   assert(Op.getValueType() == MVT::f16 &&
6805          "Do not know how to custom lower FP_ROUND for non-f16 type");
6806 
6807   SDValue Src = Op.getOperand(0);
6808   EVT SrcVT = Src.getValueType();
6809   if (SrcVT != MVT::f64)
6810     return Op;
6811 
6812   // TODO: Handle strictfp
6813   if (Op.getOpcode() != ISD::FP_ROUND)
6814     return Op;
6815 
6816   SDLoc DL(Op);
6817 
6818   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6819   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6820   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6821 }
6822 
6823 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6824                                                SelectionDAG &DAG) const {
6825   EVT VT = Op.getValueType();
6826   const MachineFunction &MF = DAG.getMachineFunction();
6827   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6828   bool IsIEEEMode = Info->getMode().IEEE;
6829 
6830   // FIXME: Assert during selection that this is only selected for
6831   // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6832   // mode functions, but this happens to be OK since it's only done in cases
6833   // where there is known no sNaN.
6834   if (IsIEEEMode)
6835     return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6836 
6837   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6838       VT == MVT::v16bf16)
6839     return splitBinaryVectorOp(Op, DAG);
6840   return Op;
6841 }
6842 
6843 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6844   bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6845   EVT VT = Op.getValueType();
6846   assert(VT == MVT::f16);
6847 
6848   SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6849   EVT ExpVT = Exp.getValueType();
6850   if (ExpVT == MVT::i16)
6851     return Op;
6852 
6853   SDLoc DL(Op);
6854 
6855   // Correct the exponent type for f16 to i16.
6856   // Clamp the range of the exponent to the instruction's range.
6857 
6858   // TODO: This should be a generic narrowing legalization, and can easily be
6859   // for GlobalISel.
6860 
6861   SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
6862   SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6863 
6864   SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
6865   SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6866 
6867   SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6868 
6869   if (IsStrict) {
6870     return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6871                        {Op.getOperand(0), Op.getOperand(1), TruncExp});
6872   }
6873 
6874   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6875 }
6876 
6877 static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
6878   switch (Op->getOpcode()) {
6879   case ISD::SRA:
6880   case ISD::SMIN:
6881   case ISD::SMAX:
6882     return ISD::SIGN_EXTEND;
6883   case ISD::SRL:
6884   case ISD::UMIN:
6885   case ISD::UMAX:
6886     return ISD::ZERO_EXTEND;
6887   case ISD::ADD:
6888   case ISD::SUB:
6889   case ISD::AND:
6890   case ISD::OR:
6891   case ISD::XOR:
6892   case ISD::SHL:
6893   case ISD::SELECT:
6894   case ISD::MUL:
6895     // operation result won't be influenced by garbage high bits.
6896     // TODO: are all of those cases correct, and are there more?
6897     return ISD::ANY_EXTEND;
6898   case ISD::SETCC: {
6899     ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6900     return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6901   }
6902   default:
6903     llvm_unreachable("unexpected opcode!");
6904   }
6905 }
6906 
6907 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6908                                                 DAGCombinerInfo &DCI) const {
6909   const unsigned Opc = Op.getOpcode();
6910   assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6911          Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6912          Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6913          Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6914          Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6915 
6916   EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6917                                  : Op->getOperand(0).getValueType();
6918   auto ExtTy = OpTy.changeElementType(MVT::i32);
6919 
6920   if (DCI.isBeforeLegalizeOps() ||
6921       isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6922     return SDValue();
6923 
6924   auto &DAG = DCI.DAG;
6925 
6926   SDLoc DL(Op);
6927   SDValue LHS;
6928   SDValue RHS;
6929   if (Opc == ISD::SELECT) {
6930     LHS = Op->getOperand(1);
6931     RHS = Op->getOperand(2);
6932   } else {
6933     LHS = Op->getOperand(0);
6934     RHS = Op->getOperand(1);
6935   }
6936 
6937   const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6938   LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6939 
6940   // Special case: for shifts, the RHS always needs a zext.
6941   if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6942     RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6943   else
6944     RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6945 
6946   // setcc always return i1/i1 vec so no need to truncate after.
6947   if (Opc == ISD::SETCC) {
6948     ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6949     return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6950   }
6951 
6952   // For other ops, we extend the operation's return type as well so we need to
6953   // truncate back to the original type.
6954   SDValue NewVal;
6955   if (Opc == ISD::SELECT)
6956     NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6957   else
6958     NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6959 
6960   return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6961 }
6962 
6963 // Custom lowering for vector multiplications and s_mul_u64.
6964 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6965   EVT VT = Op.getValueType();
6966 
6967   // Split vector operands.
6968   if (VT.isVector())
6969     return splitBinaryVectorOp(Op, DAG);
6970 
6971   assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6972 
6973   // There are four ways to lower s_mul_u64:
6974   //
6975   // 1. If all the operands are uniform, then we lower it as it is.
6976   //
6977   // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6978   //    multiplications because there is not a vector equivalent of s_mul_u64.
6979   //
6980   // 3. If the cost model decides that it is more efficient to use vector
6981   //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
6982   //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6983   //
6984   // 4. If the cost model decides to use vector registers and both of the
6985   //    operands are zero-extended/sign-extended from 32-bits, then we split the
6986   //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6987   //    possible to check if the operands are zero-extended or sign-extended in
6988   //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6989   //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6990   //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6991   //    If the cost model decides that we have to use vector registers, then
6992   //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6993   //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6994   //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6995   //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6996   //    SIInstrInfo.cpp .
6997 
6998   if (Op->isDivergent())
6999     return SDValue();
7000 
7001   SDValue Op0 = Op.getOperand(0);
7002   SDValue Op1 = Op.getOperand(1);
7003   // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
7004   // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7005   // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7006   KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7007   unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7008   KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7009   unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7010   SDLoc SL(Op);
7011   if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7012     return SDValue(
7013         DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7014   unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7015   unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7016   if (Op0SignBits >= 33 && Op1SignBits >= 33)
7017     return SDValue(
7018         DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7019   // If all the operands are uniform, then we lower s_mul_u64 as it is.
7020   return Op;
7021 }
7022 
7023 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7024   EVT VT = Op.getValueType();
7025   SDLoc SL(Op);
7026   SDValue LHS = Op.getOperand(0);
7027   SDValue RHS = Op.getOperand(1);
7028   bool isSigned = Op.getOpcode() == ISD::SMULO;
7029 
7030   if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7031     const APInt &C = RHSC->getAPIntValue();
7032     // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7033     if (C.isPowerOf2()) {
7034       // smulo(x, signed_min) is same as umulo(x, signed_min).
7035       bool UseArithShift = isSigned && !C.isMinSignedValue();
7036       SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7037       SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7038       SDValue Overflow =
7039           DAG.getSetCC(SL, MVT::i1,
7040                        DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7041                                    Result, ShiftAmt),
7042                        LHS, ISD::SETNE);
7043       return DAG.getMergeValues({Result, Overflow}, SL);
7044     }
7045   }
7046 
7047   SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7048   SDValue Top =
7049       DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7050 
7051   SDValue Sign = isSigned
7052                      ? DAG.getNode(ISD::SRA, SL, VT, Result,
7053                                    DAG.getConstant(VT.getScalarSizeInBits() - 1,
7054                                                    SL, MVT::i32))
7055                      : DAG.getConstant(0, SL, VT);
7056   SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7057 
7058   return DAG.getMergeValues({Result, Overflow}, SL);
7059 }
7060 
7061 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7062   if (Op->isDivergent()) {
7063     // Select to V_MAD_[IU]64_[IU]32.
7064     return Op;
7065   }
7066   if (Subtarget->hasSMulHi()) {
7067     // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7068     return SDValue();
7069   }
7070   // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7071   // calculate the high part, so we might as well do the whole thing with
7072   // V_MAD_[IU]64_[IU]32.
7073   return Op;
7074 }
7075 
7076 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7077   if (!Subtarget->isTrapHandlerEnabled() ||
7078       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7079     return lowerTrapEndpgm(Op, DAG);
7080 
7081   return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7082                                             : lowerTrapHsaQueuePtr(Op, DAG);
7083 }
7084 
7085 SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7086   SDLoc SL(Op);
7087   SDValue Chain = Op.getOperand(0);
7088   return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7089 }
7090 
7091 SDValue
7092 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7093                                              const SDLoc &DL, Align Alignment,
7094                                              ImplicitParameter Param) const {
7095   MachineFunction &MF = DAG.getMachineFunction();
7096   uint64_t Offset = getImplicitParameterOffset(MF, Param);
7097   SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7098   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7099   return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7100                      MachineMemOperand::MODereferenceable |
7101                          MachineMemOperand::MOInvariant);
7102 }
7103 
7104 SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7105                                                SelectionDAG &DAG) const {
7106   SDLoc SL(Op);
7107   SDValue Chain = Op.getOperand(0);
7108 
7109   SDValue QueuePtr;
7110   // For code object version 5, QueuePtr is passed through implicit kernarg.
7111   const Module *M = DAG.getMachineFunction().getFunction().getParent();
7112   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7113     QueuePtr =
7114         loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7115   } else {
7116     MachineFunction &MF = DAG.getMachineFunction();
7117     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7118     Register UserSGPR = Info->getQueuePtrUserSGPR();
7119 
7120     if (UserSGPR == AMDGPU::NoRegister) {
7121       // We probably are in a function incorrectly marked with
7122       // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7123       // trap, so just use a null pointer.
7124       QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7125     } else {
7126       QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7127                                       MVT::i64);
7128     }
7129   }
7130 
7131   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7132   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7133 
7134   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7135   SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7136                    ToReg.getValue(1)};
7137   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7138 }
7139 
7140 SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7141   SDLoc SL(Op);
7142   SDValue Chain = Op.getOperand(0);
7143 
7144   // We need to simulate the 's_trap 2' instruction on targets that run in
7145   // PRIV=1 (where it is treated as a nop).
7146   if (Subtarget->hasPrivEnabledTrap2NopBug())
7147     return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7148 
7149   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7150   SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7151   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7152 }
7153 
7154 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7155   SDLoc SL(Op);
7156   SDValue Chain = Op.getOperand(0);
7157   MachineFunction &MF = DAG.getMachineFunction();
7158 
7159   if (!Subtarget->isTrapHandlerEnabled() ||
7160       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7161     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
7162                                      "debugtrap handler not supported",
7163                                      Op.getDebugLoc(), DS_Warning);
7164     LLVMContext &Ctx = MF.getFunction().getContext();
7165     Ctx.diagnose(NoTrap);
7166     return Chain;
7167   }
7168 
7169   uint64_t TrapID =
7170       static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7171   SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7172   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7173 }
7174 
7175 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7176                                              SelectionDAG &DAG) const {
7177   if (Subtarget->hasApertureRegs()) {
7178     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7179                                        ? AMDGPU::SRC_SHARED_BASE
7180                                        : AMDGPU::SRC_PRIVATE_BASE;
7181     // Note: this feature (register) is broken. When used as a 32-bit operand,
7182     // it returns a wrong value (all zeroes?). The real value is in the upper 32
7183     // bits.
7184     //
7185     // To work around the issue, directly emit a 64 bit mov from this register
7186     // then extract the high bits. Note that this shouldn't even result in a
7187     // shift being emitted and simply become a pair of registers (e.g.):
7188     //    s_mov_b64 s[6:7], src_shared_base
7189     //    v_mov_b32_e32 v1, s7
7190     //
7191     // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7192     // coalescing would kick in and it would think it's okay to use the "HI"
7193     // subregister directly (instead of extracting the HI 32 bits) which is an
7194     // artificial (unusable) register.
7195     //  Register TableGen definitions would need an overhaul to get rid of the
7196     //  artificial "HI" aperture registers and prevent this kind of issue from
7197     //  happening.
7198     SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7199                                      DAG.getRegister(ApertureRegNo, MVT::i64));
7200     return DAG.getNode(
7201         ISD::TRUNCATE, DL, MVT::i32,
7202         DAG.getNode(ISD::SRL, DL, MVT::i64,
7203                     {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7204   }
7205 
7206   // For code object version 5, private_base and shared_base are passed through
7207   // implicit kernargs.
7208   const Module *M = DAG.getMachineFunction().getFunction().getParent();
7209   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7210     ImplicitParameter Param =
7211         (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7212     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7213   }
7214 
7215   MachineFunction &MF = DAG.getMachineFunction();
7216   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7217   Register UserSGPR = Info->getQueuePtrUserSGPR();
7218   if (UserSGPR == AMDGPU::NoRegister) {
7219     // We probably are in a function incorrectly marked with
7220     // amdgpu-no-queue-ptr. This is undefined.
7221     return DAG.getUNDEF(MVT::i32);
7222   }
7223 
7224   SDValue QueuePtr =
7225       CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7226 
7227   // Offset into amd_queue_t for group_segment_aperture_base_hi /
7228   // private_segment_aperture_base_hi.
7229   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7230 
7231   SDValue Ptr =
7232       DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7233 
7234   // TODO: Use custom target PseudoSourceValue.
7235   // TODO: We should use the value from the IR intrinsic call, but it might not
7236   // be available and how do we get it?
7237   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7238   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7239                      commonAlignment(Align(64), StructOffset),
7240                      MachineMemOperand::MODereferenceable |
7241                          MachineMemOperand::MOInvariant);
7242 }
7243 
7244 /// Return true if the value is a known valid address, such that a null check is
7245 /// not necessary.
7246 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7247                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7248   if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7249       isa<BasicBlockSDNode>(Val))
7250     return true;
7251 
7252   if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7253     return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7254 
7255   // TODO: Search through arithmetic, handle arguments and loads
7256   // marked nonnull.
7257   return false;
7258 }
7259 
7260 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7261                                              SelectionDAG &DAG) const {
7262   SDLoc SL(Op);
7263 
7264   const AMDGPUTargetMachine &TM =
7265       static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7266 
7267   unsigned DestAS, SrcAS;
7268   SDValue Src;
7269   bool IsNonNull = false;
7270   if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7271     SrcAS = ASC->getSrcAddressSpace();
7272     Src = ASC->getOperand(0);
7273     DestAS = ASC->getDestAddressSpace();
7274   } else {
7275     assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7276            Op.getConstantOperandVal(0) ==
7277                Intrinsic::amdgcn_addrspacecast_nonnull);
7278     Src = Op->getOperand(1);
7279     SrcAS = Op->getConstantOperandVal(2);
7280     DestAS = Op->getConstantOperandVal(3);
7281     IsNonNull = true;
7282   }
7283 
7284   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7285 
7286   // flat -> local/private
7287   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7288     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7289         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7290       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7291 
7292       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7293         return Ptr;
7294 
7295       unsigned NullVal = TM.getNullPointerValue(DestAS);
7296       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7297       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7298 
7299       return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7300                          SegmentNullPtr);
7301     }
7302   }
7303 
7304   // local/private -> flat
7305   if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7306     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7307         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7308 
7309       SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7310       SDValue CvtPtr =
7311           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7312       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7313 
7314       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7315         return CvtPtr;
7316 
7317       unsigned NullVal = TM.getNullPointerValue(SrcAS);
7318       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7319 
7320       SDValue NonNull =
7321           DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7322 
7323       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7324                          FlatNullPtr);
7325     }
7326   }
7327 
7328   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7329       Op.getValueType() == MVT::i64) {
7330     const SIMachineFunctionInfo *Info =
7331         DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7332     SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7333     SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7334     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7335   }
7336 
7337   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7338       Src.getValueType() == MVT::i64)
7339     return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7340 
7341   // global <-> flat are no-ops and never emitted.
7342 
7343   const MachineFunction &MF = DAG.getMachineFunction();
7344   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7345       MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7346   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7347 
7348   return DAG.getUNDEF(Op->getValueType(0));
7349 }
7350 
7351 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7352 // the small vector and inserting them into the big vector. That is better than
7353 // the default expansion of doing it via a stack slot. Even though the use of
7354 // the stack slot would be optimized away afterwards, the stack slot itself
7355 // remains.
7356 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7357                                                 SelectionDAG &DAG) const {
7358   SDValue Vec = Op.getOperand(0);
7359   SDValue Ins = Op.getOperand(1);
7360   SDValue Idx = Op.getOperand(2);
7361   EVT VecVT = Vec.getValueType();
7362   EVT InsVT = Ins.getValueType();
7363   EVT EltVT = VecVT.getVectorElementType();
7364   unsigned InsNumElts = InsVT.getVectorNumElements();
7365   unsigned IdxVal = Idx->getAsZExtVal();
7366   SDLoc SL(Op);
7367 
7368   if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7369     // Insert 32-bit registers at a time.
7370     assert(InsNumElts % 2 == 0 && "expect legal vector types");
7371 
7372     unsigned VecNumElts = VecVT.getVectorNumElements();
7373     EVT NewVecVT =
7374         EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7375     EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7376                                    : EVT::getVectorVT(*DAG.getContext(),
7377                                                       MVT::i32, InsNumElts / 2);
7378 
7379     Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7380     Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7381 
7382     for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7383       SDValue Elt;
7384       if (InsNumElts == 2) {
7385         Elt = Ins;
7386       } else {
7387         Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7388                           DAG.getConstant(I, SL, MVT::i32));
7389       }
7390       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7391                         DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7392     }
7393 
7394     return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7395   }
7396 
7397   for (unsigned I = 0; I != InsNumElts; ++I) {
7398     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7399                               DAG.getConstant(I, SL, MVT::i32));
7400     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7401                       DAG.getConstant(IdxVal + I, SL, MVT::i32));
7402   }
7403   return Vec;
7404 }
7405 
7406 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7407                                                  SelectionDAG &DAG) const {
7408   SDValue Vec = Op.getOperand(0);
7409   SDValue InsVal = Op.getOperand(1);
7410   SDValue Idx = Op.getOperand(2);
7411   EVT VecVT = Vec.getValueType();
7412   EVT EltVT = VecVT.getVectorElementType();
7413   unsigned VecSize = VecVT.getSizeInBits();
7414   unsigned EltSize = EltVT.getSizeInBits();
7415   SDLoc SL(Op);
7416 
7417   // Specially handle the case of v4i16 with static indexing.
7418   unsigned NumElts = VecVT.getVectorNumElements();
7419   auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7420   if (NumElts == 4 && EltSize == 16 && KIdx) {
7421     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7422 
7423     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7424                                  DAG.getConstant(0, SL, MVT::i32));
7425     SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7426                                  DAG.getConstant(1, SL, MVT::i32));
7427 
7428     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7429     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7430 
7431     unsigned Idx = KIdx->getZExtValue();
7432     bool InsertLo = Idx < 2;
7433     SDValue InsHalf = DAG.getNode(
7434         ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7435         DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7436         DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7437 
7438     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7439 
7440     SDValue Concat =
7441         InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7442                  : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7443 
7444     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7445   }
7446 
7447   // Static indexing does not lower to stack access, and hence there is no need
7448   // for special custom lowering to avoid stack access.
7449   if (isa<ConstantSDNode>(Idx))
7450     return SDValue();
7451 
7452   // Avoid stack access for dynamic indexing by custom lowering to
7453   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7454 
7455   assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7456 
7457   MVT IntVT = MVT::getIntegerVT(VecSize);
7458 
7459   // Convert vector index to bit-index and get the required bit mask.
7460   assert(isPowerOf2_32(EltSize));
7461   const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7462   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7463   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7464   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7465                             DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7466 
7467   // 1. Create a congruent vector with the target value in each element.
7468   SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7469                                DAG.getSplatBuildVector(VecVT, SL, InsVal));
7470 
7471   // 2. Mask off all other indices except the required index within (1).
7472   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7473 
7474   // 3. Mask off the required index within the target vector.
7475   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7476   SDValue RHS =
7477       DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7478 
7479   // 4. Get (2) and (3) ORed into the target vector.
7480   SDValue BFI =
7481       DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7482 
7483   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7484 }
7485 
7486 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7487                                                   SelectionDAG &DAG) const {
7488   SDLoc SL(Op);
7489 
7490   EVT ResultVT = Op.getValueType();
7491   SDValue Vec = Op.getOperand(0);
7492   SDValue Idx = Op.getOperand(1);
7493   EVT VecVT = Vec.getValueType();
7494   unsigned VecSize = VecVT.getSizeInBits();
7495   EVT EltVT = VecVT.getVectorElementType();
7496 
7497   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7498 
7499   // Make sure we do any optimizations that will make it easier to fold
7500   // source modifiers before obscuring it with bit operations.
7501 
7502   // XXX - Why doesn't this get called when vector_shuffle is expanded?
7503   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7504     return Combined;
7505 
7506   if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7507     SDValue Lo, Hi;
7508     auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7509 
7510     if (VecSize == 128) {
7511       SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7512       Lo = DAG.getBitcast(LoVT,
7513                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7514                                       DAG.getConstant(0, SL, MVT::i32)));
7515       Hi = DAG.getBitcast(HiVT,
7516                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7517                                       DAG.getConstant(1, SL, MVT::i32)));
7518     } else if (VecSize == 256) {
7519       SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7520       SDValue Parts[4];
7521       for (unsigned P = 0; P < 4; ++P) {
7522         Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7523                                DAG.getConstant(P, SL, MVT::i32));
7524       }
7525 
7526       Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7527                                             Parts[0], Parts[1]));
7528       Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7529                                             Parts[2], Parts[3]));
7530     } else {
7531       assert(VecSize == 512);
7532 
7533       SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7534       SDValue Parts[8];
7535       for (unsigned P = 0; P < 8; ++P) {
7536         Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7537                                DAG.getConstant(P, SL, MVT::i32));
7538       }
7539 
7540       Lo = DAG.getBitcast(LoVT,
7541                           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7542                                       Parts[0], Parts[1], Parts[2], Parts[3]));
7543       Hi = DAG.getBitcast(HiVT,
7544                           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7545                                       Parts[4], Parts[5], Parts[6], Parts[7]));
7546     }
7547 
7548     EVT IdxVT = Idx.getValueType();
7549     unsigned NElem = VecVT.getVectorNumElements();
7550     assert(isPowerOf2_32(NElem));
7551     SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7552     SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7553     SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7554     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7555   }
7556 
7557   assert(VecSize <= 64);
7558 
7559   MVT IntVT = MVT::getIntegerVT(VecSize);
7560 
7561   // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7562   SDValue VecBC = peekThroughBitcasts(Vec);
7563   if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7564     SDValue Src = VecBC.getOperand(0);
7565     Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7566     Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7567   }
7568 
7569   unsigned EltSize = EltVT.getSizeInBits();
7570   assert(isPowerOf2_32(EltSize));
7571 
7572   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7573 
7574   // Convert vector index to bit-index (* EltSize)
7575   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7576 
7577   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7578   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7579 
7580   if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7581     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7582     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7583   }
7584 
7585   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7586 }
7587 
7588 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7589   assert(Elt % 2 == 0);
7590   return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7591 }
7592 
7593 static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7594   assert(Elt % 2 == 0);
7595   return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7596          !(Mask[Elt + 1] & 1);
7597 }
7598 
7599 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7600                                               SelectionDAG &DAG) const {
7601   SDLoc SL(Op);
7602   EVT ResultVT = Op.getValueType();
7603   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7604   MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7605   const int NewSrcNumElts = 2;
7606   MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
7607   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7608 
7609   // Break up the shuffle into registers sized pieces.
7610   //
7611   // We're trying to form sub-shuffles that the register allocation pipeline
7612   // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
7613   // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
7614   // pair of copies into a consecutive register copy, so use the ordinary
7615   // extract_vector_elt lowering unless we can use the shuffle.
7616   //
7617   // TODO: This is a bit of hack, and we should probably always use
7618   // extract_subvector for the largest possible subvector we can (or at least
7619   // use it for PackVT aligned pieces). However we have worse support for
7620   // combines on them don't directly treat extract_subvector / insert_subvector
7621   // as legal. The DAG scheduler also ends up doing a worse job with the
7622   // extract_subvectors.
7623   const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
7624 
7625   // vector_shuffle <0,1,6,7> lhs, rhs
7626   // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7627   //
7628   // vector_shuffle <6,7,2,3> lhs, rhs
7629   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7630   //
7631   // vector_shuffle <6,7,0,1> lhs, rhs
7632   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7633 
7634   // Avoid scalarizing when both halves are reading from consecutive elements.
7635 
7636   // If we're treating 2 element shuffles as legal, also create odd-to-even
7637   // shuffles of neighboring pairs.
7638   //
7639   // vector_shuffle <3,2,7,6> lhs, rhs
7640   //  -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
7641   //                    vector_shuffle <1, 0> (extract_subvector rhs, 2)
7642 
7643   SmallVector<SDValue, 16> Pieces;
7644   for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7645     if (ShouldUseConsecutiveExtract &&
7646         elementPairIsContiguous(SVN->getMask(), I)) {
7647       const int Idx = SVN->getMaskElt(I);
7648       int VecIdx = Idx < SrcNumElts ? 0 : 1;
7649       int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7650       SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7651                                    SVN->getOperand(VecIdx),
7652                                    DAG.getConstant(EltIdx, SL, MVT::i32));
7653       Pieces.push_back(SubVec);
7654     } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
7655                isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
7656       int Idx0 = SVN->getMaskElt(I);
7657       int Idx1 = SVN->getMaskElt(I + 1);
7658 
7659       SDValue SrcOp0 = SVN->getOperand(0);
7660       SDValue SrcOp1 = SrcOp0;
7661       if (Idx0 >= SrcNumElts) {
7662         SrcOp0 = SVN->getOperand(1);
7663         Idx0 -= SrcNumElts;
7664       }
7665 
7666       if (Idx1 >= SrcNumElts) {
7667         SrcOp1 = SVN->getOperand(1);
7668         Idx1 -= SrcNumElts;
7669       }
7670 
7671       int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
7672       int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
7673 
7674       // Extract nearest even aligned piece.
7675       SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
7676                                     DAG.getConstant(AlignedIdx0, SL, MVT::i32));
7677       SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
7678                                     DAG.getConstant(AlignedIdx1, SL, MVT::i32));
7679 
7680       int NewMaskIdx0 = Idx0 - AlignedIdx0;
7681       int NewMaskIdx1 = Idx1 - AlignedIdx1;
7682 
7683       SDValue Result0 = SubVec0;
7684       SDValue Result1 = SubVec0;
7685 
7686       if (SubVec0 != SubVec1) {
7687         NewMaskIdx1 += NewSrcNumElts;
7688         Result1 = SubVec1;
7689       } else {
7690         Result1 = DAG.getUNDEF(PackVT);
7691       }
7692 
7693       SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
7694                                           {NewMaskIdx0, NewMaskIdx1});
7695       Pieces.push_back(Shuf);
7696     } else {
7697       const int Idx0 = SVN->getMaskElt(I);
7698       const int Idx1 = SVN->getMaskElt(I + 1);
7699       int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7700       int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7701       int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7702       int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7703 
7704       SDValue Vec0 = SVN->getOperand(VecIdx0);
7705       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
7706                                  DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
7707 
7708       SDValue Vec1 = SVN->getOperand(VecIdx1);
7709       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
7710                                  DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
7711       Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
7712     }
7713   }
7714 
7715   return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7716 }
7717 
7718 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7719                                                 SelectionDAG &DAG) const {
7720   SDValue SVal = Op.getOperand(0);
7721   EVT ResultVT = Op.getValueType();
7722   EVT SValVT = SVal.getValueType();
7723   SDValue UndefVal = DAG.getUNDEF(SValVT);
7724   SDLoc SL(Op);
7725 
7726   SmallVector<SDValue, 8> VElts;
7727   VElts.push_back(SVal);
7728   for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7729     VElts.push_back(UndefVal);
7730 
7731   return DAG.getBuildVector(ResultVT, SL, VElts);
7732 }
7733 
7734 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7735                                             SelectionDAG &DAG) const {
7736   SDLoc SL(Op);
7737   EVT VT = Op.getValueType();
7738 
7739   if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7740     assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7741 
7742     SDValue Lo = Op.getOperand(0);
7743     SDValue Hi = Op.getOperand(1);
7744 
7745     // Avoid adding defined bits with the zero_extend.
7746     if (Hi.isUndef()) {
7747       Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7748       SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7749       return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7750     }
7751 
7752     Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7753     Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7754 
7755     SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7756                                 DAG.getConstant(16, SL, MVT::i32));
7757     if (Lo.isUndef())
7758       return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7759 
7760     Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7761     Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7762 
7763     SDValue Or =
7764         DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
7765     return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7766   }
7767 
7768   // Split into 2-element chunks.
7769   const unsigned NumParts = VT.getVectorNumElements() / 2;
7770   EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
7771   MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7772 
7773   SmallVector<SDValue> Casts;
7774   for (unsigned P = 0; P < NumParts; ++P) {
7775     SDValue Vec = DAG.getBuildVector(
7776         PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7777     Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7778   }
7779 
7780   SDValue Blend =
7781       DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7782   return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7783 }
7784 
7785 bool SITargetLowering::isOffsetFoldingLegal(
7786     const GlobalAddressSDNode *GA) const {
7787   // OSes that use ELF REL relocations (instead of RELA) can only store a
7788   // 32-bit addend in the instruction, so it is not safe to allow offset folding
7789   // which can create arbitrary 64-bit addends. (This is only a problem for
7790   // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7791   // the high 32 bits of the addend.)
7792   //
7793   // This should be kept in sync with how HasRelocationAddend is initialized in
7794   // the constructor of ELFAMDGPUAsmBackend.
7795   if (!Subtarget->isAmdHsaOS())
7796     return false;
7797 
7798   // We can fold offsets for anything that doesn't require a GOT relocation.
7799   return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7800           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7801           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7802          !shouldEmitGOTReloc(GA->getGlobal());
7803 }
7804 
7805 static SDValue
7806 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7807                         const SDLoc &DL, int64_t Offset, EVT PtrVT,
7808                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
7809   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7810   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7811   // lowered to the following code sequence:
7812   //
7813   // For constant address space:
7814   //   s_getpc_b64 s[0:1]
7815   //   s_add_u32 s0, s0, $symbol
7816   //   s_addc_u32 s1, s1, 0
7817   //
7818   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
7819   //   a fixup or relocation is emitted to replace $symbol with a literal
7820   //   constant, which is a pc-relative offset from the encoding of the $symbol
7821   //   operand to the global variable.
7822   //
7823   // For global address space:
7824   //   s_getpc_b64 s[0:1]
7825   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7826   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7827   //
7828   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
7829   //   fixups or relocations are emitted to replace $symbol@*@lo and
7830   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7831   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
7832   //   operand to the global variable.
7833   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7834   SDValue PtrHi;
7835   if (GAFlags == SIInstrInfo::MO_NONE)
7836     PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7837   else
7838     PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7839   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7840 }
7841 
7842 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7843                                              SDValue Op,
7844                                              SelectionDAG &DAG) const {
7845   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7846   SDLoc DL(GSD);
7847   EVT PtrVT = Op.getValueType();
7848 
7849   const GlobalValue *GV = GSD->getGlobal();
7850   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7851        shouldUseLDSConstAddress(GV)) ||
7852       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7853       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7854     if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7855         GV->hasExternalLinkage()) {
7856       Type *Ty = GV->getValueType();
7857       // HIP uses an unsized array `extern __shared__ T s[]` or similar
7858       // zero-sized type in other languages to declare the dynamic shared
7859       // memory which size is not known at the compile time. They will be
7860       // allocated by the runtime and placed directly after the static
7861       // allocated ones. They all share the same offset.
7862       if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7863         assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7864         // Adjust alignment for that dynamic shared memory array.
7865         Function &F = DAG.getMachineFunction().getFunction();
7866         MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7867         MFI->setUsesDynamicLDS(true);
7868         return SDValue(
7869             DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7870       }
7871     }
7872     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7873   }
7874 
7875   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7876     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7877                                             SIInstrInfo::MO_ABS32_LO);
7878     return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7879   }
7880 
7881   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7882     SDValue AddrLo = DAG.getTargetGlobalAddress(
7883         GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7884     AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7885 
7886     SDValue AddrHi = DAG.getTargetGlobalAddress(
7887         GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7888     AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7889 
7890     return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7891   }
7892 
7893   if (shouldEmitFixup(GV))
7894     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7895 
7896   if (shouldEmitPCReloc(GV))
7897     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7898                                    SIInstrInfo::MO_REL32);
7899 
7900   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7901                                             SIInstrInfo::MO_GOTPCREL32);
7902   PointerType *PtrTy =
7903       PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
7904   const DataLayout &DataLayout = DAG.getDataLayout();
7905   Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7906   MachinePointerInfo PtrInfo =
7907       MachinePointerInfo::getGOT(DAG.getMachineFunction());
7908 
7909   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7910                      MachineMemOperand::MODereferenceable |
7911                          MachineMemOperand::MOInvariant);
7912 }
7913 
7914 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7915                                    const SDLoc &DL, SDValue V) const {
7916   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7917   // the destination register.
7918   //
7919   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7920   // so we will end up with redundant moves to m0.
7921   //
7922   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7923 
7924   // A Null SDValue creates a glue result.
7925   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7926                                   V, Chain);
7927   return SDValue(M0, 0);
7928 }
7929 
7930 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
7931                                                  MVT VT,
7932                                                  unsigned Offset) const {
7933   SDLoc SL(Op);
7934   SDValue Param = lowerKernargMemParameter(
7935       DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7936   // The local size values will have the hi 16-bits as zero.
7937   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7938                      DAG.getValueType(VT));
7939 }
7940 
7941 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7942                                         EVT VT) {
7943   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7944                                       "non-hsa intrinsic with hsa target",
7945                                       DL.getDebugLoc());
7946   DAG.getContext()->diagnose(BadIntrin);
7947   return DAG.getUNDEF(VT);
7948 }
7949 
7950 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7951                                          EVT VT) {
7952   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7953                                       "intrinsic not supported on subtarget",
7954                                       DL.getDebugLoc());
7955   DAG.getContext()->diagnose(BadIntrin);
7956   return DAG.getUNDEF(VT);
7957 }
7958 
7959 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7960                                     ArrayRef<SDValue> Elts) {
7961   assert(!Elts.empty());
7962   MVT Type;
7963   unsigned NumElts = Elts.size();
7964 
7965   if (NumElts <= 12) {
7966     Type = MVT::getVectorVT(MVT::f32, NumElts);
7967   } else {
7968     assert(Elts.size() <= 16);
7969     Type = MVT::v16f32;
7970     NumElts = 16;
7971   }
7972 
7973   SmallVector<SDValue, 16> VecElts(NumElts);
7974   for (unsigned i = 0; i < Elts.size(); ++i) {
7975     SDValue Elt = Elts[i];
7976     if (Elt.getValueType() != MVT::f32)
7977       Elt = DAG.getBitcast(MVT::f32, Elt);
7978     VecElts[i] = Elt;
7979   }
7980   for (unsigned i = Elts.size(); i < NumElts; ++i)
7981     VecElts[i] = DAG.getUNDEF(MVT::f32);
7982 
7983   if (NumElts == 1)
7984     return VecElts[0];
7985   return DAG.getBuildVector(Type, DL, VecElts);
7986 }
7987 
7988 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7989                               SDValue Src, int ExtraElts) {
7990   EVT SrcVT = Src.getValueType();
7991 
7992   SmallVector<SDValue, 8> Elts;
7993 
7994   if (SrcVT.isVector())
7995     DAG.ExtractVectorElements(Src, Elts);
7996   else
7997     Elts.push_back(Src);
7998 
7999   SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
8000   while (ExtraElts--)
8001     Elts.push_back(Undef);
8002 
8003   return DAG.getBuildVector(CastVT, DL, Elts);
8004 }
8005 
8006 // Re-construct the required return value for a image load intrinsic.
8007 // This is more complicated due to the optional use TexFailCtrl which means the
8008 // required return type is an aggregate
8009 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8010                                  ArrayRef<EVT> ResultTypes, bool IsTexFail,
8011                                  bool Unpacked, bool IsD16, int DMaskPop,
8012                                  int NumVDataDwords, bool IsAtomicPacked16Bit,
8013                                  const SDLoc &DL) {
8014   // Determine the required return type. This is the same regardless of
8015   // IsTexFail flag
8016   EVT ReqRetVT = ResultTypes[0];
8017   int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8018   int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8019                           ? (ReqRetNumElts + 1) / 2
8020                           : ReqRetNumElts;
8021 
8022   int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8023 
8024   MVT DataDwordVT =
8025       NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8026 
8027   MVT MaskPopVT =
8028       MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8029 
8030   SDValue Data(Result, 0);
8031   SDValue TexFail;
8032 
8033   if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8034     SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8035     if (MaskPopVT.isVector()) {
8036       Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8037                          SDValue(Result, 0), ZeroIdx);
8038     } else {
8039       Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8040                          SDValue(Result, 0), ZeroIdx);
8041     }
8042   }
8043 
8044   if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8045     Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8046                           NumDataDwords - MaskPopDwords);
8047 
8048   if (IsD16)
8049     Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8050 
8051   EVT LegalReqRetVT = ReqRetVT;
8052   if (!ReqRetVT.isVector()) {
8053     if (!Data.getValueType().isInteger())
8054       Data = DAG.getNode(ISD::BITCAST, DL,
8055                          Data.getValueType().changeTypeToInteger(), Data);
8056     Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8057   } else {
8058     // We need to widen the return vector to a legal type
8059     if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8060         ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8061       LegalReqRetVT =
8062           EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
8063                            ReqRetVT.getVectorNumElements() + 1);
8064     }
8065   }
8066   Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8067 
8068   if (IsTexFail) {
8069     TexFail =
8070         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8071                     DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8072 
8073     return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8074   }
8075 
8076   if (Result->getNumValues() == 1)
8077     return Data;
8078 
8079   return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8080 }
8081 
8082 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8083                          SDValue *LWE, bool &IsTexFail) {
8084   auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8085 
8086   uint64_t Value = TexFailCtrlConst->getZExtValue();
8087   if (Value) {
8088     IsTexFail = true;
8089   }
8090 
8091   SDLoc DL(TexFailCtrlConst);
8092   *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8093   Value &= ~(uint64_t)0x1;
8094   *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8095   Value &= ~(uint64_t)0x2;
8096 
8097   return Value == 0;
8098 }
8099 
8100 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8101                                       MVT PackVectorVT,
8102                                       SmallVectorImpl<SDValue> &PackedAddrs,
8103                                       unsigned DimIdx, unsigned EndIdx,
8104                                       unsigned NumGradients) {
8105   SDLoc DL(Op);
8106   for (unsigned I = DimIdx; I < EndIdx; I++) {
8107     SDValue Addr = Op.getOperand(I);
8108 
8109     // Gradients are packed with undef for each coordinate.
8110     // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8111     // 1D: undef,dx/dh; undef,dx/dv
8112     // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8113     // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8114     if (((I + 1) >= EndIdx) ||
8115         ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8116                                          I == DimIdx + NumGradients - 1))) {
8117       if (Addr.getValueType() != MVT::i16)
8118         Addr = DAG.getBitcast(MVT::i16, Addr);
8119       Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8120     } else {
8121       Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8122       I++;
8123     }
8124     Addr = DAG.getBitcast(MVT::f32, Addr);
8125     PackedAddrs.push_back(Addr);
8126   }
8127 }
8128 
8129 SDValue SITargetLowering::lowerImage(SDValue Op,
8130                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
8131                                      SelectionDAG &DAG, bool WithChain) const {
8132   SDLoc DL(Op);
8133   MachineFunction &MF = DAG.getMachineFunction();
8134   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8135   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8136       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8137   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8138   unsigned IntrOpcode = Intr->BaseOpcode;
8139   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8140   bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8141   bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8142 
8143   SmallVector<EVT, 3> ResultTypes(Op->values());
8144   SmallVector<EVT, 3> OrigResultTypes(Op->values());
8145   bool IsD16 = false;
8146   bool IsG16 = false;
8147   bool IsA16 = false;
8148   SDValue VData;
8149   int NumVDataDwords = 0;
8150   bool AdjustRetType = false;
8151   bool IsAtomicPacked16Bit = false;
8152 
8153   // Offset of intrinsic arguments
8154   const unsigned ArgOffset = WithChain ? 2 : 1;
8155 
8156   unsigned DMask;
8157   unsigned DMaskLanes = 0;
8158 
8159   if (BaseOpcode->Atomic) {
8160     VData = Op.getOperand(2);
8161 
8162     IsAtomicPacked16Bit =
8163         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8164          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8165 
8166     bool Is64Bit = VData.getValueSizeInBits() == 64;
8167     if (BaseOpcode->AtomicX2) {
8168       SDValue VData2 = Op.getOperand(3);
8169       VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8170                                  {VData, VData2});
8171       if (Is64Bit)
8172         VData = DAG.getBitcast(MVT::v4i32, VData);
8173 
8174       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8175       DMask = Is64Bit ? 0xf : 0x3;
8176       NumVDataDwords = Is64Bit ? 4 : 2;
8177     } else {
8178       DMask = Is64Bit ? 0x3 : 0x1;
8179       NumVDataDwords = Is64Bit ? 2 : 1;
8180     }
8181   } else {
8182     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8183     DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8184 
8185     if (BaseOpcode->Store) {
8186       VData = Op.getOperand(2);
8187 
8188       MVT StoreVT = VData.getSimpleValueType();
8189       if (StoreVT.getScalarType() == MVT::f16) {
8190         if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8191           return Op; // D16 is unsupported for this instruction
8192 
8193         IsD16 = true;
8194         VData = handleD16VData(VData, DAG, true);
8195       }
8196 
8197       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8198     } else if (!BaseOpcode->NoReturn) {
8199       // Work out the num dwords based on the dmask popcount and underlying type
8200       // and whether packing is supported.
8201       MVT LoadVT = ResultTypes[0].getSimpleVT();
8202       if (LoadVT.getScalarType() == MVT::f16) {
8203         if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8204           return Op; // D16 is unsupported for this instruction
8205 
8206         IsD16 = true;
8207       }
8208 
8209       // Confirm that the return type is large enough for the dmask specified
8210       if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8211           (!LoadVT.isVector() && DMaskLanes > 1))
8212         return Op;
8213 
8214       // The sq block of gfx8 and gfx9 do not estimate register use correctly
8215       // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8216       // instructions.
8217       if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8218           !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8219         NumVDataDwords = (DMaskLanes + 1) / 2;
8220       else
8221         NumVDataDwords = DMaskLanes;
8222 
8223       AdjustRetType = true;
8224     }
8225   }
8226 
8227   unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8228   SmallVector<SDValue, 4> VAddrs;
8229 
8230   // Check for 16 bit addresses or derivatives and pack if true.
8231   MVT VAddrVT =
8232       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8233   MVT VAddrScalarVT = VAddrVT.getScalarType();
8234   MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8235   IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8236 
8237   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8238   VAddrScalarVT = VAddrVT.getScalarType();
8239   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8240   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8241 
8242   // Push back extra arguments.
8243   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8244     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8245       assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8246       // Special handling of bias when A16 is on. Bias is of type half but
8247       // occupies full 32-bit.
8248       SDValue Bias = DAG.getBuildVector(
8249           MVT::v2f16, DL,
8250           {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8251       VAddrs.push_back(Bias);
8252     } else {
8253       assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8254              "Bias needs to be converted to 16 bit in A16 mode");
8255       VAddrs.push_back(Op.getOperand(ArgOffset + I));
8256     }
8257   }
8258 
8259   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8260     // 16 bit gradients are supported, but are tied to the A16 control
8261     // so both gradients and addresses must be 16 bit
8262     LLVM_DEBUG(
8263         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8264                   "require 16 bit args for both gradients and addresses");
8265     return Op;
8266   }
8267 
8268   if (IsA16) {
8269     if (!ST->hasA16()) {
8270       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8271                            "support 16 bit addresses\n");
8272       return Op;
8273     }
8274   }
8275 
8276   // We've dealt with incorrect input so we know that if IsA16, IsG16
8277   // are set then we have to compress/pack operands (either address,
8278   // gradient or both)
8279   // In the case where a16 and gradients are tied (no G16 support) then we
8280   // have already verified that both IsA16 and IsG16 are true
8281   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8282     // Activate g16
8283     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8284         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8285     IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8286   }
8287 
8288   // Add gradients (packed or unpacked)
8289   if (IsG16) {
8290     // Pack the gradients
8291     // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8292     packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8293                               ArgOffset + Intr->GradientStart,
8294                               ArgOffset + Intr->CoordStart, Intr->NumGradients);
8295   } else {
8296     for (unsigned I = ArgOffset + Intr->GradientStart;
8297          I < ArgOffset + Intr->CoordStart; I++)
8298       VAddrs.push_back(Op.getOperand(I));
8299   }
8300 
8301   // Add addresses (packed or unpacked)
8302   if (IsA16) {
8303     packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8304                               ArgOffset + Intr->CoordStart, VAddrEnd,
8305                               0 /* No gradients */);
8306   } else {
8307     // Add uncompressed address
8308     for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8309       VAddrs.push_back(Op.getOperand(I));
8310   }
8311 
8312   // If the register allocator cannot place the address registers contiguously
8313   // without introducing moves, then using the non-sequential address encoding
8314   // is always preferable, since it saves VALU instructions and is usually a
8315   // wash in terms of code size or even better.
8316   //
8317   // However, we currently have no way of hinting to the register allocator that
8318   // MIMG addresses should be placed contiguously when it is possible to do so,
8319   // so force non-NSA for the common 2-address case as a heuristic.
8320   //
8321   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8322   // allocation when possible.
8323   //
8324   // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8325   // set of the remaining addresses.
8326   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8327   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8328   const bool UseNSA = ST->hasNSAEncoding() &&
8329                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
8330                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8331   const bool UsePartialNSA =
8332       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8333 
8334   SDValue VAddr;
8335   if (UsePartialNSA) {
8336     VAddr = getBuildDwordsVector(DAG, DL,
8337                                  ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8338   } else if (!UseNSA) {
8339     VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8340   }
8341 
8342   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8343   SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8344   SDValue Unorm;
8345   if (!BaseOpcode->Sampler) {
8346     Unorm = True;
8347   } else {
8348     uint64_t UnormConst =
8349         Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8350 
8351     Unorm = UnormConst ? True : False;
8352   }
8353 
8354   SDValue TFE;
8355   SDValue LWE;
8356   SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8357   bool IsTexFail = false;
8358   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8359     return Op;
8360 
8361   if (IsTexFail) {
8362     if (!DMaskLanes) {
8363       // Expecting to get an error flag since TFC is on - and dmask is 0
8364       // Force dmask to be at least 1 otherwise the instruction will fail
8365       DMask = 0x1;
8366       DMaskLanes = 1;
8367       NumVDataDwords = 1;
8368     }
8369     NumVDataDwords += 1;
8370     AdjustRetType = true;
8371   }
8372 
8373   // Has something earlier tagged that the return type needs adjusting
8374   // This happens if the instruction is a load or has set TexFailCtrl flags
8375   if (AdjustRetType) {
8376     // NumVDataDwords reflects the true number of dwords required in the return
8377     // type
8378     if (DMaskLanes == 0 && !BaseOpcode->Store) {
8379       // This is a no-op load. This can be eliminated
8380       SDValue Undef = DAG.getUNDEF(Op.getValueType());
8381       if (isa<MemSDNode>(Op))
8382         return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8383       return Undef;
8384     }
8385 
8386     EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8387                                                       MVT::i32, NumVDataDwords)
8388                                    : MVT::i32;
8389 
8390     ResultTypes[0] = NewVT;
8391     if (ResultTypes.size() == 3) {
8392       // Original result was aggregate type used for TexFailCtrl results
8393       // The actual instruction returns as a vector type which has now been
8394       // created. Remove the aggregate result.
8395       ResultTypes.erase(&ResultTypes[1]);
8396     }
8397   }
8398 
8399   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8400   if (BaseOpcode->Atomic)
8401     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8402   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8403                AMDGPU::CPol::VOLATILE))
8404     return Op;
8405 
8406   SmallVector<SDValue, 26> Ops;
8407   if (BaseOpcode->Store || BaseOpcode->Atomic)
8408     Ops.push_back(VData); // vdata
8409   if (UsePartialNSA) {
8410     append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8411     Ops.push_back(VAddr);
8412   } else if (UseNSA)
8413     append_range(Ops, VAddrs);
8414   else
8415     Ops.push_back(VAddr);
8416   SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8417   EVT RsrcVT = Rsrc.getValueType();
8418   if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8419     return Op;
8420   Ops.push_back(Rsrc);
8421   if (BaseOpcode->Sampler) {
8422     SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8423     if (Samp.getValueType() != MVT::v4i32)
8424       return Op;
8425     Ops.push_back(Samp);
8426   }
8427   Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8428   if (IsGFX10Plus)
8429     Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8430   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8431     Ops.push_back(Unorm);
8432   Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8433   Ops.push_back(IsA16 && // r128, a16 for gfx9
8434                         ST->hasFeature(AMDGPU::FeatureR128A16)
8435                     ? True
8436                     : False);
8437   if (IsGFX10Plus)
8438     Ops.push_back(IsA16 ? True : False);
8439   if (!Subtarget->hasGFX90AInsts()) {
8440     Ops.push_back(TFE); // tfe
8441   } else if (TFE->getAsZExtVal()) {
8442     report_fatal_error("TFE is not supported on this GPU");
8443   }
8444   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8445     Ops.push_back(LWE); // lwe
8446   if (!IsGFX10Plus)
8447     Ops.push_back(DimInfo->DA ? True : False);
8448   if (BaseOpcode->HasD16)
8449     Ops.push_back(IsD16 ? True : False);
8450   if (isa<MemSDNode>(Op))
8451     Ops.push_back(Op.getOperand(0)); // chain
8452 
8453   int NumVAddrDwords =
8454       UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8455   int Opcode = -1;
8456 
8457   if (IsGFX12Plus) {
8458     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8459                                    NumVDataDwords, NumVAddrDwords);
8460   } else if (IsGFX11Plus) {
8461     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8462                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
8463                                           : AMDGPU::MIMGEncGfx11Default,
8464                                    NumVDataDwords, NumVAddrDwords);
8465   } else if (IsGFX10Plus) {
8466     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8467                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
8468                                           : AMDGPU::MIMGEncGfx10Default,
8469                                    NumVDataDwords, NumVAddrDwords);
8470   } else {
8471     if (Subtarget->hasGFX90AInsts()) {
8472       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8473                                      NumVDataDwords, NumVAddrDwords);
8474       if (Opcode == -1)
8475         report_fatal_error(
8476             "requested image instruction is not supported on this GPU");
8477     }
8478     if (Opcode == -1 &&
8479         Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8480       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8481                                      NumVDataDwords, NumVAddrDwords);
8482     if (Opcode == -1)
8483       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8484                                      NumVDataDwords, NumVAddrDwords);
8485   }
8486   if (Opcode == -1)
8487     return Op;
8488 
8489   MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8490   if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8491     MachineMemOperand *MemRef = MemOp->getMemOperand();
8492     DAG.setNodeMemRefs(NewNode, {MemRef});
8493   }
8494 
8495   if (BaseOpcode->AtomicX2) {
8496     SmallVector<SDValue, 1> Elt;
8497     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8498     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8499   }
8500   if (BaseOpcode->NoReturn)
8501     return SDValue(NewNode, 0);
8502   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8503                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8504                            NumVDataDwords, IsAtomicPacked16Bit, DL);
8505 }
8506 
8507 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8508                                        SDValue Offset, SDValue CachePolicy,
8509                                        SelectionDAG &DAG) const {
8510   MachineFunction &MF = DAG.getMachineFunction();
8511 
8512   const DataLayout &DataLayout = DAG.getDataLayout();
8513   Align Alignment =
8514       DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8515 
8516   MachineMemOperand *MMO = MF.getMachineMemOperand(
8517       MachinePointerInfo(),
8518       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8519           MachineMemOperand::MOInvariant,
8520       VT.getStoreSize(), Alignment);
8521 
8522   if (!Offset->isDivergent()) {
8523     SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8524 
8525     // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8526     // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8527     // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8528     // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8529     if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8530       SDValue BufferLoad =
8531           DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8532                                   DAG.getVTList(MVT::i32), Ops, VT, MMO);
8533       return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8534     }
8535 
8536     // Widen vec3 load to vec4.
8537     if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8538         !Subtarget->hasScalarDwordx3Loads()) {
8539       EVT WidenedVT =
8540           EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8541       auto WidenedOp = DAG.getMemIntrinsicNode(
8542           AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8543           MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8544       auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8545                                    DAG.getVectorIdxConstant(0, DL));
8546       return Subvector;
8547     }
8548 
8549     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8550                                    DAG.getVTList(VT), Ops, VT, MMO);
8551   }
8552 
8553   // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8554   // assume that the buffer is unswizzled.
8555   SDValue Ops[] = {
8556       DAG.getEntryNode(),                    // Chain
8557       Rsrc,                                  // rsrc
8558       DAG.getConstant(0, DL, MVT::i32),      // vindex
8559       {},                                    // voffset
8560       {},                                    // soffset
8561       {},                                    // offset
8562       CachePolicy,                           // cachepolicy
8563       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8564   };
8565   if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8566     setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8567     return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8568   }
8569 
8570   SmallVector<SDValue, 4> Loads;
8571   unsigned NumLoads = 1;
8572   MVT LoadVT = VT.getSimpleVT();
8573   unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8574   assert((LoadVT.getScalarType() == MVT::i32 ||
8575           LoadVT.getScalarType() == MVT::f32));
8576 
8577   if (NumElts == 8 || NumElts == 16) {
8578     NumLoads = NumElts / 4;
8579     LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8580   }
8581 
8582   SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8583 
8584   // Use the alignment to ensure that the required offsets will fit into the
8585   // immediate offsets.
8586   setBufferOffsets(Offset, DAG, &Ops[3],
8587                    NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8588 
8589   uint64_t InstOffset = Ops[5]->getAsZExtVal();
8590   for (unsigned i = 0; i < NumLoads; ++i) {
8591     Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8592     Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8593                                         LoadVT, MMO, DAG));
8594   }
8595 
8596   if (NumElts == 8 || NumElts == 16)
8597     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8598 
8599   return Loads[0];
8600 }
8601 
8602 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8603   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8604   if (!Subtarget->hasArchitectedSGPRs())
8605     return {};
8606   SDLoc SL(Op);
8607   MVT VT = MVT::i32;
8608   SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8609   return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8610                      DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8611 }
8612 
8613 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8614                                           unsigned Dim,
8615                                           const ArgDescriptor &Arg) const {
8616   SDLoc SL(Op);
8617   MachineFunction &MF = DAG.getMachineFunction();
8618   unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8619   if (MaxID == 0)
8620     return DAG.getConstant(0, SL, MVT::i32);
8621 
8622   SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8623                                SDLoc(DAG.getEntryNode()), Arg);
8624 
8625   // Don't bother inserting AssertZext for packed IDs since we're emitting the
8626   // masking operations anyway.
8627   //
8628   // TODO: We could assert the top bit is 0 for the source copy.
8629   if (Arg.isMasked())
8630     return Val;
8631 
8632   // Preserve the known bits after expansion to a copy.
8633   EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
8634   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8635                      DAG.getValueType(SmallVT));
8636 }
8637 
8638 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8639                                                   SelectionDAG &DAG) const {
8640   MachineFunction &MF = DAG.getMachineFunction();
8641   auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8642 
8643   EVT VT = Op.getValueType();
8644   SDLoc DL(Op);
8645   unsigned IntrinsicID = Op.getConstantOperandVal(0);
8646 
8647   // TODO: Should this propagate fast-math-flags?
8648 
8649   switch (IntrinsicID) {
8650   case Intrinsic::amdgcn_implicit_buffer_ptr: {
8651     if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8652       return emitNonHSAIntrinsicError(DAG, DL, VT);
8653     return getPreloadedValue(DAG, *MFI, VT,
8654                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8655   }
8656   case Intrinsic::amdgcn_dispatch_ptr:
8657   case Intrinsic::amdgcn_queue_ptr: {
8658     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8659       DiagnosticInfoUnsupported BadIntrin(
8660           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8661           DL.getDebugLoc());
8662       DAG.getContext()->diagnose(BadIntrin);
8663       return DAG.getUNDEF(VT);
8664     }
8665 
8666     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
8667                      ? AMDGPUFunctionArgInfo::DISPATCH_PTR
8668                      : AMDGPUFunctionArgInfo::QUEUE_PTR;
8669     return getPreloadedValue(DAG, *MFI, VT, RegID);
8670   }
8671   case Intrinsic::amdgcn_implicitarg_ptr: {
8672     if (MFI->isEntryFunction())
8673       return getImplicitArgPtr(DAG, DL);
8674     return getPreloadedValue(DAG, *MFI, VT,
8675                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8676   }
8677   case Intrinsic::amdgcn_kernarg_segment_ptr: {
8678     if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8679       // This only makes sense to call in a kernel, so just lower to null.
8680       return DAG.getConstant(0, DL, VT);
8681     }
8682 
8683     return getPreloadedValue(DAG, *MFI, VT,
8684                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8685   }
8686   case Intrinsic::amdgcn_dispatch_id: {
8687     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8688   }
8689   case Intrinsic::amdgcn_rcp:
8690     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8691   case Intrinsic::amdgcn_rsq:
8692     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8693   case Intrinsic::amdgcn_rsq_legacy:
8694     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8695       return emitRemovedIntrinsicError(DAG, DL, VT);
8696     return SDValue();
8697   case Intrinsic::amdgcn_rcp_legacy:
8698     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8699       return emitRemovedIntrinsicError(DAG, DL, VT);
8700     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8701   case Intrinsic::amdgcn_rsq_clamp: {
8702     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8703       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8704 
8705     Type *Type = VT.getTypeForEVT(*DAG.getContext());
8706     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8707     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8708 
8709     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8710     SDValue Tmp =
8711         DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
8712     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8713                        DAG.getConstantFP(Min, DL, VT));
8714   }
8715   case Intrinsic::r600_read_ngroups_x:
8716     if (Subtarget->isAmdHsaOS())
8717       return emitNonHSAIntrinsicError(DAG, DL, VT);
8718 
8719     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8720                                     SI::KernelInputOffsets::NGROUPS_X, Align(4),
8721                                     false);
8722   case Intrinsic::r600_read_ngroups_y:
8723     if (Subtarget->isAmdHsaOS())
8724       return emitNonHSAIntrinsicError(DAG, DL, VT);
8725 
8726     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8727                                     SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8728                                     false);
8729   case Intrinsic::r600_read_ngroups_z:
8730     if (Subtarget->isAmdHsaOS())
8731       return emitNonHSAIntrinsicError(DAG, DL, VT);
8732 
8733     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8734                                     SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8735                                     false);
8736   case Intrinsic::r600_read_global_size_x:
8737     if (Subtarget->isAmdHsaOS())
8738       return emitNonHSAIntrinsicError(DAG, DL, VT);
8739 
8740     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8741                                     SI::KernelInputOffsets::GLOBAL_SIZE_X,
8742                                     Align(4), false);
8743   case Intrinsic::r600_read_global_size_y:
8744     if (Subtarget->isAmdHsaOS())
8745       return emitNonHSAIntrinsicError(DAG, DL, VT);
8746 
8747     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8748                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8749                                     Align(4), false);
8750   case Intrinsic::r600_read_global_size_z:
8751     if (Subtarget->isAmdHsaOS())
8752       return emitNonHSAIntrinsicError(DAG, DL, VT);
8753 
8754     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8755                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8756                                     Align(4), false);
8757   case Intrinsic::r600_read_local_size_x:
8758     if (Subtarget->isAmdHsaOS())
8759       return emitNonHSAIntrinsicError(DAG, DL, VT);
8760 
8761     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8762                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
8763   case Intrinsic::r600_read_local_size_y:
8764     if (Subtarget->isAmdHsaOS())
8765       return emitNonHSAIntrinsicError(DAG, DL, VT);
8766 
8767     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8768                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
8769   case Intrinsic::r600_read_local_size_z:
8770     if (Subtarget->isAmdHsaOS())
8771       return emitNonHSAIntrinsicError(DAG, DL, VT);
8772 
8773     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8774                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
8775   case Intrinsic::amdgcn_workgroup_id_x:
8776     return getPreloadedValue(DAG, *MFI, VT,
8777                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8778   case Intrinsic::amdgcn_workgroup_id_y:
8779     return getPreloadedValue(DAG, *MFI, VT,
8780                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8781   case Intrinsic::amdgcn_workgroup_id_z:
8782     return getPreloadedValue(DAG, *MFI, VT,
8783                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8784   case Intrinsic::amdgcn_wave_id:
8785     return lowerWaveID(DAG, Op);
8786   case Intrinsic::amdgcn_lds_kernel_id: {
8787     if (MFI->isEntryFunction())
8788       return getLDSKernelId(DAG, DL);
8789     return getPreloadedValue(DAG, *MFI, VT,
8790                              AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8791   }
8792   case Intrinsic::amdgcn_workitem_id_x:
8793     return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8794   case Intrinsic::amdgcn_workitem_id_y:
8795     return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8796   case Intrinsic::amdgcn_workitem_id_z:
8797     return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8798   case Intrinsic::amdgcn_wavefrontsize:
8799     return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8800                            SDLoc(Op), MVT::i32);
8801   case Intrinsic::amdgcn_s_buffer_load: {
8802     unsigned CPol = Op.getConstantOperandVal(3);
8803     // s_buffer_load, because of how it's optimized, can't be volatile
8804     // so reject ones with the volatile bit set.
8805     if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8806                      ? AMDGPU::CPol::ALL
8807                      : AMDGPU::CPol::ALL_pregfx12))
8808       return Op;
8809     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
8810                         Op.getOperand(3), DAG);
8811   }
8812   case Intrinsic::amdgcn_fdiv_fast:
8813     return lowerFDIV_FAST(Op, DAG);
8814   case Intrinsic::amdgcn_sin:
8815     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8816 
8817   case Intrinsic::amdgcn_cos:
8818     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8819 
8820   case Intrinsic::amdgcn_mul_u24:
8821     return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
8822                        Op.getOperand(2));
8823   case Intrinsic::amdgcn_mul_i24:
8824     return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
8825                        Op.getOperand(2));
8826 
8827   case Intrinsic::amdgcn_log_clamp: {
8828     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8829       return SDValue();
8830 
8831     return emitRemovedIntrinsicError(DAG, DL, VT);
8832   }
8833   case Intrinsic::amdgcn_fract:
8834     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8835 
8836   case Intrinsic::amdgcn_class:
8837     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
8838                        Op.getOperand(2));
8839   case Intrinsic::amdgcn_div_fmas:
8840     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
8841                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8842 
8843   case Intrinsic::amdgcn_div_fixup:
8844     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
8845                        Op.getOperand(2), Op.getOperand(3));
8846 
8847   case Intrinsic::amdgcn_div_scale: {
8848     const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8849 
8850     // Translate to the operands expected by the machine instruction. The
8851     // first parameter must be the same as the first instruction.
8852     SDValue Numerator = Op.getOperand(1);
8853     SDValue Denominator = Op.getOperand(2);
8854 
8855     // Note this order is opposite of the machine instruction's operations,
8856     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8857     // intrinsic has the numerator as the first operand to match a normal
8858     // division operation.
8859 
8860     SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8861 
8862     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8863                        Denominator, Numerator);
8864   }
8865   case Intrinsic::amdgcn_icmp: {
8866     // There is a Pat that handles this variant, so return it as-is.
8867     if (Op.getOperand(1).getValueType() == MVT::i1 &&
8868         Op.getConstantOperandVal(2) == 0 &&
8869         Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8870       return Op;
8871     return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8872   }
8873   case Intrinsic::amdgcn_fcmp: {
8874     return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8875   }
8876   case Intrinsic::amdgcn_ballot:
8877     return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8878   case Intrinsic::amdgcn_fmed3:
8879     return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
8880                        Op.getOperand(2), Op.getOperand(3));
8881   case Intrinsic::amdgcn_fdot2:
8882     return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
8883                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
8884   case Intrinsic::amdgcn_fmul_legacy:
8885     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
8886                        Op.getOperand(2));
8887   case Intrinsic::amdgcn_sffbh:
8888     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8889   case Intrinsic::amdgcn_sbfe:
8890     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
8891                        Op.getOperand(2), Op.getOperand(3));
8892   case Intrinsic::amdgcn_ubfe:
8893     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
8894                        Op.getOperand(2), Op.getOperand(3));
8895   case Intrinsic::amdgcn_cvt_pkrtz:
8896   case Intrinsic::amdgcn_cvt_pknorm_i16:
8897   case Intrinsic::amdgcn_cvt_pknorm_u16:
8898   case Intrinsic::amdgcn_cvt_pk_i16:
8899   case Intrinsic::amdgcn_cvt_pk_u16: {
8900     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8901     EVT VT = Op.getValueType();
8902     unsigned Opcode;
8903 
8904     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8905       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8906     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8907       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8908     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8909       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8910     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8911       Opcode = AMDGPUISD::CVT_PK_I16_I32;
8912     else
8913       Opcode = AMDGPUISD::CVT_PK_U16_U32;
8914 
8915     if (isTypeLegal(VT))
8916       return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8917 
8918     SDValue Node =
8919         DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
8920     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8921   }
8922   case Intrinsic::amdgcn_fmad_ftz:
8923     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8924                        Op.getOperand(2), Op.getOperand(3));
8925 
8926   case Intrinsic::amdgcn_if_break:
8927     return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8928                                       Op->getOperand(1), Op->getOperand(2)),
8929                    0);
8930 
8931   case Intrinsic::amdgcn_groupstaticsize: {
8932     Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8933     if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8934       return Op;
8935 
8936     const Module *M = MF.getFunction().getParent();
8937     const GlobalValue *GV =
8938         Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
8939     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8940                                             SIInstrInfo::MO_ABS32_LO);
8941     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8942   }
8943   case Intrinsic::amdgcn_is_shared:
8944   case Intrinsic::amdgcn_is_private: {
8945     SDLoc SL(Op);
8946     unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
8947                       ? AMDGPUAS::LOCAL_ADDRESS
8948                       : AMDGPUAS::PRIVATE_ADDRESS;
8949     SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8950     SDValue SrcVec =
8951         DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8952 
8953     SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8954                                 DAG.getConstant(1, SL, MVT::i32));
8955     return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8956   }
8957   case Intrinsic::amdgcn_perm:
8958     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8959                        Op.getOperand(2), Op.getOperand(3));
8960   case Intrinsic::amdgcn_reloc_constant: {
8961     Module *M = const_cast<Module *>(MF.getFunction().getParent());
8962     const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8963     auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8964     auto *RelocSymbol = cast<GlobalVariable>(
8965         M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8966     SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8967                                             SIInstrInfo::MO_ABS32_LO);
8968     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8969   }
8970   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8971   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8972   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8973   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8974   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8975   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8976   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8977   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8978     if (Op.getOperand(4).getValueType() == MVT::i32)
8979       return SDValue();
8980 
8981     SDLoc SL(Op);
8982     auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8983     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8984                        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8985                        Op.getOperand(3), IndexKeyi32);
8986   }
8987   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8988   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8989   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8990     if (Op.getOperand(6).getValueType() == MVT::i32)
8991       return SDValue();
8992 
8993     SDLoc SL(Op);
8994     auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8995     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8996                        {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8997                         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8998                         IndexKeyi32, Op.getOperand(7)});
8999   }
9000   case Intrinsic::amdgcn_addrspacecast_nonnull:
9001     return lowerADDRSPACECAST(Op, DAG);
9002   case Intrinsic::amdgcn_readlane:
9003   case Intrinsic::amdgcn_readfirstlane:
9004   case Intrinsic::amdgcn_writelane:
9005   case Intrinsic::amdgcn_permlane16:
9006   case Intrinsic::amdgcn_permlanex16:
9007   case Intrinsic::amdgcn_permlane64:
9008   case Intrinsic::amdgcn_set_inactive:
9009   case Intrinsic::amdgcn_set_inactive_chain_arg:
9010   case Intrinsic::amdgcn_mov_dpp8:
9011   case Intrinsic::amdgcn_update_dpp:
9012     return lowerLaneOp(*this, Op.getNode(), DAG);
9013   default:
9014     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9015             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9016       return lowerImage(Op, ImageDimIntr, DAG, false);
9017 
9018     return Op;
9019   }
9020 }
9021 
9022 // On targets not supporting constant in soffset field, turn zero to
9023 // SGPR_NULL to avoid generating an extra s_mov with zero.
9024 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9025                              const GCNSubtarget *Subtarget) {
9026   if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9027     return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9028   return SOffset;
9029 }
9030 
9031 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9032                                                      SelectionDAG &DAG,
9033                                                      unsigned NewOpcode) const {
9034   SDLoc DL(Op);
9035 
9036   SDValue VData = Op.getOperand(2);
9037   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9038   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9039   auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9040   SDValue Ops[] = {
9041       Op.getOperand(0),                      // Chain
9042       VData,                                 // vdata
9043       Rsrc,                                  // rsrc
9044       DAG.getConstant(0, DL, MVT::i32),      // vindex
9045       VOffset,                               // voffset
9046       SOffset,                               // soffset
9047       Offset,                                // offset
9048       Op.getOperand(6),                      // cachepolicy
9049       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9050   };
9051 
9052   auto *M = cast<MemSDNode>(Op);
9053 
9054   EVT MemVT = VData.getValueType();
9055   return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9056                                  M->getMemOperand());
9057 }
9058 
9059 SDValue
9060 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9061                                                 unsigned NewOpcode) const {
9062   SDLoc DL(Op);
9063 
9064   SDValue VData = Op.getOperand(2);
9065   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9066   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9067   auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9068   SDValue Ops[] = {
9069       Op.getOperand(0),                      // Chain
9070       VData,                                 // vdata
9071       Rsrc,                                  // rsrc
9072       Op.getOperand(4),                      // vindex
9073       VOffset,                               // voffset
9074       SOffset,                               // soffset
9075       Offset,                                // offset
9076       Op.getOperand(7),                      // cachepolicy
9077       DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9078   };
9079 
9080   auto *M = cast<MemSDNode>(Op);
9081 
9082   EVT MemVT = VData.getValueType();
9083   return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9084                                  M->getMemOperand());
9085 }
9086 
9087 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9088                                                  SelectionDAG &DAG) const {
9089   unsigned IntrID = Op.getConstantOperandVal(1);
9090   SDLoc DL(Op);
9091 
9092   switch (IntrID) {
9093   case Intrinsic::amdgcn_ds_ordered_add:
9094   case Intrinsic::amdgcn_ds_ordered_swap: {
9095     MemSDNode *M = cast<MemSDNode>(Op);
9096     SDValue Chain = M->getOperand(0);
9097     SDValue M0 = M->getOperand(2);
9098     SDValue Value = M->getOperand(3);
9099     unsigned IndexOperand = M->getConstantOperandVal(7);
9100     unsigned WaveRelease = M->getConstantOperandVal(8);
9101     unsigned WaveDone = M->getConstantOperandVal(9);
9102 
9103     unsigned OrderedCountIndex = IndexOperand & 0x3f;
9104     IndexOperand &= ~0x3f;
9105     unsigned CountDw = 0;
9106 
9107     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9108       CountDw = (IndexOperand >> 24) & 0xf;
9109       IndexOperand &= ~(0xf << 24);
9110 
9111       if (CountDw < 1 || CountDw > 4) {
9112         report_fatal_error(
9113             "ds_ordered_count: dword count must be between 1 and 4");
9114       }
9115     }
9116 
9117     if (IndexOperand)
9118       report_fatal_error("ds_ordered_count: bad index operand");
9119 
9120     if (WaveDone && !WaveRelease)
9121       report_fatal_error("ds_ordered_count: wave_done requires wave_release");
9122 
9123     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9124     unsigned ShaderType =
9125         SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9126     unsigned Offset0 = OrderedCountIndex << 2;
9127     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9128 
9129     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9130       Offset1 |= (CountDw - 1) << 6;
9131 
9132     if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9133       Offset1 |= ShaderType << 2;
9134 
9135     unsigned Offset = Offset0 | (Offset1 << 8);
9136 
9137     SDValue Ops[] = {
9138         Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9139         copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9140     };
9141     return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9142                                    M->getVTList(), Ops, M->getMemoryVT(),
9143                                    M->getMemOperand());
9144   }
9145   case Intrinsic::amdgcn_raw_buffer_load:
9146   case Intrinsic::amdgcn_raw_ptr_buffer_load:
9147   case Intrinsic::amdgcn_raw_atomic_buffer_load:
9148   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9149   case Intrinsic::amdgcn_raw_buffer_load_format:
9150   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9151     const bool IsFormat =
9152         IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9153         IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9154 
9155     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9156     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9157     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9158     SDValue Ops[] = {
9159         Op.getOperand(0),                      // Chain
9160         Rsrc,                                  // rsrc
9161         DAG.getConstant(0, DL, MVT::i32),      // vindex
9162         VOffset,                               // voffset
9163         SOffset,                               // soffset
9164         Offset,                                // offset
9165         Op.getOperand(5),                      // cachepolicy, swizzled buffer
9166         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9167     };
9168 
9169     auto *M = cast<MemSDNode>(Op);
9170     return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9171   }
9172   case Intrinsic::amdgcn_struct_buffer_load:
9173   case Intrinsic::amdgcn_struct_ptr_buffer_load:
9174   case Intrinsic::amdgcn_struct_buffer_load_format:
9175   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9176   case Intrinsic::amdgcn_struct_atomic_buffer_load:
9177   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9178     const bool IsFormat =
9179         IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9180         IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9181 
9182     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9183     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9184     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9185     SDValue Ops[] = {
9186         Op.getOperand(0),                      // Chain
9187         Rsrc,                                  // rsrc
9188         Op.getOperand(3),                      // vindex
9189         VOffset,                               // voffset
9190         SOffset,                               // soffset
9191         Offset,                                // offset
9192         Op.getOperand(6),                      // cachepolicy, swizzled buffer
9193         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9194     };
9195 
9196     return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9197   }
9198   case Intrinsic::amdgcn_raw_tbuffer_load:
9199   case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9200     MemSDNode *M = cast<MemSDNode>(Op);
9201     EVT LoadVT = Op.getValueType();
9202     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9203     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9204     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9205 
9206     SDValue Ops[] = {
9207         Op.getOperand(0),                      // Chain
9208         Rsrc,                                  // rsrc
9209         DAG.getConstant(0, DL, MVT::i32),      // vindex
9210         VOffset,                               // voffset
9211         SOffset,                               // soffset
9212         Offset,                                // offset
9213         Op.getOperand(5),                      // format
9214         Op.getOperand(6),                      // cachepolicy, swizzled buffer
9215         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9216     };
9217 
9218     if (LoadVT.getScalarType() == MVT::f16)
9219       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9220                                  Ops);
9221     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9222                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9223                                DAG);
9224   }
9225   case Intrinsic::amdgcn_struct_tbuffer_load:
9226   case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9227     MemSDNode *M = cast<MemSDNode>(Op);
9228     EVT LoadVT = Op.getValueType();
9229     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9230     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9231     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9232 
9233     SDValue Ops[] = {
9234         Op.getOperand(0),                      // Chain
9235         Rsrc,                                  // rsrc
9236         Op.getOperand(3),                      // vindex
9237         VOffset,                               // voffset
9238         SOffset,                               // soffset
9239         Offset,                                // offset
9240         Op.getOperand(6),                      // format
9241         Op.getOperand(7),                      // cachepolicy, swizzled buffer
9242         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9243     };
9244 
9245     if (LoadVT.getScalarType() == MVT::f16)
9246       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9247                                  Ops);
9248     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9249                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9250                                DAG);
9251   }
9252   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9253   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9254     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9255   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9256   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9257     return lowerStructBufferAtomicIntrin(Op, DAG,
9258                                          AMDGPUISD::BUFFER_ATOMIC_FADD);
9259   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9260   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9261     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9262   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9263   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9264     return lowerStructBufferAtomicIntrin(Op, DAG,
9265                                          AMDGPUISD::BUFFER_ATOMIC_FMIN);
9266   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9267   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9268     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9269   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9270   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9271     return lowerStructBufferAtomicIntrin(Op, DAG,
9272                                          AMDGPUISD::BUFFER_ATOMIC_FMAX);
9273   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9274   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9275     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9276   case Intrinsic::amdgcn_raw_buffer_atomic_add:
9277   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9278     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9279   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9280   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9281     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9282   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9283   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9284     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9285   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9286   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9287     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9288   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9289   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9290     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9291   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9292   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9293     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9294   case Intrinsic::amdgcn_raw_buffer_atomic_and:
9295   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9296     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9297   case Intrinsic::amdgcn_raw_buffer_atomic_or:
9298   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9299     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9300   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9301   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9302     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9303   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9304   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9305     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9306   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9307   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9308     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9309   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9310     return lowerRawBufferAtomicIntrin(Op, DAG,
9311                                       AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9312   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9313   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9314     return lowerStructBufferAtomicIntrin(Op, DAG,
9315                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
9316   case Intrinsic::amdgcn_struct_buffer_atomic_add:
9317   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9318     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9319   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9320   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9321     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9322   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9323   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9324     return lowerStructBufferAtomicIntrin(Op, DAG,
9325                                          AMDGPUISD::BUFFER_ATOMIC_SMIN);
9326   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9327   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9328     return lowerStructBufferAtomicIntrin(Op, DAG,
9329                                          AMDGPUISD::BUFFER_ATOMIC_UMIN);
9330   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9331   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9332     return lowerStructBufferAtomicIntrin(Op, DAG,
9333                                          AMDGPUISD::BUFFER_ATOMIC_SMAX);
9334   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9335   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9336     return lowerStructBufferAtomicIntrin(Op, DAG,
9337                                          AMDGPUISD::BUFFER_ATOMIC_UMAX);
9338   case Intrinsic::amdgcn_struct_buffer_atomic_and:
9339   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9340     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9341   case Intrinsic::amdgcn_struct_buffer_atomic_or:
9342   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9343     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9344   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9345   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9346     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9347   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9348   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9349     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9350   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9351   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9352     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9353   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9354     return lowerStructBufferAtomicIntrin(Op, DAG,
9355                                          AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9356 
9357   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9358   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9359     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9360     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9361     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9362     SDValue Ops[] = {
9363         Op.getOperand(0),                      // Chain
9364         Op.getOperand(2),                      // src
9365         Op.getOperand(3),                      // cmp
9366         Rsrc,                                  // rsrc
9367         DAG.getConstant(0, DL, MVT::i32),      // vindex
9368         VOffset,                               // voffset
9369         SOffset,                               // soffset
9370         Offset,                                // offset
9371         Op.getOperand(7),                      // cachepolicy
9372         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9373     };
9374     EVT VT = Op.getValueType();
9375     auto *M = cast<MemSDNode>(Op);
9376 
9377     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9378                                    Op->getVTList(), Ops, VT,
9379                                    M->getMemOperand());
9380   }
9381   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9382   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9383     SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9384     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9385     auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9386     SDValue Ops[] = {
9387         Op.getOperand(0),                      // Chain
9388         Op.getOperand(2),                      // src
9389         Op.getOperand(3),                      // cmp
9390         Rsrc,                                  // rsrc
9391         Op.getOperand(5),                      // vindex
9392         VOffset,                               // voffset
9393         SOffset,                               // soffset
9394         Offset,                                // offset
9395         Op.getOperand(8),                      // cachepolicy
9396         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9397     };
9398     EVT VT = Op.getValueType();
9399     auto *M = cast<MemSDNode>(Op);
9400 
9401     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9402                                    Op->getVTList(), Ops, VT,
9403                                    M->getMemOperand());
9404   }
9405   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9406     MemSDNode *M = cast<MemSDNode>(Op);
9407     SDValue NodePtr = M->getOperand(2);
9408     SDValue RayExtent = M->getOperand(3);
9409     SDValue RayOrigin = M->getOperand(4);
9410     SDValue RayDir = M->getOperand(5);
9411     SDValue RayInvDir = M->getOperand(6);
9412     SDValue TDescr = M->getOperand(7);
9413 
9414     assert(NodePtr.getValueType() == MVT::i32 ||
9415            NodePtr.getValueType() == MVT::i64);
9416     assert(RayDir.getValueType() == MVT::v3f16 ||
9417            RayDir.getValueType() == MVT::v3f32);
9418 
9419     if (!Subtarget->hasGFX10_AEncoding()) {
9420       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9421       return SDValue();
9422     }
9423 
9424     const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9425     const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9426     const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9427     const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9428     const bool Is64 = NodePtr.getValueType() == MVT::i64;
9429     const unsigned NumVDataDwords = 4;
9430     const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9431     const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9432     const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9433                          NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9434                         IsGFX12Plus;
9435     const unsigned BaseOpcodes[2][2] = {
9436         {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9437         {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9438          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9439     int Opcode;
9440     if (UseNSA) {
9441       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9442                                      IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9443                                      : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
9444                                                  : AMDGPU::MIMGEncGfx10NSA,
9445                                      NumVDataDwords, NumVAddrDwords);
9446     } else {
9447       assert(!IsGFX12Plus);
9448       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9449                                      IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9450                                              : AMDGPU::MIMGEncGfx10Default,
9451                                      NumVDataDwords, NumVAddrDwords);
9452     }
9453     assert(Opcode != -1);
9454 
9455     SmallVector<SDValue, 16> Ops;
9456 
9457     auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9458       SmallVector<SDValue, 3> Lanes;
9459       DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9460       if (Lanes[0].getValueSizeInBits() == 32) {
9461         for (unsigned I = 0; I < 3; ++I)
9462           Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9463       } else {
9464         if (IsAligned) {
9465           Ops.push_back(DAG.getBitcast(
9466               MVT::i32,
9467               DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9468           Ops.push_back(Lanes[2]);
9469         } else {
9470           SDValue Elt0 = Ops.pop_back_val();
9471           Ops.push_back(DAG.getBitcast(
9472               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9473           Ops.push_back(DAG.getBitcast(
9474               MVT::i32,
9475               DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9476         }
9477       }
9478     };
9479 
9480     if (UseNSA && IsGFX11Plus) {
9481       Ops.push_back(NodePtr);
9482       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9483       Ops.push_back(RayOrigin);
9484       if (IsA16) {
9485         SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9486         DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9487         DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9488         for (unsigned I = 0; I < 3; ++I) {
9489           MergedLanes.push_back(DAG.getBitcast(
9490               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9491                                            {DirLanes[I], InvDirLanes[I]})));
9492         }
9493         Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9494       } else {
9495         Ops.push_back(RayDir);
9496         Ops.push_back(RayInvDir);
9497       }
9498     } else {
9499       if (Is64)
9500         DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9501                                   2);
9502       else
9503         Ops.push_back(NodePtr);
9504 
9505       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9506       packLanes(RayOrigin, true);
9507       packLanes(RayDir, true);
9508       packLanes(RayInvDir, false);
9509     }
9510 
9511     if (!UseNSA) {
9512       // Build a single vector containing all the operands so far prepared.
9513       if (NumVAddrDwords > 12) {
9514         SDValue Undef = DAG.getUNDEF(MVT::i32);
9515         Ops.append(16 - Ops.size(), Undef);
9516       }
9517       assert(Ops.size() >= 8 && Ops.size() <= 12);
9518       SDValue MergedOps =
9519           DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9520       Ops.clear();
9521       Ops.push_back(MergedOps);
9522     }
9523 
9524     Ops.push_back(TDescr);
9525     Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9526     Ops.push_back(M->getChain());
9527 
9528     auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9529     MachineMemOperand *MemRef = M->getMemOperand();
9530     DAG.setNodeMemRefs(NewNode, {MemRef});
9531     return SDValue(NewNode, 0);
9532   }
9533   case Intrinsic::amdgcn_global_atomic_fmin_num:
9534   case Intrinsic::amdgcn_global_atomic_fmax_num:
9535   case Intrinsic::amdgcn_flat_atomic_fmin_num:
9536   case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9537     MemSDNode *M = cast<MemSDNode>(Op);
9538     SDValue Ops[] = {
9539         M->getOperand(0), // Chain
9540         M->getOperand(2), // Ptr
9541         M->getOperand(3)  // Value
9542     };
9543     unsigned Opcode = 0;
9544     switch (IntrID) {
9545     case Intrinsic::amdgcn_global_atomic_fmin_num:
9546     case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9547       Opcode = ISD::ATOMIC_LOAD_FMIN;
9548       break;
9549     }
9550     case Intrinsic::amdgcn_global_atomic_fmax_num:
9551     case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9552       Opcode = ISD::ATOMIC_LOAD_FMAX;
9553       break;
9554     }
9555     default:
9556       llvm_unreachable("unhandled atomic opcode");
9557     }
9558     return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9559                          Ops, M->getMemOperand());
9560   }
9561   case Intrinsic::amdgcn_s_get_barrier_state:
9562   case Intrinsic::amdgcn_s_get_named_barrier_state: {
9563     SDValue Chain = Op->getOperand(0);
9564     SmallVector<SDValue, 2> Ops;
9565     unsigned Opc;
9566 
9567     if (isa<ConstantSDNode>(Op->getOperand(2))) {
9568       uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9569       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9570         BarID = (BarID >> 4) & 0x3F;
9571       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9572       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9573       Ops.push_back(K);
9574       Ops.push_back(Chain);
9575     } else {
9576       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9577       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9578         SDValue M0Val;
9579         M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9580                             DAG.getShiftAmountConstant(4, MVT::i32, DL));
9581         M0Val = SDValue(
9582             DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9583                                DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9584             0);
9585         Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9586       } else
9587         Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9588     }
9589 
9590     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9591     return SDValue(NewMI, 0);
9592   }
9593   default:
9594 
9595     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9596             AMDGPU::getImageDimIntrinsicInfo(IntrID))
9597       return lowerImage(Op, ImageDimIntr, DAG, true);
9598 
9599     return SDValue();
9600   }
9601 }
9602 
9603 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9604 // dwordx4 if on SI and handle TFE loads.
9605 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9606                                               SDVTList VTList,
9607                                               ArrayRef<SDValue> Ops, EVT MemVT,
9608                                               MachineMemOperand *MMO,
9609                                               SelectionDAG &DAG) const {
9610   LLVMContext &C = *DAG.getContext();
9611   MachineFunction &MF = DAG.getMachineFunction();
9612   EVT VT = VTList.VTs[0];
9613 
9614   assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9615   bool IsTFE = VTList.NumVTs == 3;
9616   if (IsTFE) {
9617     unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9618     unsigned NumOpDWords = NumValueDWords + 1;
9619     EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9620     SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9621     MachineMemOperand *OpDWordsMMO =
9622         MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9623     SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9624                                      OpDWordsVT, OpDWordsMMO, DAG);
9625     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9626                                  DAG.getVectorIdxConstant(NumValueDWords, DL));
9627     SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9628     SDValue ValueDWords =
9629         NumValueDWords == 1
9630             ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9631             : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9632                           EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9633                           ZeroIdx);
9634     SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9635     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9636   }
9637 
9638   if (!Subtarget->hasDwordx3LoadStores() &&
9639       (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9640     EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9641     EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9642     MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9643     SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9644     SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9645                                          WidenedMemVT, WidenedMMO);
9646     SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9647                                 DAG.getVectorIdxConstant(0, DL));
9648     return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9649   }
9650 
9651   return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9652 }
9653 
9654 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9655                                          bool ImageStore) const {
9656   EVT StoreVT = VData.getValueType();
9657 
9658   // No change for f16 and legal vector D16 types.
9659   if (!StoreVT.isVector())
9660     return VData;
9661 
9662   SDLoc DL(VData);
9663   unsigned NumElements = StoreVT.getVectorNumElements();
9664 
9665   if (Subtarget->hasUnpackedD16VMem()) {
9666     // We need to unpack the packed data to store.
9667     EVT IntStoreVT = StoreVT.changeTypeToInteger();
9668     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9669 
9670     EVT EquivStoreVT =
9671         EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9672     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9673     return DAG.UnrollVectorOp(ZExt.getNode());
9674   }
9675 
9676   // The sq block of gfx8.1 does not estimate register use correctly for d16
9677   // image store instructions. The data operand is computed as if it were not a
9678   // d16 image instruction.
9679   if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9680     // Bitcast to i16
9681     EVT IntStoreVT = StoreVT.changeTypeToInteger();
9682     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9683 
9684     // Decompose into scalars
9685     SmallVector<SDValue, 4> Elts;
9686     DAG.ExtractVectorElements(IntVData, Elts);
9687 
9688     // Group pairs of i16 into v2i16 and bitcast to i32
9689     SmallVector<SDValue, 4> PackedElts;
9690     for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9691       SDValue Pair =
9692           DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9693       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9694       PackedElts.push_back(IntPair);
9695     }
9696     if ((NumElements % 2) == 1) {
9697       // Handle v3i16
9698       unsigned I = Elts.size() / 2;
9699       SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9700                                         {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9701       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9702       PackedElts.push_back(IntPair);
9703     }
9704 
9705     // Pad using UNDEF
9706     PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9707 
9708     // Build final vector
9709     EVT VecVT =
9710         EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9711     return DAG.getBuildVector(VecVT, DL, PackedElts);
9712   }
9713 
9714   if (NumElements == 3) {
9715     EVT IntStoreVT =
9716         EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9717     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9718 
9719     EVT WidenedStoreVT = EVT::getVectorVT(
9720         *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9721     EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9722                                          WidenedStoreVT.getStoreSizeInBits());
9723     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9724     return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9725   }
9726 
9727   assert(isTypeLegal(StoreVT));
9728   return VData;
9729 }
9730 
9731 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9732                                               SelectionDAG &DAG) const {
9733   SDLoc DL(Op);
9734   SDValue Chain = Op.getOperand(0);
9735   unsigned IntrinsicID = Op.getConstantOperandVal(1);
9736   MachineFunction &MF = DAG.getMachineFunction();
9737 
9738   switch (IntrinsicID) {
9739   case Intrinsic::amdgcn_exp_compr: {
9740     if (!Subtarget->hasCompressedExport()) {
9741       DiagnosticInfoUnsupported BadIntrin(
9742           DAG.getMachineFunction().getFunction(),
9743           "intrinsic not supported on subtarget", DL.getDebugLoc());
9744       DAG.getContext()->diagnose(BadIntrin);
9745     }
9746     SDValue Src0 = Op.getOperand(4);
9747     SDValue Src1 = Op.getOperand(5);
9748     // Hack around illegal type on SI by directly selecting it.
9749     if (isTypeLegal(Src0.getValueType()))
9750       return SDValue();
9751 
9752     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9753     SDValue Undef = DAG.getUNDEF(MVT::f32);
9754     const SDValue Ops[] = {
9755         Op.getOperand(2),                              // tgt
9756         DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9757         DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9758         Undef,                                         // src2
9759         Undef,                                         // src3
9760         Op.getOperand(7),                              // vm
9761         DAG.getTargetConstant(1, DL, MVT::i1),         // compr
9762         Op.getOperand(3),                              // en
9763         Op.getOperand(0)                               // Chain
9764     };
9765 
9766     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9767     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9768   }
9769   case Intrinsic::amdgcn_s_barrier:
9770   case Intrinsic::amdgcn_s_barrier_signal:
9771   case Intrinsic::amdgcn_s_barrier_wait: {
9772     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9773     if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9774       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9775       if (WGSize <= ST.getWavefrontSize()) {
9776         // If the workgroup fits in a wave, remove s_barrier_signal and lower
9777         // s_barrier/s_barrier_wait to wave_barrier.
9778         if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9779           return Op.getOperand(0);
9780         else
9781           return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9782                                             MVT::Other, Op.getOperand(0)),
9783                          0);
9784       }
9785     }
9786 
9787     if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9788       // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9789       SDValue K =
9790           DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9791       SDValue BarSignal =
9792           SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9793                                      MVT::Other, K, Op.getOperand(0)),
9794                   0);
9795       SDValue BarWait =
9796           SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9797                                      BarSignal.getValue(0)),
9798                   0);
9799       return BarWait;
9800     }
9801 
9802     return SDValue();
9803   };
9804 
9805   case Intrinsic::amdgcn_struct_tbuffer_store:
9806   case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9807     SDValue VData = Op.getOperand(2);
9808     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9809     if (IsD16)
9810       VData = handleD16VData(VData, DAG);
9811     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9812     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9813     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9814     SDValue Ops[] = {
9815         Chain,
9816         VData,                                 // vdata
9817         Rsrc,                                  // rsrc
9818         Op.getOperand(4),                      // vindex
9819         VOffset,                               // voffset
9820         SOffset,                               // soffset
9821         Offset,                                // offset
9822         Op.getOperand(7),                      // format
9823         Op.getOperand(8),                      // cachepolicy, swizzled buffer
9824         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9825     };
9826     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9827                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
9828     MemSDNode *M = cast<MemSDNode>(Op);
9829     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9830                                    M->getMemoryVT(), M->getMemOperand());
9831   }
9832 
9833   case Intrinsic::amdgcn_raw_tbuffer_store:
9834   case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9835     SDValue VData = Op.getOperand(2);
9836     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9837     if (IsD16)
9838       VData = handleD16VData(VData, DAG);
9839     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9840     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9841     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9842     SDValue Ops[] = {
9843         Chain,
9844         VData,                                 // vdata
9845         Rsrc,                                  // rsrc
9846         DAG.getConstant(0, DL, MVT::i32),      // vindex
9847         VOffset,                               // voffset
9848         SOffset,                               // soffset
9849         Offset,                                // offset
9850         Op.getOperand(6),                      // format
9851         Op.getOperand(7),                      // cachepolicy, swizzled buffer
9852         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9853     };
9854     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
9855                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
9856     MemSDNode *M = cast<MemSDNode>(Op);
9857     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9858                                    M->getMemoryVT(), M->getMemOperand());
9859   }
9860 
9861   case Intrinsic::amdgcn_raw_buffer_store:
9862   case Intrinsic::amdgcn_raw_ptr_buffer_store:
9863   case Intrinsic::amdgcn_raw_buffer_store_format:
9864   case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9865     const bool IsFormat =
9866         IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9867         IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9868 
9869     SDValue VData = Op.getOperand(2);
9870     EVT VDataVT = VData.getValueType();
9871     EVT EltType = VDataVT.getScalarType();
9872     bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9873     if (IsD16) {
9874       VData = handleD16VData(VData, DAG);
9875       VDataVT = VData.getValueType();
9876     }
9877 
9878     if (!isTypeLegal(VDataVT)) {
9879       VData =
9880           DAG.getNode(ISD::BITCAST, DL,
9881                       getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9882     }
9883 
9884     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9885     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9886     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9887     SDValue Ops[] = {
9888         Chain,
9889         VData,
9890         Rsrc,
9891         DAG.getConstant(0, DL, MVT::i32),      // vindex
9892         VOffset,                               // voffset
9893         SOffset,                               // soffset
9894         Offset,                                // offset
9895         Op.getOperand(6),                      // cachepolicy, swizzled buffer
9896         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9897     };
9898     unsigned Opc =
9899         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9900     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9901     MemSDNode *M = cast<MemSDNode>(Op);
9902 
9903     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9904     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9905       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9906 
9907     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9908                                    M->getMemoryVT(), M->getMemOperand());
9909   }
9910 
9911   case Intrinsic::amdgcn_struct_buffer_store:
9912   case Intrinsic::amdgcn_struct_ptr_buffer_store:
9913   case Intrinsic::amdgcn_struct_buffer_store_format:
9914   case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9915     const bool IsFormat =
9916         IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9917         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9918 
9919     SDValue VData = Op.getOperand(2);
9920     EVT VDataVT = VData.getValueType();
9921     EVT EltType = VDataVT.getScalarType();
9922     bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9923 
9924     if (IsD16) {
9925       VData = handleD16VData(VData, DAG);
9926       VDataVT = VData.getValueType();
9927     }
9928 
9929     if (!isTypeLegal(VDataVT)) {
9930       VData =
9931           DAG.getNode(ISD::BITCAST, DL,
9932                       getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9933     }
9934 
9935     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9936     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9937     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9938     SDValue Ops[] = {
9939         Chain,
9940         VData,
9941         Rsrc,
9942         Op.getOperand(4),                      // vindex
9943         VOffset,                               // voffset
9944         SOffset,                               // soffset
9945         Offset,                                // offset
9946         Op.getOperand(7),                      // cachepolicy, swizzled buffer
9947         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9948     };
9949     unsigned Opc =
9950         !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9951     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9952     MemSDNode *M = cast<MemSDNode>(Op);
9953 
9954     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9955     EVT VDataType = VData.getValueType().getScalarType();
9956     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9957       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9958 
9959     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9960                                    M->getMemoryVT(), M->getMemOperand());
9961   }
9962   case Intrinsic::amdgcn_raw_buffer_load_lds:
9963   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9964   case Intrinsic::amdgcn_struct_buffer_load_lds:
9965   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9966     assert(!AMDGPU::isGFX12Plus(*Subtarget));
9967     unsigned Opc;
9968     bool HasVIndex =
9969         IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9970         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9971     unsigned OpOffset = HasVIndex ? 1 : 0;
9972     SDValue VOffset = Op.getOperand(5 + OpOffset);
9973     bool HasVOffset = !isNullConstant(VOffset);
9974     unsigned Size = Op->getConstantOperandVal(4);
9975 
9976     switch (Size) {
9977     default:
9978       return SDValue();
9979     case 1:
9980       Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9981                                       : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9982             : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9983                          : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9984       break;
9985     case 2:
9986       Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9987                                       : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9988             : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9989                          : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9990       break;
9991     case 4:
9992       Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9993                                       : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9994             : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9995                          : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9996       break;
9997     case 12:
9998       if (!Subtarget->hasLDSLoadB96_B128())
9999         return SDValue();
10000       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10001                                    : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10002                       : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10003                                    : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10004       break;
10005     case 16:
10006       if (!Subtarget->hasLDSLoadB96_B128())
10007         return SDValue();
10008       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10009                                    : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10010                       : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10011                                    : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10012       break;
10013     }
10014 
10015     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10016 
10017     SmallVector<SDValue, 8> Ops;
10018 
10019     if (HasVIndex && HasVOffset)
10020       Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10021                                        {Op.getOperand(5), // VIndex
10022                                         VOffset}));
10023     else if (HasVIndex)
10024       Ops.push_back(Op.getOperand(5));
10025     else if (HasVOffset)
10026       Ops.push_back(VOffset);
10027 
10028     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10029     Ops.push_back(Rsrc);
10030     Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10031     Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10032     bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10033     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10034     Ops.push_back(DAG.getTargetConstant(
10035         Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10036         DL, MVT::i8)); // cpol
10037     Ops.push_back(DAG.getTargetConstant(
10038         Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10039             ? 1
10040             : 0,
10041         DL, MVT::i8));                                           // swz
10042     Ops.push_back(M0Val.getValue(0));                            // Chain
10043     Ops.push_back(M0Val.getValue(1));                            // Glue
10044 
10045     auto *M = cast<MemSDNode>(Op);
10046     MachineMemOperand *LoadMMO = M->getMemOperand();
10047     // Don't set the offset value here because the pointer points to the base of
10048     // the buffer.
10049     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10050 
10051     MachinePointerInfo StorePtrI = LoadPtrI;
10052     LoadPtrI.V = PoisonValue::get(
10053         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10054     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10055     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10056 
10057     auto F = LoadMMO->getFlags() &
10058              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10059     LoadMMO =
10060         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10061                                 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10062 
10063     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10064         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10065         LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10066 
10067     auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10068     DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10069 
10070     return SDValue(Load, 0);
10071   }
10072   case Intrinsic::amdgcn_global_load_lds: {
10073     unsigned Opc;
10074     unsigned Size = Op->getConstantOperandVal(4);
10075     switch (Size) {
10076     default:
10077       return SDValue();
10078     case 1:
10079       Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10080       break;
10081     case 2:
10082       Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10083       break;
10084     case 4:
10085       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10086       break;
10087     case 12:
10088       if (!Subtarget->hasLDSLoadB96_B128())
10089         return SDValue();
10090       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10091       break;
10092     case 16:
10093       if (!Subtarget->hasLDSLoadB96_B128())
10094         return SDValue();
10095       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10096       break;
10097     }
10098 
10099     auto *M = cast<MemSDNode>(Op);
10100     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10101 
10102     SmallVector<SDValue, 6> Ops;
10103 
10104     SDValue Addr = Op.getOperand(2); // Global ptr
10105     SDValue VOffset;
10106     // Try to split SAddr and VOffset. Global and LDS pointers share the same
10107     // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10108     if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10109       SDValue LHS = Addr.getOperand(0);
10110       SDValue RHS = Addr.getOperand(1);
10111 
10112       if (LHS->isDivergent())
10113         std::swap(LHS, RHS);
10114 
10115       if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10116           RHS.getOperand(0).getValueType() == MVT::i32) {
10117         // add (i64 sgpr), (zero_extend (i32 vgpr))
10118         Addr = LHS;
10119         VOffset = RHS.getOperand(0);
10120       }
10121     }
10122 
10123     Ops.push_back(Addr);
10124     if (!Addr->isDivergent()) {
10125       Opc = AMDGPU::getGlobalSaddrOp(Opc);
10126       if (!VOffset)
10127         VOffset =
10128             SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10129                                        DAG.getTargetConstant(0, DL, MVT::i32)),
10130                     0);
10131       Ops.push_back(VOffset);
10132     }
10133 
10134     Ops.push_back(Op.getOperand(5));  // Offset
10135     Ops.push_back(Op.getOperand(6));  // CPol
10136     Ops.push_back(M0Val.getValue(0)); // Chain
10137     Ops.push_back(M0Val.getValue(1)); // Glue
10138 
10139     MachineMemOperand *LoadMMO = M->getMemOperand();
10140     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10141     LoadPtrI.Offset = Op->getConstantOperandVal(5);
10142     MachinePointerInfo StorePtrI = LoadPtrI;
10143     LoadPtrI.V = PoisonValue::get(
10144         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10145     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10146     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10147     auto F = LoadMMO->getFlags() &
10148              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10149     LoadMMO =
10150         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10151                                 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10152     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10153         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10154         LoadMMO->getAAInfo());
10155 
10156     auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10157     DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10158 
10159     return SDValue(Load, 0);
10160   }
10161   case Intrinsic::amdgcn_end_cf:
10162     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10163                                       Op->getOperand(2), Chain),
10164                    0);
10165   case Intrinsic::amdgcn_s_barrier_init:
10166   case Intrinsic::amdgcn_s_barrier_signal_var: {
10167     // these two intrinsics have two operands: barrier pointer and member count
10168     SDValue Chain = Op->getOperand(0);
10169     SmallVector<SDValue, 2> Ops;
10170     SDValue BarOp = Op->getOperand(2);
10171     SDValue CntOp = Op->getOperand(3);
10172     SDValue M0Val;
10173     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10174                        ? AMDGPU::S_BARRIER_INIT_M0
10175                        : AMDGPU::S_BARRIER_SIGNAL_M0;
10176     // extract the BarrierID from bits 4-9 of BarOp
10177     SDValue BarID;
10178     BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10179                         DAG.getShiftAmountConstant(4, MVT::i32, DL));
10180     BarID =
10181         SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10182                                    DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10183                 0);
10184     // Member count should be put into M0[ShAmt:+6]
10185     // Barrier ID should be put into M0[5:0]
10186     M0Val =
10187         SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10188                                    DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10189                 0);
10190     constexpr unsigned ShAmt = 16;
10191     M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10192                         DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10193 
10194     M0Val = SDValue(
10195         DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10196 
10197     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10198 
10199     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10200     return SDValue(NewMI, 0);
10201   }
10202   case Intrinsic::amdgcn_s_barrier_join: {
10203     // these three intrinsics have one operand: barrier pointer
10204     SDValue Chain = Op->getOperand(0);
10205     SmallVector<SDValue, 2> Ops;
10206     SDValue BarOp = Op->getOperand(2);
10207     unsigned Opc;
10208 
10209     if (isa<ConstantSDNode>(BarOp)) {
10210       uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10211       Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10212 
10213       // extract the BarrierID from bits 4-9 of the immediate
10214       unsigned BarID = (BarVal >> 4) & 0x3F;
10215       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10216       Ops.push_back(K);
10217       Ops.push_back(Chain);
10218     } else {
10219       Opc = AMDGPU::S_BARRIER_JOIN_M0;
10220 
10221       // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10222       SDValue M0Val;
10223       M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10224                           DAG.getShiftAmountConstant(4, MVT::i32, DL));
10225       M0Val =
10226           SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10227                                      DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10228                   0);
10229       Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10230     }
10231 
10232     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10233     return SDValue(NewMI, 0);
10234   }
10235   case Intrinsic::amdgcn_s_prefetch_data: {
10236     // For non-global address space preserve the chain and remove the call.
10237     if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10238       return Op.getOperand(0);
10239     return Op;
10240   }
10241   case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10242     SDValue Ops[] = {
10243         Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10244         Op.getOperand(3), // offset
10245         Op.getOperand(4), // length
10246     };
10247 
10248     MemSDNode *M = cast<MemSDNode>(Op);
10249     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10250                                    Op->getVTList(), Ops, M->getMemoryVT(),
10251                                    M->getMemOperand());
10252   }
10253   default: {
10254     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10255             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10256       return lowerImage(Op, ImageDimIntr, DAG, true);
10257 
10258     return Op;
10259   }
10260   }
10261 }
10262 
10263 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10264 // offset (the offset that is included in bounds checking and swizzling, to be
10265 // split between the instruction's voffset and immoffset fields) and soffset
10266 // (the offset that is excluded from bounds checking and swizzling, to go in
10267 // the instruction's soffset field).  This function takes the first kind of
10268 // offset and figures out how to split it between voffset and immoffset.
10269 std::pair<SDValue, SDValue>
10270 SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10271   SDLoc DL(Offset);
10272   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10273   SDValue N0 = Offset;
10274   ConstantSDNode *C1 = nullptr;
10275 
10276   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10277     N0 = SDValue();
10278   else if (DAG.isBaseWithConstantOffset(N0)) {
10279     C1 = cast<ConstantSDNode>(N0.getOperand(1));
10280     N0 = N0.getOperand(0);
10281   }
10282 
10283   if (C1) {
10284     unsigned ImmOffset = C1->getZExtValue();
10285     // If the immediate value is too big for the immoffset field, put only bits
10286     // that would normally fit in the immoffset field. The remaining value that
10287     // is copied/added for the voffset field is a large power of 2, and it
10288     // stands more chance of being CSEd with the copy/add for another similar
10289     // load/store.
10290     // However, do not do that rounding down if that is a negative
10291     // number, as it appears to be illegal to have a negative offset in the
10292     // vgpr, even if adding the immediate offset makes it positive.
10293     unsigned Overflow = ImmOffset & ~MaxImm;
10294     ImmOffset -= Overflow;
10295     if ((int32_t)Overflow < 0) {
10296       Overflow += ImmOffset;
10297       ImmOffset = 0;
10298     }
10299     C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10300     if (Overflow) {
10301       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10302       if (!N0)
10303         N0 = OverflowVal;
10304       else {
10305         SDValue Ops[] = {N0, OverflowVal};
10306         N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10307       }
10308     }
10309   }
10310   if (!N0)
10311     N0 = DAG.getConstant(0, DL, MVT::i32);
10312   if (!C1)
10313     C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10314   return {N0, SDValue(C1, 0)};
10315 }
10316 
10317 // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10318 // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10319 // pointed to by Offsets.
10320 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10321                                         SelectionDAG &DAG, SDValue *Offsets,
10322                                         Align Alignment) const {
10323   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10324   SDLoc DL(CombinedOffset);
10325   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10326     uint32_t Imm = C->getZExtValue();
10327     uint32_t SOffset, ImmOffset;
10328     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10329       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10330       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10331       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10332       return;
10333     }
10334   }
10335   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10336     SDValue N0 = CombinedOffset.getOperand(0);
10337     SDValue N1 = CombinedOffset.getOperand(1);
10338     uint32_t SOffset, ImmOffset;
10339     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10340     if (Offset >= 0 &&
10341         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10342       Offsets[0] = N0;
10343       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10344       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10345       return;
10346     }
10347   }
10348 
10349   SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10350                             ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10351                             : DAG.getConstant(0, DL, MVT::i32);
10352 
10353   Offsets[0] = CombinedOffset;
10354   Offsets[1] = SOffsetZero;
10355   Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10356 }
10357 
10358 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10359                                                 SelectionDAG &DAG) const {
10360   if (!MaybePointer.getValueType().isScalarInteger())
10361     return MaybePointer;
10362 
10363   SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10364   return Rsrc;
10365 }
10366 
10367 // Wrap a global or flat pointer into a buffer intrinsic using the flags
10368 // specified in the intrinsic.
10369 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10370                                                    SelectionDAG &DAG) const {
10371   SDLoc Loc(Op);
10372 
10373   SDValue Pointer = Op->getOperand(1);
10374   SDValue Stride = Op->getOperand(2);
10375   SDValue NumRecords = Op->getOperand(3);
10376   SDValue Flags = Op->getOperand(4);
10377 
10378   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10379   SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10380   SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10381   std::optional<uint32_t> ConstStride = std::nullopt;
10382   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10383     ConstStride = ConstNode->getZExtValue();
10384 
10385   SDValue NewHighHalf = Masked;
10386   if (!ConstStride || *ConstStride != 0) {
10387     SDValue ShiftedStride;
10388     if (ConstStride) {
10389       ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10390     } else {
10391       SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10392       ShiftedStride =
10393           DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10394                       DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10395     }
10396     NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10397   }
10398 
10399   SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10400                              NewHighHalf, NumRecords, Flags);
10401   SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10402   return RsrcPtr;
10403 }
10404 
10405 // Handle 8 bit and 16 bit buffer loads
10406 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10407                                                      EVT LoadVT, SDLoc DL,
10408                                                      ArrayRef<SDValue> Ops,
10409                                                      MachineMemOperand *MMO,
10410                                                      bool IsTFE) const {
10411   EVT IntVT = LoadVT.changeTypeToInteger();
10412 
10413   if (IsTFE) {
10414     unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10415                        ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10416                        : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10417     MachineFunction &MF = DAG.getMachineFunction();
10418     MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10419     SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10420     SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10421     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10422                                  DAG.getConstant(1, DL, MVT::i32));
10423     SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10424                                DAG.getConstant(0, DL, MVT::i32));
10425     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10426     SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10427     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10428   }
10429 
10430   unsigned Opc = LoadVT.getScalarType() == MVT::i8
10431                      ? AMDGPUISD::BUFFER_LOAD_UBYTE
10432                      : AMDGPUISD::BUFFER_LOAD_USHORT;
10433 
10434   SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10435   SDValue BufferLoad =
10436       DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10437   SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10438   LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10439 
10440   return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10441 }
10442 
10443 // Handle 8 bit and 16 bit buffer stores
10444 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10445                                                       EVT VDataType, SDLoc DL,
10446                                                       SDValue Ops[],
10447                                                       MemSDNode *M) const {
10448   if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10449     Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10450 
10451   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10452   Ops[1] = BufferStoreExt;
10453   unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10454                                         : AMDGPUISD::BUFFER_STORE_SHORT;
10455   ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10456   return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10457                                  M->getMemOperand());
10458 }
10459 
10460 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10461                                  SDValue Op, const SDLoc &SL, EVT VT) {
10462   if (VT.bitsLT(Op.getValueType()))
10463     return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10464 
10465   switch (ExtType) {
10466   case ISD::SEXTLOAD:
10467     return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10468   case ISD::ZEXTLOAD:
10469     return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10470   case ISD::EXTLOAD:
10471     return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10472   case ISD::NON_EXTLOAD:
10473     return Op;
10474   }
10475 
10476   llvm_unreachable("invalid ext type");
10477 }
10478 
10479 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10480 // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10481 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10482                                     DAGCombinerInfo &DCI) const {
10483   SelectionDAG &DAG = DCI.DAG;
10484   if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10485     return SDValue();
10486 
10487   // FIXME: Constant loads should all be marked invariant.
10488   unsigned AS = Ld->getAddressSpace();
10489   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10490       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10491       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10492     return SDValue();
10493 
10494   // Don't do this early, since it may interfere with adjacent load merging for
10495   // illegal types. We can avoid losing alignment information for exotic types
10496   // pre-legalize.
10497   EVT MemVT = Ld->getMemoryVT();
10498   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10499       MemVT.getSizeInBits() >= 32)
10500     return SDValue();
10501 
10502   SDLoc SL(Ld);
10503 
10504   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10505          "unexpected vector extload");
10506 
10507   // TODO: Drop only high part of range.
10508   SDValue Ptr = Ld->getBasePtr();
10509   SDValue NewLoad = DAG.getLoad(
10510       ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10511       Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10512       Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10513       nullptr); // Drop ranges
10514 
10515   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10516   if (MemVT.isFloatingPoint()) {
10517     assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10518            "unexpected fp extload");
10519     TruncVT = MemVT.changeTypeToInteger();
10520   }
10521 
10522   SDValue Cvt = NewLoad;
10523   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10524     Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10525                       DAG.getValueType(TruncVT));
10526   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10527              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10528     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10529   } else {
10530     assert(Ld->getExtensionType() == ISD::EXTLOAD);
10531   }
10532 
10533   EVT VT = Ld->getValueType(0);
10534   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10535 
10536   DCI.AddToWorklist(Cvt.getNode());
10537 
10538   // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10539   // the appropriate extension from the 32-bit load.
10540   Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10541   DCI.AddToWorklist(Cvt.getNode());
10542 
10543   // Handle conversion back to floating point if necessary.
10544   Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10545 
10546   return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10547 }
10548 
10549 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10550                                           const SIMachineFunctionInfo &Info) {
10551   // TODO: Should check if the address can definitely not access stack.
10552   if (Info.isEntryFunction())
10553     return Info.getUserSGPRInfo().hasFlatScratchInit();
10554   return true;
10555 }
10556 
10557 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10558   SDLoc DL(Op);
10559   LoadSDNode *Load = cast<LoadSDNode>(Op);
10560   ISD::LoadExtType ExtType = Load->getExtensionType();
10561   EVT MemVT = Load->getMemoryVT();
10562   MachineMemOperand *MMO = Load->getMemOperand();
10563 
10564   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10565     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10566       return SDValue();
10567 
10568     // FIXME: Copied from PPC
10569     // First, load into 32 bits, then truncate to 1 bit.
10570 
10571     SDValue Chain = Load->getChain();
10572     SDValue BasePtr = Load->getBasePtr();
10573 
10574     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10575 
10576     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10577                                    RealMemVT, MMO);
10578 
10579     if (!MemVT.isVector()) {
10580       SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10581                        NewLD.getValue(1)};
10582 
10583       return DAG.getMergeValues(Ops, DL);
10584     }
10585 
10586     SmallVector<SDValue, 3> Elts;
10587     for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10588       SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10589                                 DAG.getConstant(I, DL, MVT::i32));
10590 
10591       Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10592     }
10593 
10594     SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10595 
10596     return DAG.getMergeValues(Ops, DL);
10597   }
10598 
10599   if (!MemVT.isVector())
10600     return SDValue();
10601 
10602   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10603          "Custom lowering for non-i32 vectors hasn't been implemented.");
10604 
10605   Align Alignment = Load->getAlign();
10606   unsigned AS = Load->getAddressSpace();
10607   if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10608       Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10609     return SplitVectorLoad(Op, DAG);
10610   }
10611 
10612   MachineFunction &MF = DAG.getMachineFunction();
10613   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10614   // If there is a possibility that flat instruction access scratch memory
10615   // then we need to use the same legalization rules we use for private.
10616   if (AS == AMDGPUAS::FLAT_ADDRESS &&
10617       !Subtarget->hasMultiDwordFlatScratchAddressing())
10618     AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10619              ? AMDGPUAS::PRIVATE_ADDRESS
10620              : AMDGPUAS::GLOBAL_ADDRESS;
10621 
10622   unsigned NumElements = MemVT.getVectorNumElements();
10623 
10624   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10625       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10626       (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10627        Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10628        isMemOpHasNoClobberedMemOperand(Load))) {
10629     if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10630         Alignment >= Align(4) && NumElements < 32) {
10631       if (MemVT.isPow2VectorType() ||
10632           (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10633         return SDValue();
10634       return WidenOrSplitVectorLoad(Op, DAG);
10635     }
10636     // Non-uniform loads will be selected to MUBUF instructions, so they
10637     // have the same legalization requirements as global and private
10638     // loads.
10639     //
10640   }
10641   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10642       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10643       AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
10644     if (NumElements > 4)
10645       return SplitVectorLoad(Op, DAG);
10646     // v3 loads not supported on SI.
10647     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10648       return WidenOrSplitVectorLoad(Op, DAG);
10649 
10650     // v3 and v4 loads are supported for private and global memory.
10651     return SDValue();
10652   }
10653   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10654     // Depending on the setting of the private_element_size field in the
10655     // resource descriptor, we can only make private accesses up to a certain
10656     // size.
10657     switch (Subtarget->getMaxPrivateElementSize()) {
10658     case 4: {
10659       auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
10660       return DAG.getMergeValues({Op0, Op1}, DL);
10661     }
10662     case 8:
10663       if (NumElements > 2)
10664         return SplitVectorLoad(Op, DAG);
10665       return SDValue();
10666     case 16:
10667       // Same as global/flat
10668       if (NumElements > 4)
10669         return SplitVectorLoad(Op, DAG);
10670       // v3 loads not supported on SI.
10671       if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10672         return WidenOrSplitVectorLoad(Op, DAG);
10673 
10674       return SDValue();
10675     default:
10676       llvm_unreachable("unsupported private_element_size");
10677     }
10678   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10679     unsigned Fast = 0;
10680     auto Flags = Load->getMemOperand()->getFlags();
10681     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10682                                            Load->getAlign(), Flags, &Fast) &&
10683         Fast > 1)
10684       return SDValue();
10685 
10686     if (MemVT.isVector())
10687       return SplitVectorLoad(Op, DAG);
10688   }
10689 
10690   if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10691                                       MemVT, *Load->getMemOperand())) {
10692     auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
10693     return DAG.getMergeValues({Op0, Op1}, DL);
10694   }
10695 
10696   return SDValue();
10697 }
10698 
10699 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10700   EVT VT = Op.getValueType();
10701   if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10702       VT.getSizeInBits() == 512)
10703     return splitTernaryVectorOp(Op, DAG);
10704 
10705   assert(VT.getSizeInBits() == 64);
10706 
10707   SDLoc DL(Op);
10708   SDValue Cond = Op.getOperand(0);
10709 
10710   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10711   SDValue One = DAG.getConstant(1, DL, MVT::i32);
10712 
10713   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10714   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10715 
10716   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10717   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10718 
10719   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10720 
10721   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10722   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10723 
10724   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10725 
10726   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10727   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10728 }
10729 
10730 // Catch division cases where we can use shortcuts with rcp and rsq
10731 // instructions.
10732 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10733                                               SelectionDAG &DAG) const {
10734   SDLoc SL(Op);
10735   SDValue LHS = Op.getOperand(0);
10736   SDValue RHS = Op.getOperand(1);
10737   EVT VT = Op.getValueType();
10738   const SDNodeFlags Flags = Op->getFlags();
10739 
10740   bool AllowInaccurateRcp =
10741       Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10742 
10743   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10744     // Without !fpmath accuracy information, we can't do more because we don't
10745     // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10746     // f16 is always accurate enough
10747     if (!AllowInaccurateRcp && VT != MVT::f16)
10748       return SDValue();
10749 
10750     if (CLHS->isExactlyValue(1.0)) {
10751       // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10752       // the CI documentation has a worst case error of 1 ulp.
10753       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10754       // use it as long as we aren't trying to use denormals.
10755       //
10756       // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10757 
10758       // 1.0 / sqrt(x) -> rsq(x)
10759 
10760       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10761       // error seems really high at 2^29 ULP.
10762       // 1.0 / x -> rcp(x)
10763       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10764     }
10765 
10766     // Same as for 1.0, but expand the sign out of the constant.
10767     if (CLHS->isExactlyValue(-1.0)) {
10768       // -1.0 / x -> rcp (fneg x)
10769       SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10770       return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10771     }
10772   }
10773 
10774   // For f16 require afn or arcp.
10775   // For f32 require afn.
10776   if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10777     return SDValue();
10778 
10779   // Turn into multiply by the reciprocal.
10780   // x / y -> x * (1.0 / y)
10781   SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10782   return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10783 }
10784 
10785 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10786                                                 SelectionDAG &DAG) const {
10787   SDLoc SL(Op);
10788   SDValue X = Op.getOperand(0);
10789   SDValue Y = Op.getOperand(1);
10790   EVT VT = Op.getValueType();
10791   const SDNodeFlags Flags = Op->getFlags();
10792 
10793   bool AllowInaccurateDiv =
10794       Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
10795   if (!AllowInaccurateDiv)
10796     return SDValue();
10797 
10798   SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10799   SDValue One = DAG.getConstantFP(1.0, SL, VT);
10800 
10801   SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10802   SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10803 
10804   R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10805   SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10806   R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10807   SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10808   SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10809   return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10810 }
10811 
10812 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10813                           EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10814                           SDNodeFlags Flags) {
10815   if (GlueChain->getNumValues() <= 1) {
10816     return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10817   }
10818 
10819   assert(GlueChain->getNumValues() == 3);
10820 
10821   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10822   switch (Opcode) {
10823   default:
10824     llvm_unreachable("no chain equivalent for opcode");
10825   case ISD::FMUL:
10826     Opcode = AMDGPUISD::FMUL_W_CHAIN;
10827     break;
10828   }
10829 
10830   return DAG.getNode(Opcode, SL, VTList,
10831                      {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10832                      Flags);
10833 }
10834 
10835 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10836                            EVT VT, SDValue A, SDValue B, SDValue C,
10837                            SDValue GlueChain, SDNodeFlags Flags) {
10838   if (GlueChain->getNumValues() <= 1) {
10839     return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10840   }
10841 
10842   assert(GlueChain->getNumValues() == 3);
10843 
10844   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10845   switch (Opcode) {
10846   default:
10847     llvm_unreachable("no chain equivalent for opcode");
10848   case ISD::FMA:
10849     Opcode = AMDGPUISD::FMA_W_CHAIN;
10850     break;
10851   }
10852 
10853   return DAG.getNode(Opcode, SL, VTList,
10854                      {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10855                      Flags);
10856 }
10857 
10858 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10859   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10860     return FastLowered;
10861 
10862   SDLoc SL(Op);
10863   SDValue LHS = Op.getOperand(0);
10864   SDValue RHS = Op.getOperand(1);
10865 
10866   // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10867   // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10868   // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10869   // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10870   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10871   // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
10872   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10873   // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10874   // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10875   // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10876   // q16.u = opx(V_CVT_F16_F32, q32.u);
10877   // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10878 
10879   // We will use ISD::FMA on targets that don't support ISD::FMAD.
10880   unsigned FMADOpCode =
10881       isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10882 
10883   SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10884   SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10885   SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10886   SDValue Rcp =
10887       DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10888   SDValue Quot =
10889       DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10890   SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10891                             Op->getFlags());
10892   Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10893   Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10894                     Op->getFlags());
10895   SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10896   SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10897   TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10898                         DAG.getConstant(0xff800000, SL, MVT::i32));
10899   Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10900   Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10901   SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10902                              DAG.getTargetConstant(0, SL, MVT::i32));
10903   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10904                      Op->getFlags());
10905 }
10906 
10907 // Faster 2.5 ULP division that does not support denormals.
10908 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10909   SDNodeFlags Flags = Op->getFlags();
10910   SDLoc SL(Op);
10911   SDValue LHS = Op.getOperand(1);
10912   SDValue RHS = Op.getOperand(2);
10913 
10914   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10915 
10916   const APFloat K0Val(0x1p+96f);
10917   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10918 
10919   const APFloat K1Val(0x1p-32f);
10920   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10921 
10922   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10923 
10924   EVT SetCCVT =
10925       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10926 
10927   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10928 
10929   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10930 
10931   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10932 
10933   // rcp does not support denormals.
10934   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10935 
10936   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10937 
10938   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10939 }
10940 
10941 // Returns immediate value for setting the F32 denorm mode when using the
10942 // S_DENORM_MODE instruction.
10943 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10944                                     const SIMachineFunctionInfo *Info,
10945                                     const GCNSubtarget *ST) {
10946   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10947   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10948   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10949   return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10950 }
10951 
10952 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10953   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10954     return FastLowered;
10955 
10956   // The selection matcher assumes anything with a chain selecting to a
10957   // mayRaiseFPException machine instruction. Since we're introducing a chain
10958   // here, we need to explicitly report nofpexcept for the regular fdiv
10959   // lowering.
10960   SDNodeFlags Flags = Op->getFlags();
10961   Flags.setNoFPExcept(true);
10962 
10963   SDLoc SL(Op);
10964   SDValue LHS = Op.getOperand(0);
10965   SDValue RHS = Op.getOperand(1);
10966 
10967   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10968 
10969   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10970 
10971   SDValue DenominatorScaled =
10972       DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
10973   SDValue NumeratorScaled =
10974       DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
10975 
10976   // Denominator is scaled to not be denormal, so using rcp is ok.
10977   SDValue ApproxRcp =
10978       DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
10979   SDValue NegDivScale0 =
10980       DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
10981 
10982   using namespace AMDGPU::Hwreg;
10983   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10984   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10985 
10986   const MachineFunction &MF = DAG.getMachineFunction();
10987   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10988   const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10989 
10990   const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10991   const bool HasDynamicDenormals =
10992       (DenormMode.Input == DenormalMode::Dynamic) ||
10993       (DenormMode.Output == DenormalMode::Dynamic);
10994 
10995   SDValue SavedDenormMode;
10996 
10997   if (!PreservesDenormals) {
10998     // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10999     // lowering. The chain dependence is insufficient, and we need glue. We do
11000     // not need the glue variants in a strictfp function.
11001 
11002     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11003 
11004     SDValue Glue = DAG.getEntryNode();
11005     if (HasDynamicDenormals) {
11006       SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11007                                           DAG.getVTList(MVT::i32, MVT::Glue),
11008                                           {BitField, Glue});
11009       SavedDenormMode = SDValue(GetReg, 0);
11010 
11011       Glue = DAG.getMergeValues(
11012           {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11013     }
11014 
11015     SDNode *EnableDenorm;
11016     if (Subtarget->hasDenormModeInst()) {
11017       const SDValue EnableDenormValue =
11018           getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11019 
11020       EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11021                                  EnableDenormValue)
11022                          .getNode();
11023     } else {
11024       const SDValue EnableDenormValue =
11025           DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11026       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11027                                         {EnableDenormValue, BitField, Glue});
11028     }
11029 
11030     SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11031                       SDValue(EnableDenorm, 1)};
11032 
11033     NegDivScale0 = DAG.getMergeValues(Ops, SL);
11034   }
11035 
11036   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11037                              ApproxRcp, One, NegDivScale0, Flags);
11038 
11039   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11040                              ApproxRcp, Fma0, Flags);
11041 
11042   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11043                            Fma1, Flags);
11044 
11045   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11046                              NumeratorScaled, Mul, Flags);
11047 
11048   SDValue Fma3 =
11049       getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11050 
11051   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11052                              NumeratorScaled, Fma3, Flags);
11053 
11054   if (!PreservesDenormals) {
11055     SDNode *DisableDenorm;
11056     if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11057       const SDValue DisableDenormValue = getSPDenormModeValue(
11058           FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11059 
11060       DisableDenorm =
11061           DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1),
11062                       DisableDenormValue, Fma4.getValue(2))
11063               .getNode();
11064     } else {
11065       assert(HasDynamicDenormals == (bool)SavedDenormMode);
11066       const SDValue DisableDenormValue =
11067           HasDynamicDenormals
11068               ? SavedDenormMode
11069               : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11070 
11071       DisableDenorm = DAG.getMachineNode(
11072           AMDGPU::S_SETREG_B32, SL, MVT::Other,
11073           {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11074     }
11075 
11076     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11077                                       SDValue(DisableDenorm, 0), DAG.getRoot());
11078     DAG.setRoot(OutputChain);
11079   }
11080 
11081   SDValue Scale = NumeratorScaled.getValue(1);
11082   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11083                              {Fma4, Fma1, Fma3, Scale}, Flags);
11084 
11085   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11086 }
11087 
11088 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11089   if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11090     return FastLowered;
11091 
11092   SDLoc SL(Op);
11093   SDValue X = Op.getOperand(0);
11094   SDValue Y = Op.getOperand(1);
11095 
11096   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11097 
11098   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11099 
11100   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11101 
11102   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11103 
11104   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11105 
11106   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11107 
11108   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11109 
11110   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11111 
11112   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11113 
11114   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11115   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11116 
11117   SDValue Fma4 =
11118       DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11119 
11120   SDValue Scale;
11121 
11122   if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11123     // Workaround a hardware bug on SI where the condition output from div_scale
11124     // is not usable.
11125 
11126     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11127 
11128     // Figure out if the scale to use for div_fmas.
11129     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11130     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11131     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11132     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11133 
11134     SDValue NumHi =
11135         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11136     SDValue DenHi =
11137         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11138 
11139     SDValue Scale0Hi =
11140         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11141     SDValue Scale1Hi =
11142         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11143 
11144     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11145     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11146     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11147   } else {
11148     Scale = DivScale1.getValue(1);
11149   }
11150 
11151   SDValue Fmas =
11152       DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11153 
11154   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11155 }
11156 
11157 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11158   EVT VT = Op.getValueType();
11159 
11160   if (VT == MVT::f32)
11161     return LowerFDIV32(Op, DAG);
11162 
11163   if (VT == MVT::f64)
11164     return LowerFDIV64(Op, DAG);
11165 
11166   if (VT == MVT::f16)
11167     return LowerFDIV16(Op, DAG);
11168 
11169   llvm_unreachable("Unexpected type for fdiv");
11170 }
11171 
11172 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11173   SDLoc dl(Op);
11174   SDValue Val = Op.getOperand(0);
11175   EVT VT = Val.getValueType();
11176   EVT ResultExpVT = Op->getValueType(1);
11177   EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11178 
11179   SDValue Mant = DAG.getNode(
11180       ISD::INTRINSIC_WO_CHAIN, dl, VT,
11181       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11182 
11183   SDValue Exp = DAG.getNode(
11184       ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11185       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11186 
11187   if (Subtarget->hasFractBug()) {
11188     SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11189     SDValue Inf =
11190         DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11191 
11192     SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11193     SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11194     Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11195     Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11196   }
11197 
11198   SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11199   return DAG.getMergeValues({Mant, CastExp}, dl);
11200 }
11201 
11202 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11203   SDLoc DL(Op);
11204   StoreSDNode *Store = cast<StoreSDNode>(Op);
11205   EVT VT = Store->getMemoryVT();
11206 
11207   if (VT == MVT::i1) {
11208     return DAG.getTruncStore(
11209         Store->getChain(), DL,
11210         DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11211         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11212   }
11213 
11214   assert(VT.isVector() &&
11215          Store->getValue().getValueType().getScalarType() == MVT::i32);
11216 
11217   unsigned AS = Store->getAddressSpace();
11218   if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11219       Store->getAlign().value() < VT.getStoreSize() &&
11220       VT.getSizeInBits() > 32) {
11221     return SplitVectorStore(Op, DAG);
11222   }
11223 
11224   MachineFunction &MF = DAG.getMachineFunction();
11225   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11226   // If there is a possibility that flat instruction access scratch memory
11227   // then we need to use the same legalization rules we use for private.
11228   if (AS == AMDGPUAS::FLAT_ADDRESS &&
11229       !Subtarget->hasMultiDwordFlatScratchAddressing())
11230     AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11231              ? AMDGPUAS::PRIVATE_ADDRESS
11232              : AMDGPUAS::GLOBAL_ADDRESS;
11233 
11234   unsigned NumElements = VT.getVectorNumElements();
11235   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11236     if (NumElements > 4)
11237       return SplitVectorStore(Op, DAG);
11238     // v3 stores not supported on SI.
11239     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11240       return SplitVectorStore(Op, DAG);
11241 
11242     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11243                                         VT, *Store->getMemOperand()))
11244       return expandUnalignedStore(Store, DAG);
11245 
11246     return SDValue();
11247   }
11248   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11249     switch (Subtarget->getMaxPrivateElementSize()) {
11250     case 4:
11251       return scalarizeVectorStore(Store, DAG);
11252     case 8:
11253       if (NumElements > 2)
11254         return SplitVectorStore(Op, DAG);
11255       return SDValue();
11256     case 16:
11257       if (NumElements > 4 ||
11258           (NumElements == 3 && !Subtarget->enableFlatScratch()))
11259         return SplitVectorStore(Op, DAG);
11260       return SDValue();
11261     default:
11262       llvm_unreachable("unsupported private_element_size");
11263     }
11264   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11265     unsigned Fast = 0;
11266     auto Flags = Store->getMemOperand()->getFlags();
11267     if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11268                                            Store->getAlign(), Flags, &Fast) &&
11269         Fast > 1)
11270       return SDValue();
11271 
11272     if (VT.isVector())
11273       return SplitVectorStore(Op, DAG);
11274 
11275     return expandUnalignedStore(Store, DAG);
11276   }
11277 
11278   // Probably an invalid store. If so we'll end up emitting a selection error.
11279   return SDValue();
11280 }
11281 
11282 // Avoid the full correct expansion for f32 sqrt when promoting from f16.
11283 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11284   SDLoc SL(Op);
11285   assert(!Subtarget->has16BitInsts());
11286   SDNodeFlags Flags = Op->getFlags();
11287   SDValue Ext =
11288       DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11289 
11290   SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11291   SDValue Sqrt =
11292       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11293 
11294   return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11295                      DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11296 }
11297 
11298 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11299   SDLoc DL(Op);
11300   SDNodeFlags Flags = Op->getFlags();
11301   MVT VT = Op.getValueType().getSimpleVT();
11302   const SDValue X = Op.getOperand(0);
11303 
11304   if (allowApproxFunc(DAG, Flags)) {
11305     // Instruction is 1ulp but ignores denormals.
11306     return DAG.getNode(
11307         ISD::INTRINSIC_WO_CHAIN, DL, VT,
11308         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11309   }
11310 
11311   SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11312   SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11313 
11314   SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11315 
11316   SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11317 
11318   SDValue SqrtX =
11319       DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11320 
11321   SDValue SqrtS;
11322   if (needsDenormHandlingF32(DAG, X, Flags)) {
11323     SDValue SqrtID =
11324         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11325     SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11326 
11327     SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11328     SDValue SqrtSNextDownInt =
11329         DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11330                     DAG.getAllOnesConstant(DL, MVT::i32));
11331     SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11332 
11333     SDValue NegSqrtSNextDown =
11334         DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11335 
11336     SDValue SqrtVP =
11337         DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11338 
11339     SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11340                                          DAG.getConstant(1, DL, MVT::i32));
11341     SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11342 
11343     SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11344     SDValue SqrtVS =
11345         DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11346 
11347     SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11348     SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11349 
11350     SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11351                         Flags);
11352 
11353     SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11354     SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11355                         Flags);
11356   } else {
11357     SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11358 
11359     SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11360 
11361     SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11362     SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11363     SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11364 
11365     SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11366     SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11367     SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11368 
11369     SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11370     SDValue SqrtD =
11371         DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11372     SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11373   }
11374 
11375   SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11376 
11377   SDValue ScaledDown =
11378       DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11379 
11380   SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11381   SDValue IsZeroOrInf =
11382       DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11383                   DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11384 
11385   return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11386 }
11387 
11388 SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11389   // For double type, the SQRT and RSQ instructions don't have required
11390   // precision, we apply Goldschmidt's algorithm to improve the result:
11391   //
11392   //   y0 = rsq(x)
11393   //   g0 = x * y0
11394   //   h0 = 0.5 * y0
11395   //
11396   //   r0 = 0.5 - h0 * g0
11397   //   g1 = g0 * r0 + g0
11398   //   h1 = h0 * r0 + h0
11399   //
11400   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11401   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
11402   //   h2 = h1 * r1 + h1
11403   //
11404   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11405   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
11406   //
11407   //   sqrt(x) = g3
11408 
11409   SDNodeFlags Flags = Op->getFlags();
11410 
11411   SDLoc DL(Op);
11412 
11413   SDValue X = Op.getOperand(0);
11414   SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11415 
11416   SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11417 
11418   SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11419 
11420   // Scale up input if it is too small.
11421   SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11422   SDValue ScaleUp =
11423       DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11424   SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11425 
11426   SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11427 
11428   SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11429 
11430   SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11431   SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11432 
11433   SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11434   SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11435 
11436   SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11437 
11438   SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11439 
11440   SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11441   SDValue SqrtD0 =
11442       DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11443 
11444   SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11445 
11446   SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11447   SDValue SqrtD1 =
11448       DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11449 
11450   SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11451 
11452   SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11453   SDValue ScaleDown =
11454       DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11455   SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11456 
11457   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11458   // with finite only or nsz because rsq(+/-0) = +/-inf
11459 
11460   // TODO: Check for DAZ and expand to subnormals
11461   SDValue IsZeroOrInf =
11462       DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11463                   DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11464 
11465   // If x is +INF, +0, or -0, use its original value
11466   return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11467                      Flags);
11468 }
11469 
11470 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11471   SDLoc DL(Op);
11472   EVT VT = Op.getValueType();
11473   SDValue Arg = Op.getOperand(0);
11474   SDValue TrigVal;
11475 
11476   // Propagate fast-math flags so that the multiply we introduce can be folded
11477   // if Arg is already the result of a multiply by constant.
11478   auto Flags = Op->getFlags();
11479 
11480   SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11481 
11482   if (Subtarget->hasTrigReducedRange()) {
11483     SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11484     TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11485   } else {
11486     TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11487   }
11488 
11489   switch (Op.getOpcode()) {
11490   case ISD::FCOS:
11491     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11492   case ISD::FSIN:
11493     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11494   default:
11495     llvm_unreachable("Wrong trig opcode");
11496   }
11497 }
11498 
11499 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11500                                                SelectionDAG &DAG) const {
11501   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11502   assert(AtomicNode->isCompareAndSwap());
11503   unsigned AS = AtomicNode->getAddressSpace();
11504 
11505   // No custom lowering required for local address space
11506   if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11507     return Op;
11508 
11509   // Non-local address space requires custom lowering for atomic compare
11510   // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11511   SDLoc DL(Op);
11512   SDValue ChainIn = Op.getOperand(0);
11513   SDValue Addr = Op.getOperand(1);
11514   SDValue Old = Op.getOperand(2);
11515   SDValue New = Op.getOperand(3);
11516   EVT VT = Op.getValueType();
11517   MVT SimpleVT = VT.getSimpleVT();
11518   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11519 
11520   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11521   SDValue Ops[] = {ChainIn, Addr, NewOld};
11522 
11523   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11524                                  Op->getVTList(), Ops, VT,
11525                                  AtomicNode->getMemOperand());
11526 }
11527 
11528 //===----------------------------------------------------------------------===//
11529 // Custom DAG optimizations
11530 //===----------------------------------------------------------------------===//
11531 
11532 SDValue
11533 SITargetLowering::performUCharToFloatCombine(SDNode *N,
11534                                              DAGCombinerInfo &DCI) const {
11535   EVT VT = N->getValueType(0);
11536   EVT ScalarVT = VT.getScalarType();
11537   if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11538     return SDValue();
11539 
11540   SelectionDAG &DAG = DCI.DAG;
11541   SDLoc DL(N);
11542 
11543   SDValue Src = N->getOperand(0);
11544   EVT SrcVT = Src.getValueType();
11545 
11546   // TODO: We could try to match extracting the higher bytes, which would be
11547   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11548   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11549   // about in practice.
11550   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11551     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11552       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11553       DCI.AddToWorklist(Cvt.getNode());
11554 
11555       // For the f16 case, fold to a cast to f32 and then cast back to f16.
11556       if (ScalarVT != MVT::f32) {
11557         Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11558                           DAG.getTargetConstant(0, DL, MVT::i32));
11559       }
11560       return Cvt;
11561     }
11562   }
11563 
11564   return SDValue();
11565 }
11566 
11567 SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11568                                                   DAGCombinerInfo &DCI) const {
11569   SDValue MagnitudeOp = N->getOperand(0);
11570   SDValue SignOp = N->getOperand(1);
11571   SelectionDAG &DAG = DCI.DAG;
11572   SDLoc DL(N);
11573 
11574   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11575   // lower half with a copy.
11576   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11577   if (MagnitudeOp.getValueType() == MVT::f64) {
11578     SDValue MagAsVector =
11579         DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11580     SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11581                                 MagAsVector, DAG.getConstant(0, DL, MVT::i32));
11582     SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
11583                                 MagAsVector, DAG.getConstant(1, DL, MVT::i32));
11584 
11585     SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11586 
11587     SDValue Vector =
11588         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11589 
11590     return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11591   }
11592 
11593   if (SignOp.getValueType() != MVT::f64)
11594     return SDValue();
11595 
11596   // Reduce width of sign operand, we only need the highest bit.
11597   //
11598   // fcopysign f64:x, f64:y ->
11599   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11600   // TODO: In some cases it might make sense to go all the way to f16.
11601   SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11602   SDValue SignAsF32 =
11603       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11604                   DAG.getConstant(1, DL, MVT::i32));
11605 
11606   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11607                      SignAsF32);
11608 }
11609 
11610 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11611 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11612 // bits
11613 
11614 // This is a variant of
11615 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11616 //
11617 // The normal DAG combiner will do this, but only if the add has one use since
11618 // that would increase the number of instructions.
11619 //
11620 // This prevents us from seeing a constant offset that can be folded into a
11621 // memory instruction's addressing mode. If we know the resulting add offset of
11622 // a pointer can be folded into an addressing offset, we can replace the pointer
11623 // operand with the add of new constant offset. This eliminates one of the uses,
11624 // and may allow the remaining use to also be simplified.
11625 //
11626 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
11627                                                EVT MemVT,
11628                                                DAGCombinerInfo &DCI) const {
11629   SDValue N0 = N->getOperand(0);
11630   SDValue N1 = N->getOperand(1);
11631 
11632   // We only do this to handle cases where it's profitable when there are
11633   // multiple uses of the add, so defer to the standard combine.
11634   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11635       N0->hasOneUse())
11636     return SDValue();
11637 
11638   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11639   if (!CN1)
11640     return SDValue();
11641 
11642   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11643   if (!CAdd)
11644     return SDValue();
11645 
11646   SelectionDAG &DAG = DCI.DAG;
11647 
11648   if (N0->getOpcode() == ISD::OR &&
11649       !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11650     return SDValue();
11651 
11652   // If the resulting offset is too large, we can't fold it into the
11653   // addressing mode offset.
11654   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11655   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11656 
11657   AddrMode AM;
11658   AM.HasBaseReg = true;
11659   AM.BaseOffs = Offset.getSExtValue();
11660   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11661     return SDValue();
11662 
11663   SDLoc SL(N);
11664   EVT VT = N->getValueType(0);
11665 
11666   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11667   SDValue COffset = DAG.getConstant(Offset, SL, VT);
11668 
11669   SDNodeFlags Flags;
11670   Flags.setNoUnsignedWrap(
11671       N->getFlags().hasNoUnsignedWrap() &&
11672       (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
11673 
11674   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11675 }
11676 
11677 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
11678 /// by the chain and intrinsic ID. Theoretically we would also need to check the
11679 /// specific intrinsic, but they all place the pointer operand first.
11680 static unsigned getBasePtrIndex(const MemSDNode *N) {
11681   switch (N->getOpcode()) {
11682   case ISD::STORE:
11683   case ISD::INTRINSIC_W_CHAIN:
11684   case ISD::INTRINSIC_VOID:
11685     return 2;
11686   default:
11687     return 1;
11688   }
11689 }
11690 
11691 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11692                                                   DAGCombinerInfo &DCI) const {
11693   SelectionDAG &DAG = DCI.DAG;
11694   SDLoc SL(N);
11695 
11696   unsigned PtrIdx = getBasePtrIndex(N);
11697   SDValue Ptr = N->getOperand(PtrIdx);
11698 
11699   // TODO: We could also do this for multiplies.
11700   if (Ptr.getOpcode() == ISD::SHL) {
11701     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11702                                           N->getMemoryVT(), DCI);
11703     if (NewPtr) {
11704       SmallVector<SDValue, 8> NewOps(N->ops());
11705 
11706       NewOps[PtrIdx] = NewPtr;
11707       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11708     }
11709   }
11710 
11711   return SDValue();
11712 }
11713 
11714 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11715   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11716          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11717          (Opc == ISD::XOR && Val == 0);
11718 }
11719 
11720 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11721 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11722 // integer combine opportunities since most 64-bit operations are decomposed
11723 // this way.  TODO: We won't want this for SALU especially if it is an inline
11724 // immediate.
11725 SDValue SITargetLowering::splitBinaryBitConstantOp(
11726     DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
11727     const ConstantSDNode *CRHS) const {
11728   uint64_t Val = CRHS->getZExtValue();
11729   uint32_t ValLo = Lo_32(Val);
11730   uint32_t ValHi = Hi_32(Val);
11731   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11732 
11733   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11734        bitOpWithConstantIsReducible(Opc, ValHi)) ||
11735       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11736     // If we need to materialize a 64-bit immediate, it will be split up later
11737     // anyway. Avoid creating the harder to understand 64-bit immediate
11738     // materialization.
11739     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11740   }
11741 
11742   return SDValue();
11743 }
11744 
11745 bool llvm::isBoolSGPR(SDValue V) {
11746   if (V.getValueType() != MVT::i1)
11747     return false;
11748   switch (V.getOpcode()) {
11749   default:
11750     break;
11751   case ISD::SETCC:
11752   case AMDGPUISD::FP_CLASS:
11753     return true;
11754   case ISD::AND:
11755   case ISD::OR:
11756   case ISD::XOR:
11757     return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11758   }
11759   return false;
11760 }
11761 
11762 // If a constant has all zeroes or all ones within each byte return it.
11763 // Otherwise return 0.
11764 static uint32_t getConstantPermuteMask(uint32_t C) {
11765   // 0xff for any zero byte in the mask
11766   uint32_t ZeroByteMask = 0;
11767   if (!(C & 0x000000ff))
11768     ZeroByteMask |= 0x000000ff;
11769   if (!(C & 0x0000ff00))
11770     ZeroByteMask |= 0x0000ff00;
11771   if (!(C & 0x00ff0000))
11772     ZeroByteMask |= 0x00ff0000;
11773   if (!(C & 0xff000000))
11774     ZeroByteMask |= 0xff000000;
11775   uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11776   if ((NonZeroByteMask & C) != NonZeroByteMask)
11777     return 0; // Partial bytes selected.
11778   return C;
11779 }
11780 
11781 // Check if a node selects whole bytes from its operand 0 starting at a byte
11782 // boundary while masking the rest. Returns select mask as in the v_perm_b32
11783 // or -1 if not succeeded.
11784 // Note byte select encoding:
11785 // value 0-3 selects corresponding source byte;
11786 // value 0xc selects zero;
11787 // value 0xff selects 0xff.
11788 static uint32_t getPermuteMask(SDValue V) {
11789   assert(V.getValueSizeInBits() == 32);
11790 
11791   if (V.getNumOperands() != 2)
11792     return ~0;
11793 
11794   ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11795   if (!N1)
11796     return ~0;
11797 
11798   uint32_t C = N1->getZExtValue();
11799 
11800   switch (V.getOpcode()) {
11801   default:
11802     break;
11803   case ISD::AND:
11804     if (uint32_t ConstMask = getConstantPermuteMask(C))
11805       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11806     break;
11807 
11808   case ISD::OR:
11809     if (uint32_t ConstMask = getConstantPermuteMask(C))
11810       return (0x03020100 & ~ConstMask) | ConstMask;
11811     break;
11812 
11813   case ISD::SHL:
11814     if (C % 8)
11815       return ~0;
11816 
11817     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11818 
11819   case ISD::SRL:
11820     if (C % 8)
11821       return ~0;
11822 
11823     return uint32_t(0x0c0c0c0c03020100ull >> C);
11824   }
11825 
11826   return ~0;
11827 }
11828 
11829 SDValue SITargetLowering::performAndCombine(SDNode *N,
11830                                             DAGCombinerInfo &DCI) const {
11831   if (DCI.isBeforeLegalize())
11832     return SDValue();
11833 
11834   SelectionDAG &DAG = DCI.DAG;
11835   EVT VT = N->getValueType(0);
11836   SDValue LHS = N->getOperand(0);
11837   SDValue RHS = N->getOperand(1);
11838 
11839   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11840   if (VT == MVT::i64 && CRHS) {
11841     if (SDValue Split =
11842             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11843       return Split;
11844   }
11845 
11846   if (CRHS && VT == MVT::i32) {
11847     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11848     // nb = number of trailing zeroes in mask
11849     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11850     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11851     uint64_t Mask = CRHS->getZExtValue();
11852     unsigned Bits = llvm::popcount(Mask);
11853     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11854         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11855       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11856         unsigned Shift = CShift->getZExtValue();
11857         unsigned NB = CRHS->getAPIntValue().countr_zero();
11858         unsigned Offset = NB + Shift;
11859         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11860           SDLoc SL(N);
11861           SDValue BFE =
11862               DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
11863                           DAG.getConstant(Offset, SL, MVT::i32),
11864                           DAG.getConstant(Bits, SL, MVT::i32));
11865           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11866           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11867                                     DAG.getValueType(NarrowVT));
11868           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11869                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11870           return Shl;
11871         }
11872       }
11873     }
11874 
11875     // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11876     if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11877         isa<ConstantSDNode>(LHS.getOperand(2))) {
11878       uint32_t Sel = getConstantPermuteMask(Mask);
11879       if (!Sel)
11880         return SDValue();
11881 
11882       // Select 0xc for all zero bytes
11883       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11884       SDLoc DL(N);
11885       return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11886                          LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11887     }
11888   }
11889 
11890   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11891   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11892   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11893     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11894     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11895 
11896     SDValue X = LHS.getOperand(0);
11897     SDValue Y = RHS.getOperand(0);
11898     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11899         !isTypeLegal(X.getValueType()))
11900       return SDValue();
11901 
11902     if (LCC == ISD::SETO) {
11903       if (X != LHS.getOperand(1))
11904         return SDValue();
11905 
11906       if (RCC == ISD::SETUNE) {
11907         const ConstantFPSDNode *C1 =
11908             dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11909         if (!C1 || !C1->isInfinity() || C1->isNegative())
11910           return SDValue();
11911 
11912         const uint32_t Mask = SIInstrFlags::N_NORMAL |
11913                               SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
11914                               SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
11915                               SIInstrFlags::P_NORMAL;
11916 
11917         static_assert(
11918             ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
11919                 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
11920              0x3ff) == Mask,
11921             "mask not equal");
11922 
11923         SDLoc DL(N);
11924         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
11925                            DAG.getConstant(Mask, DL, MVT::i32));
11926       }
11927     }
11928   }
11929 
11930   if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11931     std::swap(LHS, RHS);
11932 
11933   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11934       RHS.hasOneUse()) {
11935     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11936     // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
11937     // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
11938     // | n_nan)
11939     const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11940     if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11941         (RHS.getOperand(0) == LHS.getOperand(0) &&
11942          LHS.getOperand(0) == LHS.getOperand(1))) {
11943       const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11944       unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
11945                                           : Mask->getZExtValue() & OrdMask;
11946 
11947       SDLoc DL(N);
11948       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11949                          DAG.getConstant(NewMask, DL, MVT::i32));
11950     }
11951   }
11952 
11953   if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
11954                          LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11955     // and x, (sext cc from i1) => select cc, x, 0
11956     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11957       std::swap(LHS, RHS);
11958     if (isBoolSGPR(RHS.getOperand(0)))
11959       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
11960                            DAG.getConstant(0, SDLoc(N), MVT::i32));
11961   }
11962 
11963   // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11964   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11965   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11966       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11967     uint32_t LHSMask = getPermuteMask(LHS);
11968     uint32_t RHSMask = getPermuteMask(RHS);
11969     if (LHSMask != ~0u && RHSMask != ~0u) {
11970       // Canonicalize the expression in an attempt to have fewer unique masks
11971       // and therefore fewer registers used to hold the masks.
11972       if (LHSMask > RHSMask) {
11973         std::swap(LHSMask, RHSMask);
11974         std::swap(LHS, RHS);
11975       }
11976 
11977       // Select 0xc for each lane used from source operand. Zero has 0xc mask
11978       // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11979       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11980       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11981 
11982       // Check of we need to combine values from two sources within a byte.
11983       if (!(LHSUsedLanes & RHSUsedLanes) &&
11984           // If we select high and lower word keep it for SDWA.
11985           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11986           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11987         // Each byte in each mask is either selector mask 0-3, or has higher
11988         // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11989         // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11990         // mask which is not 0xff wins. By anding both masks we have a correct
11991         // result except that 0x0c shall be corrected to give 0x0c only.
11992         uint32_t Mask = LHSMask & RHSMask;
11993         for (unsigned I = 0; I < 32; I += 8) {
11994           uint32_t ByteSel = 0xff << I;
11995           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11996             Mask &= (0x0c << I) & 0xffffffff;
11997         }
11998 
11999         // Add 4 to each active LHS lane. It will not affect any existing 0xff
12000         // or 0x0c.
12001         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12002         SDLoc DL(N);
12003 
12004         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12005                            RHS.getOperand(0),
12006                            DAG.getConstant(Sel, DL, MVT::i32));
12007       }
12008     }
12009   }
12010 
12011   return SDValue();
12012 }
12013 
12014 // A key component of v_perm is a mapping between byte position of the src
12015 // operands, and the byte position of the dest. To provide such, we need: 1. the
12016 // node that provides x byte of the dest of the OR, and 2. the byte of the node
12017 // used to provide that x byte. calculateByteProvider finds which node provides
12018 // a certain byte of the dest of the OR, and calculateSrcByte takes that node,
12019 // and finds an ultimate src and byte position For example: The supported
12020 // LoadCombine pattern for vector loads is as follows
12021 //                                t1
12022 //                                or
12023 //                      /                  \
12024 //                      t2                 t3
12025 //                     zext                shl
12026 //                      |                   |     \
12027 //                     t4                  t5     16
12028 //                     or                 anyext
12029 //                 /        \               |
12030 //                t6        t7             t8
12031 //               srl        shl             or
12032 //            /    |      /     \         /     \
12033 //           t9   t10    t11   t12      t13    t14
12034 //         trunc*  8    trunc*  8      and     and
12035 //           |            |          /    |     |    \
12036 //          t15          t16        t17  t18   t19   t20
12037 //                                trunc*  255   srl   -256
12038 //                                   |         /   \
12039 //                                  t15       t15  16
12040 //
12041 // *In this example, the truncs are from i32->i16
12042 //
12043 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12044 // respectively. calculateSrcByte would find (given node) -> ultimate src &
12045 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12046 // After finding the mapping, we can combine the tree into vperm t15, t16,
12047 // 0x05000407
12048 
12049 // Find the source and byte position from a node.
12050 // \p DestByte is the byte position of the dest of the or that the src
12051 // ultimately provides. \p SrcIndex is the byte of the src that maps to this
12052 // dest of the or byte. \p Depth tracks how many recursive iterations we have
12053 // performed.
12054 static const std::optional<ByteProvider<SDValue>>
12055 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12056                  unsigned Depth = 0) {
12057   // We may need to recursively traverse a series of SRLs
12058   if (Depth >= 6)
12059     return std::nullopt;
12060 
12061   if (Op.getValueSizeInBits() < 8)
12062     return std::nullopt;
12063 
12064   if (Op.getValueType().isVector())
12065     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12066 
12067   switch (Op->getOpcode()) {
12068   case ISD::TRUNCATE: {
12069     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12070   }
12071 
12072   case ISD::SIGN_EXTEND:
12073   case ISD::ZERO_EXTEND:
12074   case ISD::SIGN_EXTEND_INREG: {
12075     SDValue NarrowOp = Op->getOperand(0);
12076     auto NarrowVT = NarrowOp.getValueType();
12077     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12078       auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12079       NarrowVT = VTSign->getVT();
12080     }
12081     if (!NarrowVT.isByteSized())
12082       return std::nullopt;
12083     uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12084 
12085     if (SrcIndex >= NarrowByteWidth)
12086       return std::nullopt;
12087     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12088   }
12089 
12090   case ISD::SRA:
12091   case ISD::SRL: {
12092     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12093     if (!ShiftOp)
12094       return std::nullopt;
12095 
12096     uint64_t BitShift = ShiftOp->getZExtValue();
12097 
12098     if (BitShift % 8 != 0)
12099       return std::nullopt;
12100 
12101     SrcIndex += BitShift / 8;
12102 
12103     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12104   }
12105 
12106   default: {
12107     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12108   }
12109   }
12110   llvm_unreachable("fully handled switch");
12111 }
12112 
12113 // For a byte position in the result of an Or, traverse the tree and find the
12114 // node (and the byte of the node) which ultimately provides this {Or,
12115 // BytePosition}. \p Op is the operand we are currently examining. \p Index is
12116 // the byte position of the Op that corresponds with the originally requested
12117 // byte of the Or \p Depth tracks how many recursive iterations we have
12118 // performed. \p StartingIndex is the originally requested byte of the Or
12119 static const std::optional<ByteProvider<SDValue>>
12120 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12121                       unsigned StartingIndex = 0) {
12122   // Finding Src tree of RHS of or typically requires at least 1 additional
12123   // depth
12124   if (Depth > 6)
12125     return std::nullopt;
12126 
12127   unsigned BitWidth = Op.getScalarValueSizeInBits();
12128   if (BitWidth % 8 != 0)
12129     return std::nullopt;
12130   if (Index > BitWidth / 8 - 1)
12131     return std::nullopt;
12132 
12133   bool IsVec = Op.getValueType().isVector();
12134   switch (Op.getOpcode()) {
12135   case ISD::OR: {
12136     if (IsVec)
12137       return std::nullopt;
12138 
12139     auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12140                                      StartingIndex);
12141     if (!RHS)
12142       return std::nullopt;
12143     auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12144                                      StartingIndex);
12145     if (!LHS)
12146       return std::nullopt;
12147     // A well formed Or will have two ByteProviders for each byte, one of which
12148     // is constant zero
12149     if (!LHS->isConstantZero() && !RHS->isConstantZero())
12150       return std::nullopt;
12151     if (!LHS || LHS->isConstantZero())
12152       return RHS;
12153     if (!RHS || RHS->isConstantZero())
12154       return LHS;
12155     return std::nullopt;
12156   }
12157 
12158   case ISD::AND: {
12159     if (IsVec)
12160       return std::nullopt;
12161 
12162     auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12163     if (!BitMaskOp)
12164       return std::nullopt;
12165 
12166     uint32_t BitMask = BitMaskOp->getZExtValue();
12167     // Bits we expect for our StartingIndex
12168     uint32_t IndexMask = 0xFF << (Index * 8);
12169 
12170     if ((IndexMask & BitMask) != IndexMask) {
12171       // If the result of the and partially provides the byte, then it
12172       // is not well formatted
12173       if (IndexMask & BitMask)
12174         return std::nullopt;
12175       return ByteProvider<SDValue>::getConstantZero();
12176     }
12177 
12178     return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12179   }
12180 
12181   case ISD::FSHR: {
12182     if (IsVec)
12183       return std::nullopt;
12184 
12185     // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12186     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12187     if (!ShiftOp || Op.getValueType().isVector())
12188       return std::nullopt;
12189 
12190     uint64_t BitsProvided = Op.getValueSizeInBits();
12191     if (BitsProvided % 8 != 0)
12192       return std::nullopt;
12193 
12194     uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12195     if (BitShift % 8)
12196       return std::nullopt;
12197 
12198     uint64_t ConcatSizeInBytes = BitsProvided / 4;
12199     uint64_t ByteShift = BitShift / 8;
12200 
12201     uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12202     uint64_t BytesProvided = BitsProvided / 8;
12203     SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12204     NewIndex %= BytesProvided;
12205     return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12206   }
12207 
12208   case ISD::SRA:
12209   case ISD::SRL: {
12210     if (IsVec)
12211       return std::nullopt;
12212 
12213     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12214     if (!ShiftOp)
12215       return std::nullopt;
12216 
12217     uint64_t BitShift = ShiftOp->getZExtValue();
12218     if (BitShift % 8)
12219       return std::nullopt;
12220 
12221     auto BitsProvided = Op.getScalarValueSizeInBits();
12222     if (BitsProvided % 8 != 0)
12223       return std::nullopt;
12224 
12225     uint64_t BytesProvided = BitsProvided / 8;
12226     uint64_t ByteShift = BitShift / 8;
12227     // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12228     // If the byte we are trying to provide (as tracked by index) falls in this
12229     // range, then the SRL provides the byte. The byte of interest of the src of
12230     // the SRL is Index + ByteShift
12231     return BytesProvided - ByteShift > Index
12232                ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12233                                   Index + ByteShift)
12234                : ByteProvider<SDValue>::getConstantZero();
12235   }
12236 
12237   case ISD::SHL: {
12238     if (IsVec)
12239       return std::nullopt;
12240 
12241     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12242     if (!ShiftOp)
12243       return std::nullopt;
12244 
12245     uint64_t BitShift = ShiftOp->getZExtValue();
12246     if (BitShift % 8 != 0)
12247       return std::nullopt;
12248     uint64_t ByteShift = BitShift / 8;
12249 
12250     // If we are shifting by an amount greater than (or equal to)
12251     // the index we are trying to provide, then it provides 0s. If not,
12252     // then this bytes are not definitively 0s, and the corresponding byte
12253     // of interest is Index - ByteShift of the src
12254     return Index < ByteShift
12255                ? ByteProvider<SDValue>::getConstantZero()
12256                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12257                                        Depth + 1, StartingIndex);
12258   }
12259   case ISD::ANY_EXTEND:
12260   case ISD::SIGN_EXTEND:
12261   case ISD::ZERO_EXTEND:
12262   case ISD::SIGN_EXTEND_INREG:
12263   case ISD::AssertZext:
12264   case ISD::AssertSext: {
12265     if (IsVec)
12266       return std::nullopt;
12267 
12268     SDValue NarrowOp = Op->getOperand(0);
12269     unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12270     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12271         Op->getOpcode() == ISD::AssertZext ||
12272         Op->getOpcode() == ISD::AssertSext) {
12273       auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12274       NarrowBitWidth = VTSign->getVT().getSizeInBits();
12275     }
12276     if (NarrowBitWidth % 8 != 0)
12277       return std::nullopt;
12278     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12279 
12280     if (Index >= NarrowByteWidth)
12281       return Op.getOpcode() == ISD::ZERO_EXTEND
12282                  ? std::optional<ByteProvider<SDValue>>(
12283                        ByteProvider<SDValue>::getConstantZero())
12284                  : std::nullopt;
12285     return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12286   }
12287 
12288   case ISD::TRUNCATE: {
12289     if (IsVec)
12290       return std::nullopt;
12291 
12292     uint64_t NarrowByteWidth = BitWidth / 8;
12293 
12294     if (NarrowByteWidth >= Index) {
12295       return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12296                                    StartingIndex);
12297     }
12298 
12299     return std::nullopt;
12300   }
12301 
12302   case ISD::CopyFromReg: {
12303     if (BitWidth / 8 > Index)
12304       return calculateSrcByte(Op, StartingIndex, Index);
12305 
12306     return std::nullopt;
12307   }
12308 
12309   case ISD::LOAD: {
12310     auto *L = cast<LoadSDNode>(Op.getNode());
12311 
12312     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12313     if (NarrowBitWidth % 8 != 0)
12314       return std::nullopt;
12315     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12316 
12317     // If the width of the load does not reach byte we are trying to provide for
12318     // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12319     // question
12320     if (Index >= NarrowByteWidth) {
12321       return L->getExtensionType() == ISD::ZEXTLOAD
12322                  ? std::optional<ByteProvider<SDValue>>(
12323                        ByteProvider<SDValue>::getConstantZero())
12324                  : std::nullopt;
12325     }
12326 
12327     if (NarrowByteWidth > Index) {
12328       return calculateSrcByte(Op, StartingIndex, Index);
12329     }
12330 
12331     return std::nullopt;
12332   }
12333 
12334   case ISD::BSWAP: {
12335     if (IsVec)
12336       return std::nullopt;
12337 
12338     return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12339                                  Depth + 1, StartingIndex);
12340   }
12341 
12342   case ISD::EXTRACT_VECTOR_ELT: {
12343     auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12344     if (!IdxOp)
12345       return std::nullopt;
12346     auto VecIdx = IdxOp->getZExtValue();
12347     auto ScalarSize = Op.getScalarValueSizeInBits();
12348     if (ScalarSize < 32)
12349       Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12350     return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12351                             StartingIndex, Index);
12352   }
12353 
12354   case AMDGPUISD::PERM: {
12355     if (IsVec)
12356       return std::nullopt;
12357 
12358     auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12359     if (!PermMask)
12360       return std::nullopt;
12361 
12362     auto IdxMask =
12363         (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12364     if (IdxMask > 0x07 && IdxMask != 0x0c)
12365       return std::nullopt;
12366 
12367     auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12368     auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12369 
12370     return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12371                            : ByteProvider<SDValue>(
12372                                  ByteProvider<SDValue>::getConstantZero());
12373   }
12374 
12375   default: {
12376     return std::nullopt;
12377   }
12378   }
12379 
12380   llvm_unreachable("fully handled switch");
12381 }
12382 
12383 // Returns true if the Operand is a scalar and is 16 bits
12384 static bool isExtendedFrom16Bits(SDValue &Operand) {
12385 
12386   switch (Operand.getOpcode()) {
12387   case ISD::ANY_EXTEND:
12388   case ISD::SIGN_EXTEND:
12389   case ISD::ZERO_EXTEND: {
12390     auto OpVT = Operand.getOperand(0).getValueType();
12391     return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12392   }
12393   case ISD::LOAD: {
12394     LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12395     auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12396     if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12397         ExtType == ISD::EXTLOAD) {
12398       auto MemVT = L->getMemoryVT();
12399       return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12400     }
12401     return L->getMemoryVT().getSizeInBits() == 16;
12402   }
12403   default:
12404     return false;
12405   }
12406 }
12407 
12408 // Returns true if the mask matches consecutive bytes, and the first byte
12409 // begins at a power of 2 byte offset from 0th byte
12410 static bool addresses16Bits(int Mask) {
12411   int Low8 = Mask & 0xff;
12412   int Hi8 = (Mask & 0xff00) >> 8;
12413 
12414   assert(Low8 < 8 && Hi8 < 8);
12415   // Are the bytes contiguous in the order of increasing addresses.
12416   bool IsConsecutive = (Hi8 - Low8 == 1);
12417   // Is the first byte at location that is aligned for 16 bit instructions.
12418   // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12419   // In this case, we still need code to extract the 16 bit operand, so it
12420   // is better to use i8 v_perm
12421   bool Is16Aligned = !(Low8 % 2);
12422 
12423   return IsConsecutive && Is16Aligned;
12424 }
12425 
12426 // Do not lower into v_perm if the operands are actually 16 bit
12427 // and the selected bits (based on PermMask) correspond with two
12428 // easily addressable 16 bit operands.
12429 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12430                                 SDValue &OtherOp) {
12431   int Low16 = PermMask & 0xffff;
12432   int Hi16 = (PermMask & 0xffff0000) >> 16;
12433 
12434   auto TempOp = peekThroughBitcasts(Op);
12435   auto TempOtherOp = peekThroughBitcasts(OtherOp);
12436 
12437   auto OpIs16Bit =
12438       TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12439   if (!OpIs16Bit)
12440     return true;
12441 
12442   auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12443                         isExtendedFrom16Bits(TempOtherOp);
12444   if (!OtherOpIs16Bit)
12445     return true;
12446 
12447   // Do we cleanly address both
12448   return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12449 }
12450 
12451 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12452                                   unsigned DWordOffset) {
12453   SDValue Ret;
12454 
12455   auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12456   // ByteProvider must be at least 8 bits
12457   assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12458 
12459   if (TypeSize <= 32)
12460     return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12461 
12462   if (Src.getValueType().isVector()) {
12463     auto ScalarTySize = Src.getScalarValueSizeInBits();
12464     auto ScalarTy = Src.getValueType().getScalarType();
12465     if (ScalarTySize == 32) {
12466       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12467                          DAG.getConstant(DWordOffset, SL, MVT::i32));
12468     }
12469     if (ScalarTySize > 32) {
12470       Ret = DAG.getNode(
12471           ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12472           DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12473       auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12474       if (ShiftVal)
12475         Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12476                           DAG.getConstant(ShiftVal, SL, MVT::i32));
12477       return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12478     }
12479 
12480     assert(ScalarTySize < 32);
12481     auto NumElements = TypeSize / ScalarTySize;
12482     auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12483     auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12484     auto NumElementsIn32 = 32 / ScalarTySize;
12485     auto NumAvailElements = DWordOffset < Trunc32Elements
12486                                 ? NumElementsIn32
12487                                 : NumElements - NormalizedTrunc;
12488 
12489     SmallVector<SDValue, 4> VecSrcs;
12490     DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12491                               NumAvailElements);
12492 
12493     Ret = DAG.getBuildVector(
12494         MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12495         VecSrcs);
12496     return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12497   }
12498 
12499   /// Scalar Type
12500   auto ShiftVal = 32 * DWordOffset;
12501   Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12502                     DAG.getConstant(ShiftVal, SL, MVT::i32));
12503   return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12504 }
12505 
12506 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12507   SelectionDAG &DAG = DCI.DAG;
12508   [[maybe_unused]] EVT VT = N->getValueType(0);
12509   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12510 
12511   // VT is known to be MVT::i32, so we need to provide 4 bytes.
12512   assert(VT == MVT::i32);
12513   for (int i = 0; i < 4; i++) {
12514     // Find the ByteProvider that provides the ith byte of the result of OR
12515     std::optional<ByteProvider<SDValue>> P =
12516         calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12517     // TODO support constantZero
12518     if (!P || P->isConstantZero())
12519       return SDValue();
12520 
12521     PermNodes.push_back(*P);
12522   }
12523   if (PermNodes.size() != 4)
12524     return SDValue();
12525 
12526   std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12527   std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12528   uint64_t PermMask = 0x00000000;
12529   for (size_t i = 0; i < PermNodes.size(); i++) {
12530     auto PermOp = PermNodes[i];
12531     // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12532     // by sizeof(Src2) = 4
12533     int SrcByteAdjust = 4;
12534 
12535     // If the Src uses a byte from a different DWORD, then it corresponds
12536     // with a difference source
12537     if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12538         ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12539       if (SecondSrc)
12540         if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12541             ((PermOp.SrcOffset / 4) != SecondSrc->second))
12542           return SDValue();
12543 
12544       // Set the index of the second distinct Src node
12545       SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12546       assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12547       SrcByteAdjust = 0;
12548     }
12549     assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12550     assert(!DAG.getDataLayout().isBigEndian());
12551     PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12552   }
12553   SDLoc DL(N);
12554   SDValue Op = *PermNodes[FirstSrc.first].Src;
12555   Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12556   assert(Op.getValueSizeInBits() == 32);
12557 
12558   // Check that we are not just extracting the bytes in order from an op
12559   if (!SecondSrc) {
12560     int Low16 = PermMask & 0xffff;
12561     int Hi16 = (PermMask & 0xffff0000) >> 16;
12562 
12563     bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12564     bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12565 
12566     // The perm op would really just produce Op. So combine into Op
12567     if (WellFormedLow && WellFormedHi)
12568       return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12569   }
12570 
12571   SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12572 
12573   if (SecondSrc) {
12574     OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12575     assert(OtherOp.getValueSizeInBits() == 32);
12576   }
12577 
12578   if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12579 
12580     assert(Op.getValueType().isByteSized() &&
12581            OtherOp.getValueType().isByteSized());
12582 
12583     // If the ultimate src is less than 32 bits, then we will only be
12584     // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12585     // CalculateByteProvider would not have returned Op as source if we
12586     // used a byte that is outside its ValueType. Thus, we are free to
12587     // ANY_EXTEND as the extended bits are dont-cares.
12588     Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12589     OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12590 
12591     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12592                        DAG.getConstant(PermMask, DL, MVT::i32));
12593   }
12594   return SDValue();
12595 }
12596 
12597 SDValue SITargetLowering::performOrCombine(SDNode *N,
12598                                            DAGCombinerInfo &DCI) const {
12599   SelectionDAG &DAG = DCI.DAG;
12600   SDValue LHS = N->getOperand(0);
12601   SDValue RHS = N->getOperand(1);
12602 
12603   EVT VT = N->getValueType(0);
12604   if (VT == MVT::i1) {
12605     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12606     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12607         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12608       SDValue Src = LHS.getOperand(0);
12609       if (Src != RHS.getOperand(0))
12610         return SDValue();
12611 
12612       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12613       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12614       if (!CLHS || !CRHS)
12615         return SDValue();
12616 
12617       // Only 10 bits are used.
12618       static const uint32_t MaxMask = 0x3ff;
12619 
12620       uint32_t NewMask =
12621           (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12622       SDLoc DL(N);
12623       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
12624                          DAG.getConstant(NewMask, DL, MVT::i32));
12625     }
12626 
12627     return SDValue();
12628   }
12629 
12630   // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12631   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12632       LHS.getOpcode() == AMDGPUISD::PERM &&
12633       isa<ConstantSDNode>(LHS.getOperand(2))) {
12634     uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12635     if (!Sel)
12636       return SDValue();
12637 
12638     Sel |= LHS.getConstantOperandVal(2);
12639     SDLoc DL(N);
12640     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12641                        LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12642   }
12643 
12644   // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12645   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12646   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12647       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12648 
12649     // If all the uses of an or need to extract the individual elements, do not
12650     // attempt to lower into v_perm
12651     auto usesCombinedOperand = [](SDNode *OrUse) {
12652       // If we have any non-vectorized use, then it is a candidate for v_perm
12653       if (OrUse->getOpcode() != ISD::BITCAST ||
12654           !OrUse->getValueType(0).isVector())
12655         return true;
12656 
12657       // If we have any non-vectorized use, then it is a candidate for v_perm
12658       for (auto *VUser : OrUse->users()) {
12659         if (!VUser->getValueType(0).isVector())
12660           return true;
12661 
12662         // If the use of a vector is a store, then combining via a v_perm
12663         // is beneficial.
12664         // TODO -- whitelist more uses
12665         for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12666           if (VUser->getOpcode() == VectorwiseOp)
12667             return true;
12668       }
12669       return false;
12670     };
12671 
12672     if (!any_of(N->users(), usesCombinedOperand))
12673       return SDValue();
12674 
12675     uint32_t LHSMask = getPermuteMask(LHS);
12676     uint32_t RHSMask = getPermuteMask(RHS);
12677 
12678     if (LHSMask != ~0u && RHSMask != ~0u) {
12679       // Canonicalize the expression in an attempt to have fewer unique masks
12680       // and therefore fewer registers used to hold the masks.
12681       if (LHSMask > RHSMask) {
12682         std::swap(LHSMask, RHSMask);
12683         std::swap(LHS, RHS);
12684       }
12685 
12686       // Select 0xc for each lane used from source operand. Zero has 0xc mask
12687       // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12688       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12689       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12690 
12691       // Check of we need to combine values from two sources within a byte.
12692       if (!(LHSUsedLanes & RHSUsedLanes) &&
12693           // If we select high and lower word keep it for SDWA.
12694           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12695           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12696         // Kill zero bytes selected by other mask. Zero value is 0xc.
12697         LHSMask &= ~RHSUsedLanes;
12698         RHSMask &= ~LHSUsedLanes;
12699         // Add 4 to each active LHS lane
12700         LHSMask |= LHSUsedLanes & 0x04040404;
12701         // Combine masks
12702         uint32_t Sel = LHSMask | RHSMask;
12703         SDLoc DL(N);
12704 
12705         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12706                            RHS.getOperand(0),
12707                            DAG.getConstant(Sel, DL, MVT::i32));
12708       }
12709     }
12710     if (LHSMask == ~0u || RHSMask == ~0u) {
12711       if (SDValue Perm = matchPERM(N, DCI))
12712         return Perm;
12713     }
12714   }
12715 
12716   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12717     return SDValue();
12718 
12719   // TODO: This could be a generic combine with a predicate for extracting the
12720   // high half of an integer being free.
12721 
12722   // (or i64:x, (zero_extend i32:y)) ->
12723   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12724   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12725       RHS.getOpcode() != ISD::ZERO_EXTEND)
12726     std::swap(LHS, RHS);
12727 
12728   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12729     SDValue ExtSrc = RHS.getOperand(0);
12730     EVT SrcVT = ExtSrc.getValueType();
12731     if (SrcVT == MVT::i32) {
12732       SDLoc SL(N);
12733       auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
12734       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12735 
12736       DCI.AddToWorklist(LowOr.getNode());
12737       DCI.AddToWorklist(HiBits.getNode());
12738 
12739       SDValue Vec =
12740           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
12741       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12742     }
12743   }
12744 
12745   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12746   if (CRHS) {
12747     if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12748                                                  N->getOperand(0), CRHS))
12749       return Split;
12750   }
12751 
12752   return SDValue();
12753 }
12754 
12755 SDValue SITargetLowering::performXorCombine(SDNode *N,
12756                                             DAGCombinerInfo &DCI) const {
12757   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12758     return RV;
12759 
12760   SDValue LHS = N->getOperand(0);
12761   SDValue RHS = N->getOperand(1);
12762 
12763   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12764   SelectionDAG &DAG = DCI.DAG;
12765 
12766   EVT VT = N->getValueType(0);
12767   if (CRHS && VT == MVT::i64) {
12768     if (SDValue Split =
12769             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12770       return Split;
12771   }
12772 
12773   // Make sure to apply the 64-bit constant splitting fold before trying to fold
12774   // fneg-like xors into 64-bit select.
12775   if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12776     // This looks like an fneg, try to fold as a source modifier.
12777     if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12778         shouldFoldFNegIntoSrc(N, LHS)) {
12779       // xor (select c, a, b), 0x80000000 ->
12780       //   bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12781       SDLoc DL(N);
12782       SDValue CastLHS =
12783           DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12784       SDValue CastRHS =
12785           DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12786       SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12787       SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12788       SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12789                                       LHS->getOperand(0), FNegLHS, FNegRHS);
12790       return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12791     }
12792   }
12793 
12794   return SDValue();
12795 }
12796 
12797 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12798                                                    DAGCombinerInfo &DCI) const {
12799   if (!Subtarget->has16BitInsts() ||
12800       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12801     return SDValue();
12802 
12803   EVT VT = N->getValueType(0);
12804   if (VT != MVT::i32)
12805     return SDValue();
12806 
12807   SDValue Src = N->getOperand(0);
12808   if (Src.getValueType() != MVT::i16)
12809     return SDValue();
12810 
12811   return SDValue();
12812 }
12813 
12814 SDValue
12815 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12816                                                 DAGCombinerInfo &DCI) const {
12817   SDValue Src = N->getOperand(0);
12818   auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12819 
12820   // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12821   // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12822   if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12823         VTSign->getVT() == MVT::i8) ||
12824        (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12825         VTSign->getVT() == MVT::i16))) {
12826     assert(Subtarget->hasScalarSubwordLoads() &&
12827            "s_buffer_load_{u8, i8} are supported "
12828            "in GFX12 (or newer) architectures.");
12829     EVT VT = Src.getValueType();
12830     unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12831                        ? AMDGPUISD::SBUFFER_LOAD_BYTE
12832                        : AMDGPUISD::SBUFFER_LOAD_SHORT;
12833     SDLoc DL(N);
12834     SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12835     SDValue Ops[] = {
12836         Src.getOperand(0), // source register
12837         Src.getOperand(1), // offset
12838         Src.getOperand(2)  // cachePolicy
12839     };
12840     auto *M = cast<MemSDNode>(Src);
12841     SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12842         Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12843     SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12844     return LoadVal;
12845   }
12846   if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12847         VTSign->getVT() == MVT::i8) ||
12848        (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12849         VTSign->getVT() == MVT::i16)) &&
12850       Src.hasOneUse()) {
12851     auto *M = cast<MemSDNode>(Src);
12852     SDValue Ops[] = {Src.getOperand(0), // Chain
12853                      Src.getOperand(1), // rsrc
12854                      Src.getOperand(2), // vindex
12855                      Src.getOperand(3), // voffset
12856                      Src.getOperand(4), // soffset
12857                      Src.getOperand(5), // offset
12858                      Src.getOperand(6), Src.getOperand(7)};
12859     // replace with BUFFER_LOAD_BYTE/SHORT
12860     SDVTList ResList =
12861         DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
12862     unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
12863                        ? AMDGPUISD::BUFFER_LOAD_BYTE
12864                        : AMDGPUISD::BUFFER_LOAD_SHORT;
12865     SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
12866         Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12867     return DCI.DAG.getMergeValues(
12868         {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
12869   }
12870   return SDValue();
12871 }
12872 
12873 SDValue SITargetLowering::performClassCombine(SDNode *N,
12874                                               DAGCombinerInfo &DCI) const {
12875   SelectionDAG &DAG = DCI.DAG;
12876   SDValue Mask = N->getOperand(1);
12877 
12878   // fp_class x, 0 -> false
12879   if (isNullConstant(Mask))
12880     return DAG.getConstant(0, SDLoc(N), MVT::i1);
12881 
12882   if (N->getOperand(0).isUndef())
12883     return DAG.getUNDEF(MVT::i1);
12884 
12885   return SDValue();
12886 }
12887 
12888 SDValue SITargetLowering::performRcpCombine(SDNode *N,
12889                                             DAGCombinerInfo &DCI) const {
12890   EVT VT = N->getValueType(0);
12891   SDValue N0 = N->getOperand(0);
12892 
12893   if (N0.isUndef()) {
12894     return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12895                                  SDLoc(N), VT);
12896   }
12897 
12898   if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12899                          N0.getOpcode() == ISD::SINT_TO_FP)) {
12900     return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12901                            N->getFlags());
12902   }
12903 
12904   // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12905   if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12906       N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12907     return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
12908                            N->getFlags());
12909   }
12910 
12911   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12912 }
12913 
12914 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12915                                        unsigned MaxDepth) const {
12916   unsigned Opcode = Op.getOpcode();
12917   if (Opcode == ISD::FCANONICALIZE)
12918     return true;
12919 
12920   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12921     const auto &F = CFP->getValueAPF();
12922     if (F.isNaN() && F.isSignaling())
12923       return false;
12924     if (!F.isDenormal())
12925       return true;
12926 
12927     DenormalMode Mode =
12928         DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12929     return Mode == DenormalMode::getIEEE();
12930   }
12931 
12932   // If source is a result of another standard FP operation it is already in
12933   // canonical form.
12934   if (MaxDepth == 0)
12935     return false;
12936 
12937   switch (Opcode) {
12938   // These will flush denorms if required.
12939   case ISD::FADD:
12940   case ISD::FSUB:
12941   case ISD::FMUL:
12942   case ISD::FCEIL:
12943   case ISD::FFLOOR:
12944   case ISD::FMA:
12945   case ISD::FMAD:
12946   case ISD::FSQRT:
12947   case ISD::FDIV:
12948   case ISD::FREM:
12949   case ISD::FP_ROUND:
12950   case ISD::FP_EXTEND:
12951   case ISD::FP16_TO_FP:
12952   case ISD::FP_TO_FP16:
12953   case ISD::BF16_TO_FP:
12954   case ISD::FP_TO_BF16:
12955   case ISD::FLDEXP:
12956   case AMDGPUISD::FMUL_LEGACY:
12957   case AMDGPUISD::FMAD_FTZ:
12958   case AMDGPUISD::RCP:
12959   case AMDGPUISD::RSQ:
12960   case AMDGPUISD::RSQ_CLAMP:
12961   case AMDGPUISD::RCP_LEGACY:
12962   case AMDGPUISD::RCP_IFLAG:
12963   case AMDGPUISD::LOG:
12964   case AMDGPUISD::EXP:
12965   case AMDGPUISD::DIV_SCALE:
12966   case AMDGPUISD::DIV_FMAS:
12967   case AMDGPUISD::DIV_FIXUP:
12968   case AMDGPUISD::FRACT:
12969   case AMDGPUISD::CVT_PKRTZ_F16_F32:
12970   case AMDGPUISD::CVT_F32_UBYTE0:
12971   case AMDGPUISD::CVT_F32_UBYTE1:
12972   case AMDGPUISD::CVT_F32_UBYTE2:
12973   case AMDGPUISD::CVT_F32_UBYTE3:
12974   case AMDGPUISD::FP_TO_FP16:
12975   case AMDGPUISD::SIN_HW:
12976   case AMDGPUISD::COS_HW:
12977     return true;
12978 
12979   // It can/will be lowered or combined as a bit operation.
12980   // Need to check their input recursively to handle.
12981   case ISD::FNEG:
12982   case ISD::FABS:
12983   case ISD::FCOPYSIGN:
12984     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12985 
12986   case ISD::AND:
12987     if (Op.getValueType() == MVT::i32) {
12988       // Be careful as we only know it is a bitcast floating point type. It
12989       // could be f32, v2f16, we have no way of knowing. Luckily the constant
12990       // value that we optimize for, which comes up in fp32 to bf16 conversions,
12991       // is valid to optimize for all types.
12992       if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12993         if (RHS->getZExtValue() == 0xffff0000) {
12994           return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12995         }
12996       }
12997     }
12998     break;
12999 
13000   case ISD::FSIN:
13001   case ISD::FCOS:
13002   case ISD::FSINCOS:
13003     return Op.getValueType().getScalarType() != MVT::f16;
13004 
13005   case ISD::FMINNUM:
13006   case ISD::FMAXNUM:
13007   case ISD::FMINNUM_IEEE:
13008   case ISD::FMAXNUM_IEEE:
13009   case ISD::FMINIMUM:
13010   case ISD::FMAXIMUM:
13011   case AMDGPUISD::CLAMP:
13012   case AMDGPUISD::FMED3:
13013   case AMDGPUISD::FMAX3:
13014   case AMDGPUISD::FMIN3:
13015   case AMDGPUISD::FMAXIMUM3:
13016   case AMDGPUISD::FMINIMUM3: {
13017     // FIXME: Shouldn't treat the generic operations different based these.
13018     // However, we aren't really required to flush the result from
13019     // minnum/maxnum..
13020 
13021     // snans will be quieted, so we only need to worry about denormals.
13022     if (Subtarget->supportsMinMaxDenormModes() ||
13023         // FIXME: denormalsEnabledForType is broken for dynamic
13024         denormalsEnabledForType(DAG, Op.getValueType()))
13025       return true;
13026 
13027     // Flushing may be required.
13028     // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
13029     // targets need to check their input recursively.
13030 
13031     // FIXME: Does this apply with clamp? It's implemented with max.
13032     for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13033       if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13034         return false;
13035     }
13036 
13037     return true;
13038   }
13039   case ISD::SELECT: {
13040     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13041            isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13042   }
13043   case ISD::BUILD_VECTOR: {
13044     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13045       SDValue SrcOp = Op.getOperand(i);
13046       if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13047         return false;
13048     }
13049 
13050     return true;
13051   }
13052   case ISD::EXTRACT_VECTOR_ELT:
13053   case ISD::EXTRACT_SUBVECTOR: {
13054     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13055   }
13056   case ISD::INSERT_VECTOR_ELT: {
13057     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13058            isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13059   }
13060   case ISD::UNDEF:
13061     // Could be anything.
13062     return false;
13063 
13064   case ISD::BITCAST:
13065     // TODO: This is incorrect as it loses track of the operand's type. We may
13066     // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13067     // same bits that are canonicalized in one type need not be in the other.
13068     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13069   case ISD::TRUNCATE: {
13070     // Hack round the mess we make when legalizing extract_vector_elt
13071     if (Op.getValueType() == MVT::i16) {
13072       SDValue TruncSrc = Op.getOperand(0);
13073       if (TruncSrc.getValueType() == MVT::i32 &&
13074           TruncSrc.getOpcode() == ISD::BITCAST &&
13075           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13076         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13077       }
13078     }
13079     return false;
13080   }
13081   case ISD::INTRINSIC_WO_CHAIN: {
13082     unsigned IntrinsicID = Op.getConstantOperandVal(0);
13083     // TODO: Handle more intrinsics
13084     switch (IntrinsicID) {
13085     case Intrinsic::amdgcn_cvt_pkrtz:
13086     case Intrinsic::amdgcn_cubeid:
13087     case Intrinsic::amdgcn_frexp_mant:
13088     case Intrinsic::amdgcn_fdot2:
13089     case Intrinsic::amdgcn_rcp:
13090     case Intrinsic::amdgcn_rsq:
13091     case Intrinsic::amdgcn_rsq_clamp:
13092     case Intrinsic::amdgcn_rcp_legacy:
13093     case Intrinsic::amdgcn_rsq_legacy:
13094     case Intrinsic::amdgcn_trig_preop:
13095     case Intrinsic::amdgcn_log:
13096     case Intrinsic::amdgcn_exp2:
13097     case Intrinsic::amdgcn_sqrt:
13098       return true;
13099     default:
13100       break;
13101     }
13102 
13103     break;
13104   }
13105   default:
13106     break;
13107   }
13108 
13109   // FIXME: denormalsEnabledForType is broken for dynamic
13110   return denormalsEnabledForType(DAG, Op.getValueType()) &&
13111          DAG.isKnownNeverSNaN(Op);
13112 }
13113 
13114 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13115                                        unsigned MaxDepth) const {
13116   const MachineRegisterInfo &MRI = MF.getRegInfo();
13117   MachineInstr *MI = MRI.getVRegDef(Reg);
13118   unsigned Opcode = MI->getOpcode();
13119 
13120   if (Opcode == AMDGPU::G_FCANONICALIZE)
13121     return true;
13122 
13123   std::optional<FPValueAndVReg> FCR;
13124   // Constant splat (can be padded with undef) or scalar constant.
13125   if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13126     if (FCR->Value.isSignaling())
13127       return false;
13128     if (!FCR->Value.isDenormal())
13129       return true;
13130 
13131     DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13132     return Mode == DenormalMode::getIEEE();
13133   }
13134 
13135   if (MaxDepth == 0)
13136     return false;
13137 
13138   switch (Opcode) {
13139   case AMDGPU::G_FADD:
13140   case AMDGPU::G_FSUB:
13141   case AMDGPU::G_FMUL:
13142   case AMDGPU::G_FCEIL:
13143   case AMDGPU::G_FFLOOR:
13144   case AMDGPU::G_FRINT:
13145   case AMDGPU::G_FNEARBYINT:
13146   case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13147   case AMDGPU::G_INTRINSIC_TRUNC:
13148   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13149   case AMDGPU::G_FMA:
13150   case AMDGPU::G_FMAD:
13151   case AMDGPU::G_FSQRT:
13152   case AMDGPU::G_FDIV:
13153   case AMDGPU::G_FREM:
13154   case AMDGPU::G_FPOW:
13155   case AMDGPU::G_FPEXT:
13156   case AMDGPU::G_FLOG:
13157   case AMDGPU::G_FLOG2:
13158   case AMDGPU::G_FLOG10:
13159   case AMDGPU::G_FPTRUNC:
13160   case AMDGPU::G_AMDGPU_RCP_IFLAG:
13161   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13162   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13163   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13164   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13165     return true;
13166   case AMDGPU::G_FNEG:
13167   case AMDGPU::G_FABS:
13168   case AMDGPU::G_FCOPYSIGN:
13169     return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13170   case AMDGPU::G_FMINNUM:
13171   case AMDGPU::G_FMAXNUM:
13172   case AMDGPU::G_FMINNUM_IEEE:
13173   case AMDGPU::G_FMAXNUM_IEEE:
13174   case AMDGPU::G_FMINIMUM:
13175   case AMDGPU::G_FMAXIMUM: {
13176     if (Subtarget->supportsMinMaxDenormModes() ||
13177         // FIXME: denormalsEnabledForType is broken for dynamic
13178         denormalsEnabledForType(MRI.getType(Reg), MF))
13179       return true;
13180 
13181     [[fallthrough]];
13182   }
13183   case AMDGPU::G_BUILD_VECTOR:
13184     for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13185       if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13186         return false;
13187     return true;
13188   case AMDGPU::G_INTRINSIC:
13189   case AMDGPU::G_INTRINSIC_CONVERGENT:
13190     switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13191     case Intrinsic::amdgcn_fmul_legacy:
13192     case Intrinsic::amdgcn_fmad_ftz:
13193     case Intrinsic::amdgcn_sqrt:
13194     case Intrinsic::amdgcn_fmed3:
13195     case Intrinsic::amdgcn_sin:
13196     case Intrinsic::amdgcn_cos:
13197     case Intrinsic::amdgcn_log:
13198     case Intrinsic::amdgcn_exp2:
13199     case Intrinsic::amdgcn_log_clamp:
13200     case Intrinsic::amdgcn_rcp:
13201     case Intrinsic::amdgcn_rcp_legacy:
13202     case Intrinsic::amdgcn_rsq:
13203     case Intrinsic::amdgcn_rsq_clamp:
13204     case Intrinsic::amdgcn_rsq_legacy:
13205     case Intrinsic::amdgcn_div_scale:
13206     case Intrinsic::amdgcn_div_fmas:
13207     case Intrinsic::amdgcn_div_fixup:
13208     case Intrinsic::amdgcn_fract:
13209     case Intrinsic::amdgcn_cvt_pkrtz:
13210     case Intrinsic::amdgcn_cubeid:
13211     case Intrinsic::amdgcn_cubema:
13212     case Intrinsic::amdgcn_cubesc:
13213     case Intrinsic::amdgcn_cubetc:
13214     case Intrinsic::amdgcn_frexp_mant:
13215     case Intrinsic::amdgcn_fdot2:
13216     case Intrinsic::amdgcn_trig_preop:
13217       return true;
13218     default:
13219       break;
13220     }
13221 
13222     [[fallthrough]];
13223   default:
13224     return false;
13225   }
13226 
13227   llvm_unreachable("invalid operation");
13228 }
13229 
13230 // Constant fold canonicalize.
13231 SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13232                                                  const SDLoc &SL, EVT VT,
13233                                                  const APFloat &C) const {
13234   // Flush denormals to 0 if not enabled.
13235   if (C.isDenormal()) {
13236     DenormalMode Mode =
13237         DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13238     if (Mode == DenormalMode::getPreserveSign()) {
13239       return DAG.getConstantFP(
13240           APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13241     }
13242 
13243     if (Mode != DenormalMode::getIEEE())
13244       return SDValue();
13245   }
13246 
13247   if (C.isNaN()) {
13248     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13249     if (C.isSignaling()) {
13250       // Quiet a signaling NaN.
13251       // FIXME: Is this supposed to preserve payload bits?
13252       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13253     }
13254 
13255     // Make sure it is the canonical NaN bitpattern.
13256     //
13257     // TODO: Can we use -1 as the canonical NaN value since it's an inline
13258     // immediate?
13259     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13260       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13261   }
13262 
13263   // Already canonical.
13264   return DAG.getConstantFP(C, SL, VT);
13265 }
13266 
13267 static bool vectorEltWillFoldAway(SDValue Op) {
13268   return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13269 }
13270 
13271 SDValue
13272 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13273                                               DAGCombinerInfo &DCI) const {
13274   SelectionDAG &DAG = DCI.DAG;
13275   SDValue N0 = N->getOperand(0);
13276   EVT VT = N->getValueType(0);
13277 
13278   // fcanonicalize undef -> qnan
13279   if (N0.isUndef()) {
13280     APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13281     return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13282   }
13283 
13284   if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13285     EVT VT = N->getValueType(0);
13286     return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13287   }
13288 
13289   // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13290   //                                                   (fcanonicalize k)
13291   //
13292   // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13293 
13294   // TODO: This could be better with wider vectors that will be split to v2f16,
13295   // and to consider uses since there aren't that many packed operations.
13296   if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13297       isTypeLegal(MVT::v2f16)) {
13298     SDLoc SL(N);
13299     SDValue NewElts[2];
13300     SDValue Lo = N0.getOperand(0);
13301     SDValue Hi = N0.getOperand(1);
13302     EVT EltVT = Lo.getValueType();
13303 
13304     if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
13305       for (unsigned I = 0; I != 2; ++I) {
13306         SDValue Op = N0.getOperand(I);
13307         if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13308           NewElts[I] =
13309               getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13310         } else if (Op.isUndef()) {
13311           // Handled below based on what the other operand is.
13312           NewElts[I] = Op;
13313         } else {
13314           NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13315         }
13316       }
13317 
13318       // If one half is undef, and one is constant, prefer a splat vector rather
13319       // than the normal qNaN. If it's a register, prefer 0.0 since that's
13320       // cheaper to use and may be free with a packed operation.
13321       if (NewElts[0].isUndef()) {
13322         if (isa<ConstantFPSDNode>(NewElts[1]))
13323           NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13324                            ? NewElts[1]
13325                            : DAG.getConstantFP(0.0f, SL, EltVT);
13326       }
13327 
13328       if (NewElts[1].isUndef()) {
13329         NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13330                          ? NewElts[0]
13331                          : DAG.getConstantFP(0.0f, SL, EltVT);
13332       }
13333 
13334       return DAG.getBuildVector(VT, SL, NewElts);
13335     }
13336   }
13337 
13338   return SDValue();
13339 }
13340 
13341 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13342   switch (Opc) {
13343   case ISD::FMAXNUM:
13344   case ISD::FMAXNUM_IEEE:
13345     return AMDGPUISD::FMAX3;
13346   case ISD::FMAXIMUM:
13347     return AMDGPUISD::FMAXIMUM3;
13348   case ISD::SMAX:
13349     return AMDGPUISD::SMAX3;
13350   case ISD::UMAX:
13351     return AMDGPUISD::UMAX3;
13352   case ISD::FMINNUM:
13353   case ISD::FMINNUM_IEEE:
13354     return AMDGPUISD::FMIN3;
13355   case ISD::FMINIMUM:
13356     return AMDGPUISD::FMINIMUM3;
13357   case ISD::SMIN:
13358     return AMDGPUISD::SMIN3;
13359   case ISD::UMIN:
13360     return AMDGPUISD::UMIN3;
13361   default:
13362     llvm_unreachable("Not a min/max opcode");
13363   }
13364 }
13365 
13366 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13367                                                    const SDLoc &SL, SDValue Src,
13368                                                    SDValue MinVal,
13369                                                    SDValue MaxVal,
13370                                                    bool Signed) const {
13371 
13372   // med3 comes from
13373   //    min(max(x, K0), K1), K0 < K1
13374   //    max(min(x, K0), K1), K1 < K0
13375   //
13376   // "MinVal" and "MaxVal" respectively refer to the rhs of the
13377   // min/max op.
13378   ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13379   ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13380 
13381   if (!MinK || !MaxK)
13382     return SDValue();
13383 
13384   if (Signed) {
13385     if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13386       return SDValue();
13387   } else {
13388     if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13389       return SDValue();
13390   }
13391 
13392   EVT VT = MinK->getValueType(0);
13393   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13394   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13395     return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13396 
13397   // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13398   // not available, but this is unlikely to be profitable as constants
13399   // will often need to be materialized & extended, especially on
13400   // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13401   return SDValue();
13402 }
13403 
13404 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13405   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13406     return C;
13407 
13408   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13409     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13410       return C;
13411   }
13412 
13413   return nullptr;
13414 }
13415 
13416 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13417                                                   const SDLoc &SL, SDValue Op0,
13418                                                   SDValue Op1) const {
13419   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13420   if (!K1)
13421     return SDValue();
13422 
13423   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13424   if (!K0)
13425     return SDValue();
13426 
13427   // Ordered >= (although NaN inputs should have folded away by now).
13428   if (K0->getValueAPF() > K1->getValueAPF())
13429     return SDValue();
13430 
13431   const MachineFunction &MF = DAG.getMachineFunction();
13432   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13433 
13434   // TODO: Check IEEE bit enabled?
13435   EVT VT = Op0.getValueType();
13436   if (Info->getMode().DX10Clamp) {
13437     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13438     // hardware fmed3 behavior converting to a min.
13439     // FIXME: Should this be allowing -0.0?
13440     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13441       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13442   }
13443 
13444   // med3 for f16 is only available on gfx9+, and not available for v2f16.
13445   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13446     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13447     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13448     // then give the other result, which is different from med3 with a NaN
13449     // input.
13450     SDValue Var = Op0.getOperand(0);
13451     if (!DAG.isKnownNeverSNaN(Var))
13452       return SDValue();
13453 
13454     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13455 
13456     if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13457         (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13458       return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13459                          SDValue(K0, 0), SDValue(K1, 0));
13460     }
13461   }
13462 
13463   return SDValue();
13464 }
13465 
13466 /// \return true if the subtarget supports minimum3 and maximum3 with the given
13467 /// base min/max opcode \p Opc for type \p VT.
13468 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13469                              EVT VT) {
13470   switch (Opc) {
13471   case ISD::FMINNUM:
13472   case ISD::FMAXNUM:
13473   case ISD::FMINNUM_IEEE:
13474   case ISD::FMAXNUM_IEEE:
13475   case AMDGPUISD::FMIN_LEGACY:
13476   case AMDGPUISD::FMAX_LEGACY:
13477     return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13478   case ISD::FMINIMUM:
13479   case ISD::FMAXIMUM:
13480     return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13481            (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
13482   case ISD::SMAX:
13483   case ISD::SMIN:
13484   case ISD::UMAX:
13485   case ISD::UMIN:
13486     return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13487   default:
13488     return false;
13489   }
13490 
13491   llvm_unreachable("not a min/max opcode");
13492 }
13493 
13494 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13495                                                DAGCombinerInfo &DCI) const {
13496   SelectionDAG &DAG = DCI.DAG;
13497 
13498   EVT VT = N->getValueType(0);
13499   unsigned Opc = N->getOpcode();
13500   SDValue Op0 = N->getOperand(0);
13501   SDValue Op1 = N->getOperand(1);
13502 
13503   // Only do this if the inner op has one use since this will just increases
13504   // register pressure for no benefit.
13505 
13506   if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13507     // max(max(a, b), c) -> max3(a, b, c)
13508     // min(min(a, b), c) -> min3(a, b, c)
13509     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13510       SDLoc DL(N);
13511       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13512                          Op0.getOperand(0), Op0.getOperand(1), Op1);
13513     }
13514 
13515     // Try commuted.
13516     // max(a, max(b, c)) -> max3(a, b, c)
13517     // min(a, min(b, c)) -> min3(a, b, c)
13518     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13519       SDLoc DL(N);
13520       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13521                          Op0, Op1.getOperand(0), Op1.getOperand(1));
13522     }
13523   }
13524 
13525   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13526   // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13527   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13528     if (SDValue Med3 = performIntMed3ImmCombine(
13529             DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13530       return Med3;
13531   }
13532   if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13533     if (SDValue Med3 = performIntMed3ImmCombine(
13534             DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13535       return Med3;
13536   }
13537 
13538   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13539     if (SDValue Med3 = performIntMed3ImmCombine(
13540             DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13541       return Med3;
13542   }
13543   if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13544     if (SDValue Med3 = performIntMed3ImmCombine(
13545             DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13546       return Med3;
13547   }
13548 
13549   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13550   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13551        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13552        (Opc == AMDGPUISD::FMIN_LEGACY &&
13553         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13554       (VT == MVT::f32 || VT == MVT::f64 ||
13555        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13556        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13557       Op0.hasOneUse()) {
13558     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13559       return Res;
13560   }
13561 
13562   return SDValue();
13563 }
13564 
13565 static bool isClampZeroToOne(SDValue A, SDValue B) {
13566   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13567     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13568       // FIXME: Should this be allowing -0.0?
13569       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13570              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13571     }
13572   }
13573 
13574   return false;
13575 }
13576 
13577 // FIXME: Should only worry about snans for version with chain.
13578 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13579                                               DAGCombinerInfo &DCI) const {
13580   EVT VT = N->getValueType(0);
13581   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13582   // NaNs. With a NaN input, the order of the operands may change the result.
13583 
13584   SelectionDAG &DAG = DCI.DAG;
13585   SDLoc SL(N);
13586 
13587   SDValue Src0 = N->getOperand(0);
13588   SDValue Src1 = N->getOperand(1);
13589   SDValue Src2 = N->getOperand(2);
13590 
13591   if (isClampZeroToOne(Src0, Src1)) {
13592     // const_a, const_b, x -> clamp is safe in all cases including signaling
13593     // nans.
13594     // FIXME: Should this be allowing -0.0?
13595     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13596   }
13597 
13598   const MachineFunction &MF = DAG.getMachineFunction();
13599   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13600 
13601   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13602   // handling no dx10-clamp?
13603   if (Info->getMode().DX10Clamp) {
13604     // If NaNs is clamped to 0, we are free to reorder the inputs.
13605 
13606     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13607       std::swap(Src0, Src1);
13608 
13609     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13610       std::swap(Src1, Src2);
13611 
13612     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13613       std::swap(Src0, Src1);
13614 
13615     if (isClampZeroToOne(Src1, Src2))
13616       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13617   }
13618 
13619   return SDValue();
13620 }
13621 
13622 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13623                                                  DAGCombinerInfo &DCI) const {
13624   SDValue Src0 = N->getOperand(0);
13625   SDValue Src1 = N->getOperand(1);
13626   if (Src0.isUndef() && Src1.isUndef())
13627     return DCI.DAG.getUNDEF(N->getValueType(0));
13628   return SDValue();
13629 }
13630 
13631 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13632 // expanded into a set of cmp/select instructions.
13633 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13634                                                 unsigned NumElem,
13635                                                 bool IsDivergentIdx,
13636                                                 const GCNSubtarget *Subtarget) {
13637   if (UseDivergentRegisterIndexing)
13638     return false;
13639 
13640   unsigned VecSize = EltSize * NumElem;
13641 
13642   // Sub-dword vectors of size 2 dword or less have better implementation.
13643   if (VecSize <= 64 && EltSize < 32)
13644     return false;
13645 
13646   // Always expand the rest of sub-dword instructions, otherwise it will be
13647   // lowered via memory.
13648   if (EltSize < 32)
13649     return true;
13650 
13651   // Always do this if var-idx is divergent, otherwise it will become a loop.
13652   if (IsDivergentIdx)
13653     return true;
13654 
13655   // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13656   unsigned NumInsts = NumElem /* Number of compares */ +
13657                       ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13658 
13659   // On some architectures (GFX9) movrel is not available and it's better
13660   // to expand.
13661   if (Subtarget->useVGPRIndexMode())
13662     return NumInsts <= 16;
13663 
13664   // If movrel is available, use it instead of expanding for vector of 8
13665   // elements.
13666   if (Subtarget->hasMovrel())
13667     return NumInsts <= 15;
13668 
13669   return true;
13670 }
13671 
13672 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13673   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13674   if (isa<ConstantSDNode>(Idx))
13675     return false;
13676 
13677   SDValue Vec = N->getOperand(0);
13678   EVT VecVT = Vec.getValueType();
13679   EVT EltVT = VecVT.getVectorElementType();
13680   unsigned EltSize = EltVT.getSizeInBits();
13681   unsigned NumElem = VecVT.getVectorNumElements();
13682 
13683   return SITargetLowering::shouldExpandVectorDynExt(
13684       EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13685 }
13686 
13687 SDValue
13688 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
13689                                                  DAGCombinerInfo &DCI) const {
13690   SDValue Vec = N->getOperand(0);
13691   SelectionDAG &DAG = DCI.DAG;
13692 
13693   EVT VecVT = Vec.getValueType();
13694   EVT VecEltVT = VecVT.getVectorElementType();
13695   EVT ResVT = N->getValueType(0);
13696 
13697   unsigned VecSize = VecVT.getSizeInBits();
13698   unsigned VecEltSize = VecEltVT.getSizeInBits();
13699 
13700   if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13701       allUsesHaveSourceMods(N)) {
13702     SDLoc SL(N);
13703     SDValue Idx = N->getOperand(1);
13704     SDValue Elt =
13705         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13706     return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13707   }
13708 
13709   // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13710   //    =>
13711   // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13712   // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13713   // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13714   if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13715     SDLoc SL(N);
13716     SDValue Idx = N->getOperand(1);
13717     unsigned Opc = Vec.getOpcode();
13718 
13719     switch (Opc) {
13720     default:
13721       break;
13722       // TODO: Support other binary operations.
13723     case ISD::FADD:
13724     case ISD::FSUB:
13725     case ISD::FMUL:
13726     case ISD::ADD:
13727     case ISD::UMIN:
13728     case ISD::UMAX:
13729     case ISD::SMIN:
13730     case ISD::SMAX:
13731     case ISD::FMAXNUM:
13732     case ISD::FMINNUM:
13733     case ISD::FMAXNUM_IEEE:
13734     case ISD::FMINNUM_IEEE:
13735     case ISD::FMAXIMUM:
13736     case ISD::FMINIMUM: {
13737       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13738                                  Vec.getOperand(0), Idx);
13739       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13740                                  Vec.getOperand(1), Idx);
13741 
13742       DCI.AddToWorklist(Elt0.getNode());
13743       DCI.AddToWorklist(Elt1.getNode());
13744       return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13745     }
13746     }
13747   }
13748 
13749   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13750   if (shouldExpandVectorDynExt(N)) {
13751     SDLoc SL(N);
13752     SDValue Idx = N->getOperand(1);
13753     SDValue V;
13754     for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13755       SDValue IC = DAG.getVectorIdxConstant(I, SL);
13756       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13757       if (I == 0)
13758         V = Elt;
13759       else
13760         V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13761     }
13762     return V;
13763   }
13764 
13765   if (!DCI.isBeforeLegalize())
13766     return SDValue();
13767 
13768   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13769   // elements. This exposes more load reduction opportunities by replacing
13770   // multiple small extract_vector_elements with a single 32-bit extract.
13771   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13772   if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13773       VecSize > 32 && VecSize % 32 == 0 && Idx) {
13774     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13775 
13776     unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13777     unsigned EltIdx = BitIndex / 32;
13778     unsigned LeftoverBitIdx = BitIndex % 32;
13779     SDLoc SL(N);
13780 
13781     SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13782     DCI.AddToWorklist(Cast.getNode());
13783 
13784     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13785                               DAG.getConstant(EltIdx, SL, MVT::i32));
13786     DCI.AddToWorklist(Elt.getNode());
13787     SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13788                               DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13789     DCI.AddToWorklist(Srl.getNode());
13790 
13791     EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13792     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13793     DCI.AddToWorklist(Trunc.getNode());
13794 
13795     if (VecEltVT == ResVT) {
13796       return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13797     }
13798 
13799     assert(ResVT.isScalarInteger());
13800     return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13801   }
13802 
13803   return SDValue();
13804 }
13805 
13806 SDValue
13807 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13808                                                 DAGCombinerInfo &DCI) const {
13809   SDValue Vec = N->getOperand(0);
13810   SDValue Idx = N->getOperand(2);
13811   EVT VecVT = Vec.getValueType();
13812   EVT EltVT = VecVT.getVectorElementType();
13813 
13814   // INSERT_VECTOR_ELT (<n x e>, var-idx)
13815   // => BUILD_VECTOR n x select (e, const-idx)
13816   if (!shouldExpandVectorDynExt(N))
13817     return SDValue();
13818 
13819   SelectionDAG &DAG = DCI.DAG;
13820   SDLoc SL(N);
13821   SDValue Ins = N->getOperand(1);
13822   EVT IdxVT = Idx.getValueType();
13823 
13824   SmallVector<SDValue, 16> Ops;
13825   for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13826     SDValue IC = DAG.getConstant(I, SL, IdxVT);
13827     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13828     SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13829     Ops.push_back(V);
13830   }
13831 
13832   return DAG.getBuildVector(VecVT, SL, Ops);
13833 }
13834 
13835 /// Return the source of an fp_extend from f16 to f32, or a converted FP
13836 /// constant.
13837 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13838   if (Src.getOpcode() == ISD::FP_EXTEND &&
13839       Src.getOperand(0).getValueType() == MVT::f16) {
13840     return Src.getOperand(0);
13841   }
13842 
13843   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13844     APFloat Val = CFP->getValueAPF();
13845     bool LosesInfo = true;
13846     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13847     if (!LosesInfo)
13848       return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13849   }
13850 
13851   return SDValue();
13852 }
13853 
13854 SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13855                                                 DAGCombinerInfo &DCI) const {
13856   assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13857          "combine only useful on gfx8");
13858 
13859   SDValue TruncSrc = N->getOperand(0);
13860   EVT VT = N->getValueType(0);
13861   if (VT != MVT::f16)
13862     return SDValue();
13863 
13864   if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13865       TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13866     return SDValue();
13867 
13868   SelectionDAG &DAG = DCI.DAG;
13869   SDLoc SL(N);
13870 
13871   // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13872   // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13873   // casting back.
13874 
13875   // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13876   // fmin(fmax(a, b), fmax(fmin(a, b), c))
13877   SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13878   if (!A)
13879     return SDValue();
13880 
13881   SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13882   if (!B)
13883     return SDValue();
13884 
13885   SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13886   if (!C)
13887     return SDValue();
13888 
13889   // This changes signaling nan behavior. If an input is a signaling nan, it
13890   // would have been quieted by the fpext originally. We don't care because
13891   // these are unconstrained ops. If we needed to insert quieting canonicalizes
13892   // we would be worse off than just doing the promotion.
13893   SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13894   SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13895   SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13896   return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13897 }
13898 
13899 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13900                                           const SDNode *N0,
13901                                           const SDNode *N1) const {
13902   EVT VT = N0->getValueType(0);
13903 
13904   // Only do this if we are not trying to support denormals. v_mad_f32 does not
13905   // support denormals ever.
13906   if (((VT == MVT::f32 &&
13907         denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13908        (VT == MVT::f16 && Subtarget->hasMadF16() &&
13909         denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13910       isOperationLegal(ISD::FMAD, VT))
13911     return ISD::FMAD;
13912 
13913   const TargetOptions &Options = DAG.getTarget().Options;
13914   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13915        (N0->getFlags().hasAllowContract() &&
13916         N1->getFlags().hasAllowContract())) &&
13917       isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13918     return ISD::FMA;
13919   }
13920 
13921   return 0;
13922 }
13923 
13924 // For a reassociatable opcode perform:
13925 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13926 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13927                                                SelectionDAG &DAG) const {
13928   EVT VT = N->getValueType(0);
13929   if (VT != MVT::i32 && VT != MVT::i64)
13930     return SDValue();
13931 
13932   if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13933     return SDValue();
13934 
13935   unsigned Opc = N->getOpcode();
13936   SDValue Op0 = N->getOperand(0);
13937   SDValue Op1 = N->getOperand(1);
13938 
13939   if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13940     return SDValue();
13941 
13942   if (Op0->isDivergent())
13943     std::swap(Op0, Op1);
13944 
13945   if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13946     return SDValue();
13947 
13948   SDValue Op2 = Op1.getOperand(1);
13949   Op1 = Op1.getOperand(0);
13950   if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13951     return SDValue();
13952 
13953   if (Op1->isDivergent())
13954     std::swap(Op1, Op2);
13955 
13956   SDLoc SL(N);
13957   SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13958   return DAG.getNode(Opc, SL, VT, Add1, Op2);
13959 }
13960 
13961 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
13962                            SDValue N0, SDValue N1, SDValue N2, bool Signed) {
13963   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13964   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13965   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13966   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13967 }
13968 
13969 // Fold
13970 //     y = lshr i64 x, 32
13971 //     res = add (mul i64 y, Const), x   where "Const" is a 64-bit constant
13972 //     with Const.hi == -1
13973 // To
13974 //     res = mad_u64_u32 y.lo ,Const.lo, x.lo
13975 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
13976                                  SDValue MulLHS, SDValue MulRHS,
13977                                  SDValue AddRHS) {
13978   if (MulRHS.getOpcode() == ISD::SRL)
13979     std::swap(MulLHS, MulRHS);
13980 
13981   if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
13982     return SDValue();
13983 
13984   ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
13985   if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
13986       MulLHS.getOperand(0) != AddRHS)
13987     return SDValue();
13988 
13989   ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
13990   if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
13991     return SDValue();
13992 
13993   SDValue ConstMul =
13994       DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
13995   return getMad64_32(DAG, SL, MVT::i64,
13996                      DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
13997                      DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
13998 }
13999 
14000 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14001 // multiplies, if any.
14002 //
14003 // Full 64-bit multiplies that feed into an addition are lowered here instead
14004 // of using the generic expansion. The generic expansion ends up with
14005 // a tree of ADD nodes that prevents us from using the "add" part of the
14006 // MAD instruction. The expansion produced here results in a chain of ADDs
14007 // instead of a tree.
14008 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14009                                             DAGCombinerInfo &DCI) const {
14010   assert(N->getOpcode() == ISD::ADD);
14011 
14012   SelectionDAG &DAG = DCI.DAG;
14013   EVT VT = N->getValueType(0);
14014   SDLoc SL(N);
14015   SDValue LHS = N->getOperand(0);
14016   SDValue RHS = N->getOperand(1);
14017 
14018   if (VT.isVector())
14019     return SDValue();
14020 
14021   // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14022   // result in scalar registers for uniform values.
14023   if (!N->isDivergent() && Subtarget->hasSMulHi())
14024     return SDValue();
14025 
14026   unsigned NumBits = VT.getScalarSizeInBits();
14027   if (NumBits <= 32 || NumBits > 64)
14028     return SDValue();
14029 
14030   if (LHS.getOpcode() != ISD::MUL) {
14031     assert(RHS.getOpcode() == ISD::MUL);
14032     std::swap(LHS, RHS);
14033   }
14034 
14035   // Avoid the fold if it would unduly increase the number of multiplies due to
14036   // multiple uses, except on hardware with full-rate multiply-add (which is
14037   // part of full-rate 64-bit ops).
14038   if (!Subtarget->hasFullRate64Ops()) {
14039     unsigned NumUsers = 0;
14040     for (SDNode *User : LHS->users()) {
14041       // There is a use that does not feed into addition, so the multiply can't
14042       // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14043       if (User->getOpcode() != ISD::ADD)
14044         return SDValue();
14045 
14046       // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14047       // MUL + 3xADD + 3xADDC over 3xMAD.
14048       ++NumUsers;
14049       if (NumUsers >= 3)
14050         return SDValue();
14051     }
14052   }
14053 
14054   SDValue MulLHS = LHS.getOperand(0);
14055   SDValue MulRHS = LHS.getOperand(1);
14056   SDValue AddRHS = RHS;
14057 
14058   if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14059     return FoldedMAD;
14060 
14061   // Always check whether operands are small unsigned values, since that
14062   // knowledge is useful in more cases. Check for small signed values only if
14063   // doing so can unlock a shorter code sequence.
14064   bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14065   bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14066 
14067   bool MulSignedLo = false;
14068   if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14069     MulSignedLo =
14070         numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14071   }
14072 
14073   // The operands and final result all have the same number of bits. If
14074   // operands need to be extended, they can be extended with garbage. The
14075   // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14076   // truncated away in the end.
14077   if (VT != MVT::i64) {
14078     MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14079     MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14080     AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14081   }
14082 
14083   // The basic code generated is conceptually straightforward. Pseudo code:
14084   //
14085   //   accum = mad_64_32 lhs.lo, rhs.lo, accum
14086   //   accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14087   //   accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14088   //
14089   // The second and third lines are optional, depending on whether the factors
14090   // are {sign,zero}-extended or not.
14091   //
14092   // The actual DAG is noisier than the pseudo code, but only due to
14093   // instructions that disassemble values into low and high parts, and
14094   // assemble the final result.
14095   SDValue One = DAG.getConstant(1, SL, MVT::i32);
14096 
14097   auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14098   auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14099   SDValue Accum =
14100       getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14101 
14102   if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14103     auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14104 
14105     if (!MulLHSUnsigned32) {
14106       auto MulLHSHi =
14107           DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14108       SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14109       AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14110     }
14111 
14112     if (!MulRHSUnsigned32) {
14113       auto MulRHSHi =
14114           DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14115       SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14116       AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14117     }
14118 
14119     Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14120     Accum = DAG.getBitcast(MVT::i64, Accum);
14121   }
14122 
14123   if (VT != MVT::i64)
14124     Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14125   return Accum;
14126 }
14127 
14128 SDValue
14129 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14130                                                   DAGCombinerInfo &DCI) const {
14131   SDValue RHS = N->getOperand(1);
14132   auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14133   if (!CRHS)
14134     return SDValue();
14135 
14136   // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14137   // common.
14138   uint64_t Val = CRHS->getZExtValue();
14139   if (countr_zero(Val) >= 32) {
14140     SelectionDAG &DAG = DCI.DAG;
14141     SDLoc SL(N);
14142     SDValue LHS = N->getOperand(0);
14143 
14144     // Avoid carry machinery if we know the low half of the add does not
14145     // contribute to the final result.
14146     //
14147     // add i64:x, K if computeTrailingZeros(K) >= 32
14148     //  => build_pair (add x.hi, K.hi), x.lo
14149 
14150     // Breaking the 64-bit add here with this strange constant is unlikely
14151     // to interfere with addressing mode patterns.
14152 
14153     SDValue Hi = getHiHalf64(LHS, DAG);
14154     SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14155     SDValue AddHi =
14156         DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14157 
14158     SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14159     return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14160   }
14161 
14162   return SDValue();
14163 }
14164 
14165 // Collect the ultimate src of each of the mul node's operands, and confirm
14166 // each operand is 8 bytes.
14167 static std::optional<ByteProvider<SDValue>>
14168 handleMulOperand(const SDValue &MulOperand) {
14169   auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14170   if (!Byte0 || Byte0->isConstantZero()) {
14171     return std::nullopt;
14172   }
14173   auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14174   if (Byte1 && !Byte1->isConstantZero()) {
14175     return std::nullopt;
14176   }
14177   return Byte0;
14178 }
14179 
14180 static unsigned addPermMasks(unsigned First, unsigned Second) {
14181   unsigned FirstCs = First & 0x0c0c0c0c;
14182   unsigned SecondCs = Second & 0x0c0c0c0c;
14183   unsigned FirstNoCs = First & ~0x0c0c0c0c;
14184   unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14185 
14186   assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14187   assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14188   assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14189   assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14190 
14191   return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14192 }
14193 
14194 struct DotSrc {
14195   SDValue SrcOp;
14196   int64_t PermMask;
14197   int64_t DWordOffset;
14198 };
14199 
14200 static void placeSources(ByteProvider<SDValue> &Src0,
14201                          ByteProvider<SDValue> &Src1,
14202                          SmallVectorImpl<DotSrc> &Src0s,
14203                          SmallVectorImpl<DotSrc> &Src1s, int Step) {
14204 
14205   assert(Src0.Src.has_value() && Src1.Src.has_value());
14206   // Src0s and Src1s are empty, just place arbitrarily.
14207   if (Step == 0) {
14208     Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14209                      Src0.SrcOffset / 4});
14210     Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14211                      Src1.SrcOffset / 4});
14212     return;
14213   }
14214 
14215   for (int BPI = 0; BPI < 2; BPI++) {
14216     std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14217     if (BPI == 1) {
14218       BPP = {Src1, Src0};
14219     }
14220     unsigned ZeroMask = 0x0c0c0c0c;
14221     unsigned FMask = 0xFF << (8 * (3 - Step));
14222 
14223     unsigned FirstMask =
14224         (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14225     unsigned SecondMask =
14226         (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14227     // Attempt to find Src vector which contains our SDValue, if so, add our
14228     // perm mask to the existing one. If we are unable to find a match for the
14229     // first SDValue, attempt to find match for the second.
14230     int FirstGroup = -1;
14231     for (int I = 0; I < 2; I++) {
14232       SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14233       auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14234         return IterElt.SrcOp == *BPP.first.Src &&
14235                (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14236       };
14237 
14238       auto *Match = llvm::find_if(Srcs, MatchesFirst);
14239       if (Match != Srcs.end()) {
14240         Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14241         FirstGroup = I;
14242         break;
14243       }
14244     }
14245     if (FirstGroup != -1) {
14246       SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14247       auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14248         return IterElt.SrcOp == *BPP.second.Src &&
14249                (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14250       };
14251       auto *Match = llvm::find_if(Srcs, MatchesSecond);
14252       if (Match != Srcs.end()) {
14253         Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14254       } else
14255         Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14256       return;
14257     }
14258   }
14259 
14260   // If we have made it here, then we could not find a match in Src0s or Src1s
14261   // for either Src0 or Src1, so just place them arbitrarily.
14262 
14263   unsigned ZeroMask = 0x0c0c0c0c;
14264   unsigned FMask = 0xFF << (8 * (3 - Step));
14265 
14266   Src0s.push_back(
14267       {*Src0.Src,
14268        ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14269        Src0.SrcOffset / 4});
14270   Src1s.push_back(
14271       {*Src1.Src,
14272        ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14273        Src1.SrcOffset / 4});
14274 }
14275 
14276 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14277                               SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14278                               bool IsAny) {
14279 
14280   // If we just have one source, just permute it accordingly.
14281   if (Srcs.size() == 1) {
14282     auto *Elt = Srcs.begin();
14283     auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14284 
14285     // v_perm will produce the original value
14286     if (Elt->PermMask == 0x3020100)
14287       return EltOp;
14288 
14289     return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14290                        DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14291   }
14292 
14293   auto *FirstElt = Srcs.begin();
14294   auto *SecondElt = std::next(FirstElt);
14295 
14296   SmallVector<SDValue, 2> Perms;
14297 
14298   // If we have multiple sources in the chain, combine them via perms (using
14299   // calculated perm mask) and Ors.
14300   while (true) {
14301     auto FirstMask = FirstElt->PermMask;
14302     auto SecondMask = SecondElt->PermMask;
14303 
14304     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14305     unsigned FirstPlusFour = FirstMask | 0x04040404;
14306     // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
14307     // original 0x0C.
14308     FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14309 
14310     auto PermMask = addPermMasks(FirstMask, SecondMask);
14311     auto FirstVal =
14312         getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14313     auto SecondVal =
14314         getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14315 
14316     Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14317                                 SecondVal,
14318                                 DAG.getConstant(PermMask, SL, MVT::i32)));
14319 
14320     FirstElt = std::next(SecondElt);
14321     if (FirstElt == Srcs.end())
14322       break;
14323 
14324     SecondElt = std::next(FirstElt);
14325     // If we only have a FirstElt, then just combine that into the cumulative
14326     // source node.
14327     if (SecondElt == Srcs.end()) {
14328       auto EltOp =
14329           getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14330 
14331       Perms.push_back(
14332           DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14333                       DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14334       break;
14335     }
14336   }
14337 
14338   assert(Perms.size() == 1 || Perms.size() == 2);
14339   return Perms.size() == 2
14340              ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14341              : Perms[0];
14342 }
14343 
14344 static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14345   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14346     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14347     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14348     EntryMask += ZeroMask;
14349   }
14350 }
14351 
14352 static bool isMul(const SDValue Op) {
14353   auto Opcode = Op.getOpcode();
14354 
14355   return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14356           Opcode == AMDGPUISD::MUL_I24);
14357 }
14358 
14359 static std::optional<bool>
14360 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14361                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14362                        const SDValue &S1Op, const SelectionDAG &DAG) {
14363   // If we both ops are i8s (pre legalize-dag), then the signedness semantics
14364   // of the dot4 is irrelevant.
14365   if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14366     return false;
14367 
14368   auto Known0 = DAG.computeKnownBits(S0Op, 0);
14369   bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14370   bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14371   auto Known1 = DAG.computeKnownBits(S1Op, 0);
14372   bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14373   bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14374 
14375   assert(!(S0IsUnsigned && S0IsSigned));
14376   assert(!(S1IsUnsigned && S1IsSigned));
14377 
14378   // There are 9 possible permutations of
14379   // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14380 
14381   // In two permutations, the sign bits are known to be the same for both Ops,
14382   // so simply return Signed / Unsigned corresponding to the MSB
14383 
14384   if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14385     return S0IsSigned;
14386 
14387   // In another two permutations, the sign bits are known to be opposite. In
14388   // this case return std::nullopt to indicate a bad match.
14389 
14390   if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14391     return std::nullopt;
14392 
14393   // In the remaining five permutations, we don't know the value of the sign
14394   // bit for at least one Op. Since we have a valid ByteProvider, we know that
14395   // the upper bits must be extension bits. Thus, the only ways for the sign
14396   // bit to be unknown is if it was sign extended from unknown value, or if it
14397   // was any extended. In either case, it is correct to use the signed
14398   // version of the signedness semantics of dot4
14399 
14400   // In two of such permutations, we known the sign bit is set for
14401   // one op, and the other is unknown. It is okay to used signed version of
14402   // dot4.
14403   if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14404       ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14405     return true;
14406 
14407   // In one such permutation, we don't know either of the sign bits. It is okay
14408   // to used the signed version of dot4.
14409   if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14410     return true;
14411 
14412   // In two of such permutations, we known the sign bit is unset for
14413   // one op, and the other is unknown. Return std::nullopt to indicate a
14414   // bad match.
14415   if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14416       ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14417     return std::nullopt;
14418 
14419   llvm_unreachable("Fully covered condition");
14420 }
14421 
14422 SDValue SITargetLowering::performAddCombine(SDNode *N,
14423                                             DAGCombinerInfo &DCI) const {
14424   SelectionDAG &DAG = DCI.DAG;
14425   EVT VT = N->getValueType(0);
14426   SDLoc SL(N);
14427   SDValue LHS = N->getOperand(0);
14428   SDValue RHS = N->getOperand(1);
14429 
14430   if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14431     if (Subtarget->hasMad64_32()) {
14432       if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14433         return Folded;
14434     }
14435   }
14436 
14437   if (SDValue V = reassociateScalarOps(N, DAG)) {
14438     return V;
14439   }
14440 
14441   if (VT == MVT::i64) {
14442     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14443       return Folded;
14444   }
14445 
14446   if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14447       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14448     SDValue TempNode(N, 0);
14449     std::optional<bool> IsSigned;
14450     SmallVector<DotSrc, 4> Src0s;
14451     SmallVector<DotSrc, 4> Src1s;
14452     SmallVector<SDValue, 4> Src2s;
14453 
14454     // Match the v_dot4 tree, while collecting src nodes.
14455     int ChainLength = 0;
14456     for (int I = 0; I < 4; I++) {
14457       auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14458       if (MulIdx == -1)
14459         break;
14460       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14461       if (!Src0)
14462         break;
14463       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14464       if (!Src1)
14465         break;
14466 
14467       auto IterIsSigned = checkDot4MulSignedness(
14468           TempNode->getOperand(MulIdx), *Src0, *Src1,
14469           TempNode->getOperand(MulIdx)->getOperand(0),
14470           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14471       if (!IterIsSigned)
14472         break;
14473       if (!IsSigned)
14474         IsSigned = *IterIsSigned;
14475       if (*IterIsSigned != *IsSigned)
14476         break;
14477       placeSources(*Src0, *Src1, Src0s, Src1s, I);
14478       auto AddIdx = 1 - MulIdx;
14479       // Allow the special case where add (add (mul24, 0), mul24) became ->
14480       // add (mul24, mul24).
14481       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14482         Src2s.push_back(TempNode->getOperand(AddIdx));
14483         auto Src0 =
14484             handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14485         if (!Src0)
14486           break;
14487         auto Src1 =
14488             handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14489         if (!Src1)
14490           break;
14491         auto IterIsSigned = checkDot4MulSignedness(
14492             TempNode->getOperand(AddIdx), *Src0, *Src1,
14493             TempNode->getOperand(AddIdx)->getOperand(0),
14494             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14495         if (!IterIsSigned)
14496           break;
14497         assert(IsSigned);
14498         if (*IterIsSigned != *IsSigned)
14499           break;
14500         placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14501         Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14502         ChainLength = I + 2;
14503         break;
14504       }
14505 
14506       TempNode = TempNode->getOperand(AddIdx);
14507       Src2s.push_back(TempNode);
14508       ChainLength = I + 1;
14509       if (TempNode->getNumOperands() < 2)
14510         break;
14511       LHS = TempNode->getOperand(0);
14512       RHS = TempNode->getOperand(1);
14513     }
14514 
14515     if (ChainLength < 2)
14516       return SDValue();
14517 
14518     // Masks were constructed with assumption that we would find a chain of
14519     // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14520     // 0x0c) so they do not affect dot calculation.
14521     if (ChainLength < 4) {
14522       fixMasks(Src0s, ChainLength);
14523       fixMasks(Src1s, ChainLength);
14524     }
14525 
14526     SDValue Src0, Src1;
14527 
14528     // If we are just using a single source for both, and have permuted the
14529     // bytes consistently, we can just use the sources without permuting
14530     // (commutation).
14531     bool UseOriginalSrc = false;
14532     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14533         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14534         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14535         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14536       SmallVector<unsigned, 4> SrcBytes;
14537       auto Src0Mask = Src0s.begin()->PermMask;
14538       SrcBytes.push_back(Src0Mask & 0xFF000000);
14539       bool UniqueEntries = true;
14540       for (auto I = 1; I < 4; I++) {
14541         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14542 
14543         if (is_contained(SrcBytes, NextByte)) {
14544           UniqueEntries = false;
14545           break;
14546         }
14547         SrcBytes.push_back(NextByte);
14548       }
14549 
14550       if (UniqueEntries) {
14551         UseOriginalSrc = true;
14552 
14553         auto *FirstElt = Src0s.begin();
14554         auto FirstEltOp =
14555             getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14556 
14557         auto *SecondElt = Src1s.begin();
14558         auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14559                                               SecondElt->DWordOffset);
14560 
14561         Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14562                                              MVT::getIntegerVT(32));
14563         Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14564                                              MVT::getIntegerVT(32));
14565       }
14566     }
14567 
14568     if (!UseOriginalSrc) {
14569       Src0 = resolveSources(DAG, SL, Src0s, false, true);
14570       Src1 = resolveSources(DAG, SL, Src1s, false, true);
14571     }
14572 
14573     assert(IsSigned);
14574     SDValue Src2 =
14575         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14576 
14577     SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14578                                                   : Intrinsic::amdgcn_udot4,
14579                                         SL, MVT::i64);
14580 
14581     assert(!VT.isVector());
14582     auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14583                            Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14584 
14585     return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14586   }
14587 
14588   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14589     return SDValue();
14590 
14591   // add x, zext (setcc) => uaddo_carry x, 0, setcc
14592   // add x, sext (setcc) => usubo_carry x, 0, setcc
14593   unsigned Opc = LHS.getOpcode();
14594   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14595       Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14596     std::swap(RHS, LHS);
14597 
14598   Opc = RHS.getOpcode();
14599   switch (Opc) {
14600   default:
14601     break;
14602   case ISD::ZERO_EXTEND:
14603   case ISD::SIGN_EXTEND:
14604   case ISD::ANY_EXTEND: {
14605     auto Cond = RHS.getOperand(0);
14606     // If this won't be a real VOPC output, we would still need to insert an
14607     // extra instruction anyway.
14608     if (!isBoolSGPR(Cond))
14609       break;
14610     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14611     SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14612     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14613     return DAG.getNode(Opc, SL, VTList, Args);
14614   }
14615   case ISD::UADDO_CARRY: {
14616     // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14617     if (!isNullConstant(RHS.getOperand(1)))
14618       break;
14619     SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
14620     return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14621   }
14622   }
14623   return SDValue();
14624 }
14625 
14626 SDValue SITargetLowering::performSubCombine(SDNode *N,
14627                                             DAGCombinerInfo &DCI) const {
14628   SelectionDAG &DAG = DCI.DAG;
14629   EVT VT = N->getValueType(0);
14630 
14631   if (VT == MVT::i64) {
14632     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14633       return Folded;
14634   }
14635 
14636   if (VT != MVT::i32)
14637     return SDValue();
14638 
14639   SDLoc SL(N);
14640   SDValue LHS = N->getOperand(0);
14641   SDValue RHS = N->getOperand(1);
14642 
14643   // sub x, zext (setcc) => usubo_carry x, 0, setcc
14644   // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14645   unsigned Opc = RHS.getOpcode();
14646   switch (Opc) {
14647   default:
14648     break;
14649   case ISD::ZERO_EXTEND:
14650   case ISD::SIGN_EXTEND:
14651   case ISD::ANY_EXTEND: {
14652     auto Cond = RHS.getOperand(0);
14653     // If this won't be a real VOPC output, we would still need to insert an
14654     // extra instruction anyway.
14655     if (!isBoolSGPR(Cond))
14656       break;
14657     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14658     SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
14659     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14660     return DAG.getNode(Opc, SL, VTList, Args);
14661   }
14662   }
14663 
14664   if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14665     // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14666     if (!isNullConstant(LHS.getOperand(1)))
14667       return SDValue();
14668     SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
14669     return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14670   }
14671   return SDValue();
14672 }
14673 
14674 SDValue
14675 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14676                                                  DAGCombinerInfo &DCI) const {
14677 
14678   if (N->getValueType(0) != MVT::i32)
14679     return SDValue();
14680 
14681   if (!isNullConstant(N->getOperand(1)))
14682     return SDValue();
14683 
14684   SelectionDAG &DAG = DCI.DAG;
14685   SDValue LHS = N->getOperand(0);
14686 
14687   // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14688   // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14689   unsigned LHSOpc = LHS.getOpcode();
14690   unsigned Opc = N->getOpcode();
14691   if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14692       (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14693     SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
14694     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14695   }
14696   return SDValue();
14697 }
14698 
14699 SDValue SITargetLowering::performFAddCombine(SDNode *N,
14700                                              DAGCombinerInfo &DCI) const {
14701   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14702     return SDValue();
14703 
14704   SelectionDAG &DAG = DCI.DAG;
14705   EVT VT = N->getValueType(0);
14706 
14707   SDLoc SL(N);
14708   SDValue LHS = N->getOperand(0);
14709   SDValue RHS = N->getOperand(1);
14710 
14711   // These should really be instruction patterns, but writing patterns with
14712   // source modifiers is a pain.
14713 
14714   // fadd (fadd (a, a), b) -> mad 2.0, a, b
14715   if (LHS.getOpcode() == ISD::FADD) {
14716     SDValue A = LHS.getOperand(0);
14717     if (A == LHS.getOperand(1)) {
14718       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14719       if (FusedOp != 0) {
14720         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14721         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14722       }
14723     }
14724   }
14725 
14726   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14727   if (RHS.getOpcode() == ISD::FADD) {
14728     SDValue A = RHS.getOperand(0);
14729     if (A == RHS.getOperand(1)) {
14730       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14731       if (FusedOp != 0) {
14732         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14733         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14734       }
14735     }
14736   }
14737 
14738   return SDValue();
14739 }
14740 
14741 SDValue SITargetLowering::performFSubCombine(SDNode *N,
14742                                              DAGCombinerInfo &DCI) const {
14743   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14744     return SDValue();
14745 
14746   SelectionDAG &DAG = DCI.DAG;
14747   SDLoc SL(N);
14748   EVT VT = N->getValueType(0);
14749   assert(!VT.isVector());
14750 
14751   // Try to get the fneg to fold into the source modifier. This undoes generic
14752   // DAG combines and folds them into the mad.
14753   //
14754   // Only do this if we are not trying to support denormals. v_mad_f32 does
14755   // not support denormals ever.
14756   SDValue LHS = N->getOperand(0);
14757   SDValue RHS = N->getOperand(1);
14758   if (LHS.getOpcode() == ISD::FADD) {
14759     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14760     SDValue A = LHS.getOperand(0);
14761     if (A == LHS.getOperand(1)) {
14762       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14763       if (FusedOp != 0) {
14764         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14765         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14766 
14767         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14768       }
14769     }
14770   }
14771 
14772   if (RHS.getOpcode() == ISD::FADD) {
14773     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14774 
14775     SDValue A = RHS.getOperand(0);
14776     if (A == RHS.getOperand(1)) {
14777       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14778       if (FusedOp != 0) {
14779         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14780         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14781       }
14782     }
14783   }
14784 
14785   return SDValue();
14786 }
14787 
14788 SDValue SITargetLowering::performFDivCombine(SDNode *N,
14789                                              DAGCombinerInfo &DCI) const {
14790   SelectionDAG &DAG = DCI.DAG;
14791   SDLoc SL(N);
14792   EVT VT = N->getValueType(0);
14793   if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14794     return SDValue();
14795 
14796   SDValue LHS = N->getOperand(0);
14797   SDValue RHS = N->getOperand(1);
14798 
14799   SDNodeFlags Flags = N->getFlags();
14800   SDNodeFlags RHSFlags = RHS->getFlags();
14801   if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14802       !RHS->hasOneUse())
14803     return SDValue();
14804 
14805   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14806     bool IsNegative = false;
14807     if (CLHS->isExactlyValue(1.0) ||
14808         (IsNegative = CLHS->isExactlyValue(-1.0))) {
14809       // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14810       // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14811       if (RHS.getOpcode() == ISD::FSQRT) {
14812         // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14813         SDValue Rsq =
14814             DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14815         return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14816       }
14817     }
14818   }
14819 
14820   return SDValue();
14821 }
14822 
14823 SDValue SITargetLowering::performFMulCombine(SDNode *N,
14824                                              DAGCombinerInfo &DCI) const {
14825   SelectionDAG &DAG = DCI.DAG;
14826   EVT VT = N->getValueType(0);
14827   EVT ScalarVT = VT.getScalarType();
14828   EVT IntVT = VT.changeElementType(MVT::i32);
14829 
14830   SDValue LHS = N->getOperand(0);
14831   SDValue RHS = N->getOperand(1);
14832 
14833   // It is cheaper to realize i32 inline constants as compared against
14834   // materializing f16 or f64 (or even non-inline f32) values,
14835   // possible via ldexp usage, as shown below :
14836   //
14837   // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
14838   // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
14839   // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
14840   if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
14841       (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
14842     const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
14843     if (!TrueNode)
14844       return SDValue();
14845     const ConstantFPSDNode *FalseNode =
14846         isConstOrConstSplatFP(RHS.getOperand(2));
14847     if (!FalseNode)
14848       return SDValue();
14849 
14850     if (TrueNode->isNegative() != FalseNode->isNegative())
14851       return SDValue();
14852 
14853     // For f32, only non-inline constants should be transformed.
14854     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14855     if (ScalarVT == MVT::f32 &&
14856         TII->isInlineConstant(TrueNode->getValueAPF()) &&
14857         TII->isInlineConstant(FalseNode->getValueAPF()))
14858       return SDValue();
14859 
14860     int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
14861     if (TrueNodeExpVal == INT_MIN)
14862       return SDValue();
14863     int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
14864     if (FalseNodeExpVal == INT_MIN)
14865       return SDValue();
14866 
14867     SDLoc SL(N);
14868     SDValue SelectNode =
14869         DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
14870                     DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
14871                     DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
14872 
14873     LHS = TrueNode->isNegative()
14874               ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
14875               : LHS;
14876 
14877     return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
14878   }
14879 
14880   return SDValue();
14881 }
14882 
14883 SDValue SITargetLowering::performFMACombine(SDNode *N,
14884                                             DAGCombinerInfo &DCI) const {
14885   SelectionDAG &DAG = DCI.DAG;
14886   EVT VT = N->getValueType(0);
14887   SDLoc SL(N);
14888 
14889   if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
14890     return SDValue();
14891 
14892   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14893   //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14894   SDValue Op1 = N->getOperand(0);
14895   SDValue Op2 = N->getOperand(1);
14896   SDValue FMA = N->getOperand(2);
14897 
14898   if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
14899       Op2.getOpcode() != ISD::FP_EXTEND)
14900     return SDValue();
14901 
14902   // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14903   // regardless of the denorm mode setting. Therefore,
14904   // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14905   const TargetOptions &Options = DAG.getTarget().Options;
14906   if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14907       (N->getFlags().hasAllowContract() &&
14908        FMA->getFlags().hasAllowContract())) {
14909     Op1 = Op1.getOperand(0);
14910     Op2 = Op2.getOperand(0);
14911     if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14912         Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14913       return SDValue();
14914 
14915     SDValue Vec1 = Op1.getOperand(0);
14916     SDValue Idx1 = Op1.getOperand(1);
14917     SDValue Vec2 = Op2.getOperand(0);
14918 
14919     SDValue FMAOp1 = FMA.getOperand(0);
14920     SDValue FMAOp2 = FMA.getOperand(1);
14921     SDValue FMAAcc = FMA.getOperand(2);
14922 
14923     if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14924         FMAOp2.getOpcode() != ISD::FP_EXTEND)
14925       return SDValue();
14926 
14927     FMAOp1 = FMAOp1.getOperand(0);
14928     FMAOp2 = FMAOp2.getOperand(0);
14929     if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14930         FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14931       return SDValue();
14932 
14933     SDValue Vec3 = FMAOp1.getOperand(0);
14934     SDValue Vec4 = FMAOp2.getOperand(0);
14935     SDValue Idx2 = FMAOp1.getOperand(1);
14936 
14937     if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14938         // Idx1 and Idx2 cannot be the same.
14939         Idx1 == Idx2)
14940       return SDValue();
14941 
14942     if (Vec1 == Vec2 || Vec3 == Vec4)
14943       return SDValue();
14944 
14945     if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14946       return SDValue();
14947 
14948     if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
14949       return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14950                          DAG.getTargetConstant(0, SL, MVT::i1));
14951     }
14952   }
14953   return SDValue();
14954 }
14955 
14956 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14957                                               DAGCombinerInfo &DCI) const {
14958   SelectionDAG &DAG = DCI.DAG;
14959   SDLoc SL(N);
14960 
14961   SDValue LHS = N->getOperand(0);
14962   SDValue RHS = N->getOperand(1);
14963   EVT VT = LHS.getValueType();
14964   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14965 
14966   auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14967   if (!CRHS) {
14968     CRHS = dyn_cast<ConstantSDNode>(LHS);
14969     if (CRHS) {
14970       std::swap(LHS, RHS);
14971       CC = getSetCCSwappedOperands(CC);
14972     }
14973   }
14974 
14975   if (CRHS) {
14976     if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14977         isBoolSGPR(LHS.getOperand(0))) {
14978       // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14979       // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14980       // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
14981       // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
14982       if ((CRHS->isAllOnes() &&
14983            (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14984           (CRHS->isZero() &&
14985            (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14986         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14987                            DAG.getAllOnesConstant(SL, MVT::i1));
14988       if ((CRHS->isAllOnes() &&
14989            (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14990           (CRHS->isZero() &&
14991            (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14992         return LHS.getOperand(0);
14993     }
14994 
14995     const APInt &CRHSVal = CRHS->getAPIntValue();
14996     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14997         LHS.getOpcode() == ISD::SELECT &&
14998         isa<ConstantSDNode>(LHS.getOperand(1)) &&
14999         isa<ConstantSDNode>(LHS.getOperand(2)) &&
15000         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15001         isBoolSGPR(LHS.getOperand(0))) {
15002       // Given CT != FT:
15003       // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15004       // setcc (select cc, CT, CF), CF, ne => cc
15005       // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15006       // setcc (select cc, CT, CF), CT, eq => cc
15007       const APInt &CT = LHS.getConstantOperandAPInt(1);
15008       const APInt &CF = LHS.getConstantOperandAPInt(2);
15009 
15010       if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15011           (CT == CRHSVal && CC == ISD::SETNE))
15012         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15013                            DAG.getAllOnesConstant(SL, MVT::i1));
15014       if ((CF == CRHSVal && CC == ISD::SETNE) ||
15015           (CT == CRHSVal && CC == ISD::SETEQ))
15016         return LHS.getOperand(0);
15017     }
15018   }
15019 
15020   if (VT != MVT::f32 && VT != MVT::f64 &&
15021       (!Subtarget->has16BitInsts() || VT != MVT::f16))
15022     return SDValue();
15023 
15024   // Match isinf/isfinite pattern
15025   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15026   // (fcmp one (fabs x), inf) -> (fp_class x,
15027   // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15028   if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15029       LHS.getOpcode() == ISD::FABS) {
15030     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15031     if (!CRHS)
15032       return SDValue();
15033 
15034     const APFloat &APF = CRHS->getValueAPF();
15035     if (APF.isInfinity() && !APF.isNegative()) {
15036       const unsigned IsInfMask =
15037           SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15038       const unsigned IsFiniteMask =
15039           SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15040           SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15041           SIInstrFlags::P_SUBNORMAL;
15042       unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15043       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15044                          DAG.getConstant(Mask, SL, MVT::i32));
15045     }
15046   }
15047 
15048   return SDValue();
15049 }
15050 
15051 SDValue
15052 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15053                                              DAGCombinerInfo &DCI) const {
15054   SelectionDAG &DAG = DCI.DAG;
15055   SDLoc SL(N);
15056   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15057 
15058   SDValue Src = N->getOperand(0);
15059   SDValue Shift = N->getOperand(0);
15060 
15061   // TODO: Extend type shouldn't matter (assuming legal types).
15062   if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15063     Shift = Shift.getOperand(0);
15064 
15065   if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15066     // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
15067     // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15068     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15069     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15070     // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
15071     if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15072       SDValue Shifted = DAG.getZExtOrTrunc(
15073           Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
15074 
15075       unsigned ShiftOffset = 8 * Offset;
15076       if (Shift.getOpcode() == ISD::SHL)
15077         ShiftOffset -= C->getZExtValue();
15078       else
15079         ShiftOffset += C->getZExtValue();
15080 
15081       if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15082         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15083                            MVT::f32, Shifted);
15084       }
15085     }
15086   }
15087 
15088   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15089   APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15090   if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15091     // We simplified Src. If this node is not dead, visit it again so it is
15092     // folded properly.
15093     if (N->getOpcode() != ISD::DELETED_NODE)
15094       DCI.AddToWorklist(N);
15095     return SDValue(N, 0);
15096   }
15097 
15098   // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15099   if (SDValue DemandedSrc =
15100           TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15101     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15102 
15103   return SDValue();
15104 }
15105 
15106 SDValue SITargetLowering::performClampCombine(SDNode *N,
15107                                               DAGCombinerInfo &DCI) const {
15108   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15109   if (!CSrc)
15110     return SDValue();
15111 
15112   const MachineFunction &MF = DCI.DAG.getMachineFunction();
15113   const APFloat &F = CSrc->getValueAPF();
15114   APFloat Zero = APFloat::getZero(F.getSemantics());
15115   if (F < Zero ||
15116       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15117     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15118   }
15119 
15120   APFloat One(F.getSemantics(), "1.0");
15121   if (F > One)
15122     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15123 
15124   return SDValue(CSrc, 0);
15125 }
15126 
15127 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15128                                             DAGCombinerInfo &DCI) const {
15129   switch (N->getOpcode()) {
15130   case ISD::ADD:
15131   case ISD::SUB:
15132   case ISD::SHL:
15133   case ISD::SRL:
15134   case ISD::SRA:
15135   case ISD::AND:
15136   case ISD::OR:
15137   case ISD::XOR:
15138   case ISD::MUL:
15139   case ISD::SETCC:
15140   case ISD::SELECT:
15141   case ISD::SMIN:
15142   case ISD::SMAX:
15143   case ISD::UMIN:
15144   case ISD::UMAX:
15145     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15146       return Res;
15147     break;
15148   default:
15149     break;
15150   }
15151 
15152   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15153     return SDValue();
15154 
15155   switch (N->getOpcode()) {
15156   case ISD::ADD:
15157     return performAddCombine(N, DCI);
15158   case ISD::SUB:
15159     return performSubCombine(N, DCI);
15160   case ISD::UADDO_CARRY:
15161   case ISD::USUBO_CARRY:
15162     return performAddCarrySubCarryCombine(N, DCI);
15163   case ISD::FADD:
15164     return performFAddCombine(N, DCI);
15165   case ISD::FSUB:
15166     return performFSubCombine(N, DCI);
15167   case ISD::FDIV:
15168     return performFDivCombine(N, DCI);
15169   case ISD::FMUL:
15170     return performFMulCombine(N, DCI);
15171   case ISD::SETCC:
15172     return performSetCCCombine(N, DCI);
15173   case ISD::FMAXNUM:
15174   case ISD::FMINNUM:
15175   case ISD::FMAXNUM_IEEE:
15176   case ISD::FMINNUM_IEEE:
15177   case ISD::FMAXIMUM:
15178   case ISD::FMINIMUM:
15179   case ISD::SMAX:
15180   case ISD::SMIN:
15181   case ISD::UMAX:
15182   case ISD::UMIN:
15183   case AMDGPUISD::FMIN_LEGACY:
15184   case AMDGPUISD::FMAX_LEGACY:
15185     return performMinMaxCombine(N, DCI);
15186   case ISD::FMA:
15187     return performFMACombine(N, DCI);
15188   case ISD::AND:
15189     return performAndCombine(N, DCI);
15190   case ISD::OR:
15191     return performOrCombine(N, DCI);
15192   case ISD::FSHR: {
15193     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15194     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15195         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15196       return matchPERM(N, DCI);
15197     }
15198     break;
15199   }
15200   case ISD::XOR:
15201     return performXorCombine(N, DCI);
15202   case ISD::ZERO_EXTEND:
15203     return performZeroExtendCombine(N, DCI);
15204   case ISD::SIGN_EXTEND_INREG:
15205     return performSignExtendInRegCombine(N, DCI);
15206   case AMDGPUISD::FP_CLASS:
15207     return performClassCombine(N, DCI);
15208   case ISD::FCANONICALIZE:
15209     return performFCanonicalizeCombine(N, DCI);
15210   case AMDGPUISD::RCP:
15211     return performRcpCombine(N, DCI);
15212   case ISD::FLDEXP:
15213   case AMDGPUISD::FRACT:
15214   case AMDGPUISD::RSQ:
15215   case AMDGPUISD::RCP_LEGACY:
15216   case AMDGPUISD::RCP_IFLAG:
15217   case AMDGPUISD::RSQ_CLAMP: {
15218     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15219     SDValue Src = N->getOperand(0);
15220     if (Src.isUndef())
15221       return Src;
15222     break;
15223   }
15224   case ISD::SINT_TO_FP:
15225   case ISD::UINT_TO_FP:
15226     return performUCharToFloatCombine(N, DCI);
15227   case ISD::FCOPYSIGN:
15228     return performFCopySignCombine(N, DCI);
15229   case AMDGPUISD::CVT_F32_UBYTE0:
15230   case AMDGPUISD::CVT_F32_UBYTE1:
15231   case AMDGPUISD::CVT_F32_UBYTE2:
15232   case AMDGPUISD::CVT_F32_UBYTE3:
15233     return performCvtF32UByteNCombine(N, DCI);
15234   case AMDGPUISD::FMED3:
15235     return performFMed3Combine(N, DCI);
15236   case AMDGPUISD::CVT_PKRTZ_F16_F32:
15237     return performCvtPkRTZCombine(N, DCI);
15238   case AMDGPUISD::CLAMP:
15239     return performClampCombine(N, DCI);
15240   case ISD::SCALAR_TO_VECTOR: {
15241     SelectionDAG &DAG = DCI.DAG;
15242     EVT VT = N->getValueType(0);
15243 
15244     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15245     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15246       SDLoc SL(N);
15247       SDValue Src = N->getOperand(0);
15248       EVT EltVT = Src.getValueType();
15249       if (EltVT != MVT::i16)
15250         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15251 
15252       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15253       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15254     }
15255 
15256     break;
15257   }
15258   case ISD::EXTRACT_VECTOR_ELT:
15259     return performExtractVectorEltCombine(N, DCI);
15260   case ISD::INSERT_VECTOR_ELT:
15261     return performInsertVectorEltCombine(N, DCI);
15262   case ISD::FP_ROUND:
15263     return performFPRoundCombine(N, DCI);
15264   case ISD::LOAD: {
15265     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15266       return Widened;
15267     [[fallthrough]];
15268   }
15269   default: {
15270     if (!DCI.isBeforeLegalize()) {
15271       if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15272         return performMemSDNodeCombine(MemNode, DCI);
15273     }
15274 
15275     break;
15276   }
15277   }
15278 
15279   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15280 }
15281 
15282 /// Helper function for adjustWritemask
15283 static unsigned SubIdx2Lane(unsigned Idx) {
15284   switch (Idx) {
15285   default:
15286     return ~0u;
15287   case AMDGPU::sub0:
15288     return 0;
15289   case AMDGPU::sub1:
15290     return 1;
15291   case AMDGPU::sub2:
15292     return 2;
15293   case AMDGPU::sub3:
15294     return 3;
15295   case AMDGPU::sub4:
15296     return 4; // Possible with TFE/LWE
15297   }
15298 }
15299 
15300 /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15301 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15302                                           SelectionDAG &DAG) const {
15303   unsigned Opcode = Node->getMachineOpcode();
15304 
15305   // Subtract 1 because the vdata output is not a MachineSDNode operand.
15306   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15307   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15308     return Node; // not implemented for D16
15309 
15310   SDNode *Users[5] = {nullptr};
15311   unsigned Lane = 0;
15312   unsigned DmaskIdx =
15313       AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15314   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15315   unsigned NewDmask = 0;
15316   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15317   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15318   bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15319                   (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
15320                      ? true
15321                      : false;
15322   unsigned TFCLane = 0;
15323   bool HasChain = Node->getNumValues() > 1;
15324 
15325   if (OldDmask == 0) {
15326     // These are folded out, but on the chance it happens don't assert.
15327     return Node;
15328   }
15329 
15330   unsigned OldBitsSet = llvm::popcount(OldDmask);
15331   // Work out which is the TFE/LWE lane if that is enabled.
15332   if (UsesTFC) {
15333     TFCLane = OldBitsSet;
15334   }
15335 
15336   // Try to figure out the used register components
15337   for (SDUse &Use : Node->uses()) {
15338 
15339     // Don't look at users of the chain.
15340     if (Use.getResNo() != 0)
15341       continue;
15342 
15343     SDNode *User = Use.getUser();
15344 
15345     // Abort if we can't understand the usage
15346     if (!User->isMachineOpcode() ||
15347         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15348       return Node;
15349 
15350     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15351     // Note that subregs are packed, i.e. Lane==0 is the first bit set
15352     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15353     // set, etc.
15354     Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15355     if (Lane == ~0u)
15356       return Node;
15357 
15358     // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15359     if (UsesTFC && Lane == TFCLane) {
15360       Users[Lane] = User;
15361     } else {
15362       // Set which texture component corresponds to the lane.
15363       unsigned Comp;
15364       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15365         Comp = llvm::countr_zero(Dmask);
15366         Dmask &= ~(1 << Comp);
15367       }
15368 
15369       // Abort if we have more than one user per component.
15370       if (Users[Lane])
15371         return Node;
15372 
15373       Users[Lane] = User;
15374       NewDmask |= 1 << Comp;
15375     }
15376   }
15377 
15378   // Don't allow 0 dmask, as hardware assumes one channel enabled.
15379   bool NoChannels = !NewDmask;
15380   if (NoChannels) {
15381     if (!UsesTFC) {
15382       // No uses of the result and not using TFC. Then do nothing.
15383       return Node;
15384     }
15385     // If the original dmask has one channel - then nothing to do
15386     if (OldBitsSet == 1)
15387       return Node;
15388     // Use an arbitrary dmask - required for the instruction to work
15389     NewDmask = 1;
15390   }
15391   // Abort if there's no change
15392   if (NewDmask == OldDmask)
15393     return Node;
15394 
15395   unsigned BitsSet = llvm::popcount(NewDmask);
15396 
15397   // Check for TFE or LWE - increase the number of channels by one to account
15398   // for the extra return value
15399   // This will need adjustment for D16 if this is also included in
15400   // adjustWriteMask (this function) but at present D16 are excluded.
15401   unsigned NewChannels = BitsSet + UsesTFC;
15402 
15403   int NewOpcode =
15404       AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15405   assert(NewOpcode != -1 &&
15406          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15407          "failed to find equivalent MIMG op");
15408 
15409   // Adjust the writemask in the node
15410   SmallVector<SDValue, 12> Ops;
15411   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
15412   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15413   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
15414 
15415   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15416 
15417   MVT ResultVT = NewChannels == 1
15418                      ? SVT
15419                      : MVT::getVectorVT(SVT, NewChannels == 3   ? 4
15420                                              : NewChannels == 5 ? 8
15421                                                                 : NewChannels);
15422   SDVTList NewVTList =
15423       HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15424 
15425   MachineSDNode *NewNode =
15426       DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15427 
15428   if (HasChain) {
15429     // Update chain.
15430     DAG.setNodeMemRefs(NewNode, Node->memoperands());
15431     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15432   }
15433 
15434   if (NewChannels == 1) {
15435     assert(Node->hasNUsesOfValue(1, 0));
15436     SDNode *Copy =
15437         DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15438                            Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15439     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15440     return nullptr;
15441   }
15442 
15443   // Update the users of the node with the new indices
15444   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15445     SDNode *User = Users[i];
15446     if (!User) {
15447       // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15448       // Users[0] is still nullptr because channel 0 doesn't really have a use.
15449       if (i || !NoChannels)
15450         continue;
15451     } else {
15452       SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
15453       SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
15454       if (NewUser != User) {
15455         DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
15456         DAG.RemoveDeadNode(User);
15457       }
15458     }
15459 
15460     switch (Idx) {
15461     default:
15462       break;
15463     case AMDGPU::sub0:
15464       Idx = AMDGPU::sub1;
15465       break;
15466     case AMDGPU::sub1:
15467       Idx = AMDGPU::sub2;
15468       break;
15469     case AMDGPU::sub2:
15470       Idx = AMDGPU::sub3;
15471       break;
15472     case AMDGPU::sub3:
15473       Idx = AMDGPU::sub4;
15474       break;
15475     }
15476   }
15477 
15478   DAG.RemoveDeadNode(Node);
15479   return nullptr;
15480 }
15481 
15482 static bool isFrameIndexOp(SDValue Op) {
15483   if (Op.getOpcode() == ISD::AssertZext)
15484     Op = Op.getOperand(0);
15485 
15486   return isa<FrameIndexSDNode>(Op);
15487 }
15488 
15489 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
15490 /// with frame index operands.
15491 /// LLVM assumes that inputs are to these instructions are registers.
15492 SDNode *
15493 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15494                                                 SelectionDAG &DAG) const {
15495   if (Node->getOpcode() == ISD::CopyToReg) {
15496     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15497     SDValue SrcVal = Node->getOperand(2);
15498 
15499     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15500     // to try understanding copies to physical registers.
15501     if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15502       SDLoc SL(Node);
15503       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15504       SDValue VReg = DAG.getRegister(
15505           MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15506 
15507       SDNode *Glued = Node->getGluedNode();
15508       SDValue ToVReg = DAG.getCopyToReg(
15509           Node->getOperand(0), SL, VReg, SrcVal,
15510           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15511       SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15512                                              VReg, ToVReg.getValue(1));
15513       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15514       DAG.RemoveDeadNode(Node);
15515       return ToResultReg.getNode();
15516     }
15517   }
15518 
15519   SmallVector<SDValue, 8> Ops;
15520   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15521     if (!isFrameIndexOp(Node->getOperand(i))) {
15522       Ops.push_back(Node->getOperand(i));
15523       continue;
15524     }
15525 
15526     SDLoc DL(Node);
15527     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15528                                              Node->getOperand(i).getValueType(),
15529                                              Node->getOperand(i)),
15530                           0));
15531   }
15532 
15533   return DAG.UpdateNodeOperands(Node, Ops);
15534 }
15535 
15536 /// Fold the instructions after selecting them.
15537 /// Returns null if users were already updated.
15538 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15539                                           SelectionDAG &DAG) const {
15540   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15541   unsigned Opcode = Node->getMachineOpcode();
15542 
15543   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15544       !TII->isGather4(Opcode) &&
15545       AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15546     return adjustWritemask(Node, DAG);
15547   }
15548 
15549   if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
15550     legalizeTargetIndependentNode(Node, DAG);
15551     return Node;
15552   }
15553 
15554   switch (Opcode) {
15555   case AMDGPU::V_DIV_SCALE_F32_e64:
15556   case AMDGPU::V_DIV_SCALE_F64_e64: {
15557     // Satisfy the operand register constraint when one of the inputs is
15558     // undefined. Ordinarily each undef value will have its own implicit_def of
15559     // a vreg, so force these to use a single register.
15560     SDValue Src0 = Node->getOperand(1);
15561     SDValue Src1 = Node->getOperand(3);
15562     SDValue Src2 = Node->getOperand(5);
15563 
15564     if ((Src0.isMachineOpcode() &&
15565          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15566         (Src0 == Src1 || Src0 == Src2))
15567       break;
15568 
15569     MVT VT = Src0.getValueType().getSimpleVT();
15570     const TargetRegisterClass *RC =
15571         getRegClassFor(VT, Src0.getNode()->isDivergent());
15572 
15573     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15574     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15575 
15576     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
15577                                       Src0, SDValue());
15578 
15579     // src0 must be the same register as src1 or src2, even if the value is
15580     // undefined, so make sure we don't violate this constraint.
15581     if (Src0.isMachineOpcode() &&
15582         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15583       if (Src1.isMachineOpcode() &&
15584           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15585         Src0 = Src1;
15586       else if (Src2.isMachineOpcode() &&
15587                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15588         Src0 = Src2;
15589       else {
15590         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15591         Src0 = UndefReg;
15592         Src1 = UndefReg;
15593       }
15594     } else
15595       break;
15596 
15597     SmallVector<SDValue, 9> Ops(Node->ops());
15598     Ops[1] = Src0;
15599     Ops[3] = Src1;
15600     Ops[5] = Src2;
15601     Ops.push_back(ImpDef.getValue(1));
15602     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15603   }
15604   default:
15605     break;
15606   }
15607 
15608   return Node;
15609 }
15610 
15611 // Any MIMG instructions that use tfe or lwe require an initialization of the
15612 // result register that will be written in the case of a memory access failure.
15613 // The required code is also added to tie this init code to the result of the
15614 // img instruction.
15615 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15616   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15617   const SIRegisterInfo &TRI = TII->getRegisterInfo();
15618   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15619   MachineBasicBlock &MBB = *MI.getParent();
15620 
15621   int DstIdx =
15622       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15623   unsigned InitIdx = 0;
15624 
15625   if (TII->isImage(MI)) {
15626     MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15627     MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15628     MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15629 
15630     if (!TFE && !LWE) // intersect_ray
15631       return;
15632 
15633     unsigned TFEVal = TFE ? TFE->getImm() : 0;
15634     unsigned LWEVal = LWE ? LWE->getImm() : 0;
15635     unsigned D16Val = D16 ? D16->getImm() : 0;
15636 
15637     if (!TFEVal && !LWEVal)
15638       return;
15639 
15640     // At least one of TFE or LWE are non-zero
15641     // We have to insert a suitable initialization of the result value and
15642     // tie this to the dest of the image instruction.
15643 
15644     // Calculate which dword we have to initialize to 0.
15645     MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15646 
15647     // check that dmask operand is found.
15648     assert(MO_Dmask && "Expected dmask operand in instruction");
15649 
15650     unsigned dmask = MO_Dmask->getImm();
15651     // Determine the number of active lanes taking into account the
15652     // Gather4 special case
15653     unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15654 
15655     bool Packed = !Subtarget->hasUnpackedD16VMem();
15656 
15657     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15658 
15659     // Abandon attempt if the dst size isn't large enough
15660     // - this is in fact an error but this is picked up elsewhere and
15661     // reported correctly.
15662     uint32_t DstSize =
15663         TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15664     if (DstSize < InitIdx)
15665       return;
15666   } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15667     InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15668   } else {
15669     return;
15670   }
15671 
15672   const DebugLoc &DL = MI.getDebugLoc();
15673 
15674   // Create a register for the initialization value.
15675   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15676   unsigned NewDst = 0; // Final initialized value will be in here
15677 
15678   // If PRTStrictNull feature is enabled (the default) then initialize
15679   // all the result registers to 0, otherwise just the error indication
15680   // register (VGPRn+1)
15681   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15682   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15683 
15684   BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15685   for (; SizeLeft; SizeLeft--, CurrIdx++) {
15686     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15687     // Initialize dword
15688     Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15689     // clang-format off
15690     BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15691         .addImm(0);
15692     // clang-format on
15693     // Insert into the super-reg
15694     BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15695         .addReg(PrevDst)
15696         .addReg(SubReg)
15697         .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15698 
15699     PrevDst = NewDst;
15700   }
15701 
15702   // Add as an implicit operand
15703   MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15704 
15705   // Tie the just added implicit operand to the dst
15706   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15707 }
15708 
15709 /// Assign the register class depending on the number of
15710 /// bits set in the writemask
15711 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15712                                                      SDNode *Node) const {
15713   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15714 
15715   MachineFunction *MF = MI.getParent()->getParent();
15716   MachineRegisterInfo &MRI = MF->getRegInfo();
15717   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15718 
15719   if (TII->isVOP3(MI.getOpcode())) {
15720     // Make sure constant bus requirements are respected.
15721     TII->legalizeOperandsVOP3(MRI, MI);
15722 
15723     // Prefer VGPRs over AGPRs in mAI instructions where possible.
15724     // This saves a chain-copy of registers and better balance register
15725     // use between vgpr and agpr as agpr tuples tend to be big.
15726     if (!MI.getDesc().operands().empty()) {
15727       unsigned Opc = MI.getOpcode();
15728       bool HasAGPRs = Info->mayNeedAGPRs();
15729       const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15730       int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15731       for (auto I :
15732            {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15733             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15734         if (I == -1)
15735           break;
15736         if ((I == Src2Idx) && (HasAGPRs))
15737           break;
15738         MachineOperand &Op = MI.getOperand(I);
15739         if (!Op.isReg() || !Op.getReg().isVirtual())
15740           continue;
15741         auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15742         if (!TRI->hasAGPRs(RC))
15743           continue;
15744         auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15745         if (!Src || !Src->isCopy() ||
15746             !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15747           continue;
15748         auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15749         // All uses of agpr64 and agpr32 can also accept vgpr except for
15750         // v_accvgpr_read, but we do not produce agpr reads during selection,
15751         // so no use checks are needed.
15752         MRI.setRegClass(Op.getReg(), NewRC);
15753       }
15754 
15755       if (TII->isMAI(MI)) {
15756         // The ordinary src0, src1, src2 were legalized above.
15757         //
15758         // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
15759         // as a separate instruction.
15760         int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15761                                                  AMDGPU::OpName::scale_src0);
15762         if (Src0Idx != -1) {
15763           int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
15764                                                    AMDGPU::OpName::scale_src1);
15765           if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
15766               TII->usesConstantBus(MRI, MI, Src1Idx))
15767             TII->legalizeOpWithMove(MI, Src1Idx);
15768         }
15769       }
15770 
15771       if (!HasAGPRs)
15772         return;
15773 
15774       // Resolve the rest of AV operands to AGPRs.
15775       if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15776         if (Src2->isReg() && Src2->getReg().isVirtual()) {
15777           auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15778           if (TRI->isVectorSuperClass(RC)) {
15779             auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15780             MRI.setRegClass(Src2->getReg(), NewRC);
15781             if (Src2->isTied())
15782               MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15783           }
15784         }
15785       }
15786     }
15787 
15788     return;
15789   }
15790 
15791   if (TII->isImage(MI))
15792     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15793 }
15794 
15795 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15796                               uint64_t Val) {
15797   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15798   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15799 }
15800 
15801 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15802                                                 const SDLoc &DL,
15803                                                 SDValue Ptr) const {
15804   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15805 
15806   // Build the half of the subregister with the constants before building the
15807   // full 128-bit register. If we are building multiple resource descriptors,
15808   // this will allow CSEing of the 2-component register.
15809   const SDValue Ops0[] = {
15810       DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15811       buildSMovImm32(DAG, DL, 0),
15812       DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15813       buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15814       DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
15815 
15816   SDValue SubRegHi = SDValue(
15817       DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
15818 
15819   // Combine the constants and the pointer.
15820   const SDValue Ops1[] = {
15821       DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
15822       DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
15823       DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
15824 
15825   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15826 }
15827 
15828 /// Return a resource descriptor with the 'Add TID' bit enabled
15829 ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15830 ///        of the resource descriptor) to create an offset, which is added to
15831 ///        the resource pointer.
15832 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15833                                            SDValue Ptr, uint32_t RsrcDword1,
15834                                            uint64_t RsrcDword2And3) const {
15835   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15836   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15837   if (RsrcDword1) {
15838     PtrHi =
15839         SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15840                                    DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15841                 0);
15842   }
15843 
15844   SDValue DataLo =
15845       buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15846   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15847 
15848   const SDValue Ops[] = {
15849       DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15850       PtrLo,
15851       DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15852       PtrHi,
15853       DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15854       DataLo,
15855       DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15856       DataHi,
15857       DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
15858 
15859   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15860 }
15861 
15862 //===----------------------------------------------------------------------===//
15863 //                         SI Inline Assembly Support
15864 //===----------------------------------------------------------------------===//
15865 
15866 std::pair<unsigned, const TargetRegisterClass *>
15867 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15868                                                StringRef Constraint,
15869                                                MVT VT) const {
15870   const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15871 
15872   const TargetRegisterClass *RC = nullptr;
15873   if (Constraint.size() == 1) {
15874     const unsigned BitWidth = VT.getSizeInBits();
15875     switch (Constraint[0]) {
15876     default:
15877       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15878     case 's':
15879     case 'r':
15880       switch (BitWidth) {
15881       case 16:
15882         RC = &AMDGPU::SReg_32RegClass;
15883         break;
15884       case 64:
15885         RC = &AMDGPU::SGPR_64RegClass;
15886         break;
15887       default:
15888         RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
15889         if (!RC)
15890           return std::pair(0U, nullptr);
15891         break;
15892       }
15893       break;
15894     case 'v':
15895       switch (BitWidth) {
15896       case 16:
15897         RC = &AMDGPU::VGPR_32RegClass;
15898         break;
15899       default:
15900         RC = TRI->getVGPRClassForBitWidth(BitWidth);
15901         if (!RC)
15902           return std::pair(0U, nullptr);
15903         break;
15904       }
15905       break;
15906     case 'a':
15907       if (!Subtarget->hasMAIInsts())
15908         break;
15909       switch (BitWidth) {
15910       case 16:
15911         RC = &AMDGPU::AGPR_32RegClass;
15912         break;
15913       default:
15914         RC = TRI->getAGPRClassForBitWidth(BitWidth);
15915         if (!RC)
15916           return std::pair(0U, nullptr);
15917         break;
15918       }
15919       break;
15920     }
15921     // We actually support i128, i16 and f16 as inline parameters
15922     // even if they are not reported as legal
15923     if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15924                VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15925       return std::pair(0U, RC);
15926   }
15927 
15928   if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15929     StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15930     if (RegName.consume_front("v")) {
15931       RC = &AMDGPU::VGPR_32RegClass;
15932     } else if (RegName.consume_front("s")) {
15933       RC = &AMDGPU::SGPR_32RegClass;
15934     } else if (RegName.consume_front("a")) {
15935       RC = &AMDGPU::AGPR_32RegClass;
15936     }
15937 
15938     if (RC) {
15939       uint32_t Idx;
15940       if (RegName.consume_front("[")) {
15941         uint32_t End;
15942         bool Failed = RegName.consumeInteger(10, Idx);
15943         Failed |= !RegName.consume_front(":");
15944         Failed |= RegName.consumeInteger(10, End);
15945         Failed |= !RegName.consume_back("]");
15946         if (!Failed) {
15947           uint32_t Width = (End - Idx + 1) * 32;
15948           // Prohibit constraints for register ranges with a width that does not
15949           // match the required type.
15950           if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
15951             return std::pair(0U, nullptr);
15952           MCRegister Reg = RC->getRegister(Idx);
15953           if (SIRegisterInfo::isVGPRClass(RC))
15954             RC = TRI->getVGPRClassForBitWidth(Width);
15955           else if (SIRegisterInfo::isSGPRClass(RC))
15956             RC = TRI->getSGPRClassForBitWidth(Width);
15957           else if (SIRegisterInfo::isAGPRClass(RC))
15958             RC = TRI->getAGPRClassForBitWidth(Width);
15959           if (RC) {
15960             Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15961             if (!Reg) {
15962               // The register class does not contain the requested register,
15963               // e.g., because it is an SGPR pair that would violate alignment
15964               // requirements.
15965               return std::pair(0U, nullptr);
15966             }
15967             return std::pair(Reg, RC);
15968           }
15969         }
15970       } else {
15971         // Check for lossy scalar/vector conversions.
15972         if (VT.isVector() && VT.getSizeInBits() != 32)
15973           return std::pair(0U, nullptr);
15974         bool Failed = RegName.getAsInteger(10, Idx);
15975         if (!Failed && Idx < RC->getNumRegs())
15976           return std::pair(RC->getRegister(Idx), RC);
15977       }
15978     }
15979   }
15980 
15981   auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15982   if (Ret.first)
15983     Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15984 
15985   return Ret;
15986 }
15987 
15988 static bool isImmConstraint(StringRef Constraint) {
15989   if (Constraint.size() == 1) {
15990     switch (Constraint[0]) {
15991     default:
15992       break;
15993     case 'I':
15994     case 'J':
15995     case 'A':
15996     case 'B':
15997     case 'C':
15998       return true;
15999     }
16000   } else if (Constraint == "DA" || Constraint == "DB") {
16001     return true;
16002   }
16003   return false;
16004 }
16005 
16006 SITargetLowering::ConstraintType
16007 SITargetLowering::getConstraintType(StringRef Constraint) const {
16008   if (Constraint.size() == 1) {
16009     switch (Constraint[0]) {
16010     default:
16011       break;
16012     case 's':
16013     case 'v':
16014     case 'a':
16015       return C_RegisterClass;
16016     }
16017   }
16018   if (isImmConstraint(Constraint)) {
16019     return C_Other;
16020   }
16021   return TargetLowering::getConstraintType(Constraint);
16022 }
16023 
16024 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
16025   if (!AMDGPU::isInlinableIntLiteral(Val)) {
16026     Val = Val & maskTrailingOnes<uint64_t>(Size);
16027   }
16028   return Val;
16029 }
16030 
16031 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16032                                                     StringRef Constraint,
16033                                                     std::vector<SDValue> &Ops,
16034                                                     SelectionDAG &DAG) const {
16035   if (isImmConstraint(Constraint)) {
16036     uint64_t Val;
16037     if (getAsmOperandConstVal(Op, Val) &&
16038         checkAsmConstraintVal(Op, Constraint, Val)) {
16039       Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
16040       Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
16041     }
16042   } else {
16043     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16044   }
16045 }
16046 
16047 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
16048   unsigned Size = Op.getScalarValueSizeInBits();
16049   if (Size > 64)
16050     return false;
16051 
16052   if (Size == 16 && !Subtarget->has16BitInsts())
16053     return false;
16054 
16055   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16056     Val = C->getSExtValue();
16057     return true;
16058   }
16059   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
16060     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16061     return true;
16062   }
16063   if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
16064     if (Size != 16 || Op.getNumOperands() != 2)
16065       return false;
16066     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
16067       return false;
16068     if (ConstantSDNode *C = V->getConstantSplatNode()) {
16069       Val = C->getSExtValue();
16070       return true;
16071     }
16072     if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
16073       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16074       return true;
16075     }
16076   }
16077 
16078   return false;
16079 }
16080 
16081 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16082                                              uint64_t Val) const {
16083   if (Constraint.size() == 1) {
16084     switch (Constraint[0]) {
16085     case 'I':
16086       return AMDGPU::isInlinableIntLiteral(Val);
16087     case 'J':
16088       return isInt<16>(Val);
16089     case 'A':
16090       return checkAsmConstraintValA(Op, Val);
16091     case 'B':
16092       return isInt<32>(Val);
16093     case 'C':
16094       return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16095              AMDGPU::isInlinableIntLiteral(Val);
16096     default:
16097       break;
16098     }
16099   } else if (Constraint.size() == 2) {
16100     if (Constraint == "DA") {
16101       int64_t HiBits = static_cast<int32_t>(Val >> 32);
16102       int64_t LoBits = static_cast<int32_t>(Val);
16103       return checkAsmConstraintValA(Op, HiBits, 32) &&
16104              checkAsmConstraintValA(Op, LoBits, 32);
16105     }
16106     if (Constraint == "DB") {
16107       return true;
16108     }
16109   }
16110   llvm_unreachable("Invalid asm constraint");
16111 }
16112 
16113 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16114                                               unsigned MaxSize) const {
16115   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16116   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16117   if (Size == 16) {
16118     MVT VT = Op.getSimpleValueType();
16119     switch (VT.SimpleTy) {
16120     default:
16121       return false;
16122     case MVT::i16:
16123       return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16124     case MVT::f16:
16125       return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16126     case MVT::bf16:
16127       return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16128     case MVT::v2i16:
16129       return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16130     case MVT::v2f16:
16131       return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16132     case MVT::v2bf16:
16133       return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16134     }
16135   }
16136   if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16137       (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16138     return true;
16139   return false;
16140 }
16141 
16142 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16143   switch (UnalignedClassID) {
16144   case AMDGPU::VReg_64RegClassID:
16145     return AMDGPU::VReg_64_Align2RegClassID;
16146   case AMDGPU::VReg_96RegClassID:
16147     return AMDGPU::VReg_96_Align2RegClassID;
16148   case AMDGPU::VReg_128RegClassID:
16149     return AMDGPU::VReg_128_Align2RegClassID;
16150   case AMDGPU::VReg_160RegClassID:
16151     return AMDGPU::VReg_160_Align2RegClassID;
16152   case AMDGPU::VReg_192RegClassID:
16153     return AMDGPU::VReg_192_Align2RegClassID;
16154   case AMDGPU::VReg_224RegClassID:
16155     return AMDGPU::VReg_224_Align2RegClassID;
16156   case AMDGPU::VReg_256RegClassID:
16157     return AMDGPU::VReg_256_Align2RegClassID;
16158   case AMDGPU::VReg_288RegClassID:
16159     return AMDGPU::VReg_288_Align2RegClassID;
16160   case AMDGPU::VReg_320RegClassID:
16161     return AMDGPU::VReg_320_Align2RegClassID;
16162   case AMDGPU::VReg_352RegClassID:
16163     return AMDGPU::VReg_352_Align2RegClassID;
16164   case AMDGPU::VReg_384RegClassID:
16165     return AMDGPU::VReg_384_Align2RegClassID;
16166   case AMDGPU::VReg_512RegClassID:
16167     return AMDGPU::VReg_512_Align2RegClassID;
16168   case AMDGPU::VReg_1024RegClassID:
16169     return AMDGPU::VReg_1024_Align2RegClassID;
16170   case AMDGPU::AReg_64RegClassID:
16171     return AMDGPU::AReg_64_Align2RegClassID;
16172   case AMDGPU::AReg_96RegClassID:
16173     return AMDGPU::AReg_96_Align2RegClassID;
16174   case AMDGPU::AReg_128RegClassID:
16175     return AMDGPU::AReg_128_Align2RegClassID;
16176   case AMDGPU::AReg_160RegClassID:
16177     return AMDGPU::AReg_160_Align2RegClassID;
16178   case AMDGPU::AReg_192RegClassID:
16179     return AMDGPU::AReg_192_Align2RegClassID;
16180   case AMDGPU::AReg_256RegClassID:
16181     return AMDGPU::AReg_256_Align2RegClassID;
16182   case AMDGPU::AReg_512RegClassID:
16183     return AMDGPU::AReg_512_Align2RegClassID;
16184   case AMDGPU::AReg_1024RegClassID:
16185     return AMDGPU::AReg_1024_Align2RegClassID;
16186   default:
16187     return -1;
16188   }
16189 }
16190 
16191 // Figure out which registers should be reserved for stack access. Only after
16192 // the function is legalized do we know all of the non-spill stack objects or if
16193 // calls are present.
16194 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16195   MachineRegisterInfo &MRI = MF.getRegInfo();
16196   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16197   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16198   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16199   const SIInstrInfo *TII = ST.getInstrInfo();
16200 
16201   if (Info->isEntryFunction()) {
16202     // Callable functions have fixed registers used for stack access.
16203     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16204   }
16205 
16206   // TODO: Move this logic to getReservedRegs()
16207   // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16208   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16209   Register SReg = ST.isWave32()
16210                       ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16211                       : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16212                                                      &AMDGPU::SGPR_64RegClass);
16213   Info->setSGPRForEXECCopy(SReg);
16214 
16215   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16216                              Info->getStackPtrOffsetReg()));
16217   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16218     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16219 
16220   // We need to worry about replacing the default register with itself in case
16221   // of MIR testcases missing the MFI.
16222   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16223     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16224 
16225   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16226     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16227 
16228   Info->limitOccupancy(MF);
16229 
16230   if (ST.isWave32() && !MF.empty()) {
16231     for (auto &MBB : MF) {
16232       for (auto &MI : MBB) {
16233         TII->fixImplicitOperands(MI);
16234       }
16235     }
16236   }
16237 
16238   // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16239   // classes if required. Ideally the register class constraints would differ
16240   // per-subtarget, but there's no easy way to achieve that right now. This is
16241   // not a problem for VGPRs because the correctly aligned VGPR class is implied
16242   // from using them as the register class for legal types.
16243   if (ST.needsAlignedVGPRs()) {
16244     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16245       const Register Reg = Register::index2VirtReg(I);
16246       const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16247       if (!RC)
16248         continue;
16249       int NewClassID = getAlignedAGPRClassID(RC->getID());
16250       if (NewClassID != -1)
16251         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16252     }
16253   }
16254 
16255   TargetLoweringBase::finalizeLowering(MF);
16256 }
16257 
16258 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16259                                                      KnownBits &Known,
16260                                                      const APInt &DemandedElts,
16261                                                      const SelectionDAG &DAG,
16262                                                      unsigned Depth) const {
16263   Known.resetAll();
16264   unsigned Opc = Op.getOpcode();
16265   switch (Opc) {
16266   case ISD::INTRINSIC_WO_CHAIN: {
16267     unsigned IID = Op.getConstantOperandVal(0);
16268     switch (IID) {
16269     case Intrinsic::amdgcn_mbcnt_lo:
16270     case Intrinsic::amdgcn_mbcnt_hi: {
16271       const GCNSubtarget &ST =
16272           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16273       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16274       // most 31 + src1.
16275       Known.Zero.setBitsFrom(
16276           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16277       KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16278       Known = KnownBits::add(Known, Known2);
16279       return;
16280     }
16281     }
16282     break;
16283   }
16284   }
16285   return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16286       Op, Known, DemandedElts, DAG, Depth);
16287 }
16288 
16289 void SITargetLowering::computeKnownBitsForFrameIndex(
16290     const int FI, KnownBits &Known, const MachineFunction &MF) const {
16291   TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16292 
16293   // Set the high bits to zero based on the maximum allowed scratch size per
16294   // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16295   // calculation won't overflow, so assume the sign bit is never set.
16296   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16297 }
16298 
16299 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
16300                                    KnownBits &Known, unsigned Dim) {
16301   unsigned MaxValue =
16302       ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
16303   Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16304 }
16305 
16306 void SITargetLowering::computeKnownBitsForTargetInstr(
16307     GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
16308     const MachineRegisterInfo &MRI, unsigned Depth) const {
16309   const MachineInstr *MI = MRI.getVRegDef(R);
16310   switch (MI->getOpcode()) {
16311   case AMDGPU::G_INTRINSIC:
16312   case AMDGPU::G_INTRINSIC_CONVERGENT: {
16313     Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16314     switch (IID) {
16315     case Intrinsic::amdgcn_workitem_id_x:
16316       knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
16317       break;
16318     case Intrinsic::amdgcn_workitem_id_y:
16319       knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
16320       break;
16321     case Intrinsic::amdgcn_workitem_id_z:
16322       knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
16323       break;
16324     case Intrinsic::amdgcn_mbcnt_lo:
16325     case Intrinsic::amdgcn_mbcnt_hi: {
16326       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16327       // most 31 + src1.
16328       Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16329                                  ? getSubtarget()->getWavefrontSizeLog2()
16330                                  : 5);
16331       KnownBits Known2;
16332       KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16333                               Depth + 1);
16334       Known = KnownBits::add(Known, Known2);
16335       break;
16336     }
16337     case Intrinsic::amdgcn_groupstaticsize: {
16338       // We can report everything over the maximum size as 0. We can't report
16339       // based on the actual size because we don't know if it's accurate or not
16340       // at any given point.
16341       Known.Zero.setHighBits(
16342           llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16343       break;
16344     }
16345     }
16346     break;
16347   }
16348   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16349     Known.Zero.setHighBits(24);
16350     break;
16351   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16352     Known.Zero.setHighBits(16);
16353     break;
16354   case AMDGPU::G_AMDGPU_SMED3:
16355   case AMDGPU::G_AMDGPU_UMED3: {
16356     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16357 
16358     KnownBits Known2;
16359     KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16360     if (Known2.isUnknown())
16361       break;
16362 
16363     KnownBits Known1;
16364     KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16365     if (Known1.isUnknown())
16366       break;
16367 
16368     KnownBits Known0;
16369     KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16370     if (Known0.isUnknown())
16371       break;
16372 
16373     // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16374     Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16375     Known.One = Known0.One & Known1.One & Known2.One;
16376     break;
16377   }
16378   }
16379 }
16380 
16381 Align SITargetLowering::computeKnownAlignForTargetInstr(
16382     GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
16383     unsigned Depth) const {
16384   const MachineInstr *MI = MRI.getVRegDef(R);
16385   if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16386     // FIXME: Can this move to generic code? What about the case where the call
16387     // site specifies a lower alignment?
16388     Intrinsic::ID IID = GI->getIntrinsicID();
16389     LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
16390     AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
16391     if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16392       return *RetAlign;
16393   }
16394   return Align(1);
16395 }
16396 
16397 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16398   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16399   const Align CacheLineAlign = Align(64);
16400 
16401   // Pre-GFX10 target did not benefit from loop alignment
16402   if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16403       getSubtarget()->hasInstFwdPrefetchBug())
16404     return PrefAlign;
16405 
16406   // On GFX10 I$ is 4 x 64 bytes cache lines.
16407   // By default prefetcher keeps one cache line behind and reads two ahead.
16408   // We can modify it with S_INST_PREFETCH for larger loops to have two lines
16409   // behind and one ahead.
16410   // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
16411   // If loop fits 64 bytes it always spans no more than two cache lines and
16412   // does not need an alignment.
16413   // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
16414   // Else if loop is less or equal 192 bytes we need two lines behind.
16415 
16416   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16417   const MachineBasicBlock *Header = ML->getHeader();
16418   if (Header->getAlignment() != PrefAlign)
16419     return Header->getAlignment(); // Already processed.
16420 
16421   unsigned LoopSize = 0;
16422   for (const MachineBasicBlock *MBB : ML->blocks()) {
16423     // If inner loop block is aligned assume in average half of the alignment
16424     // size to be added as nops.
16425     if (MBB != Header)
16426       LoopSize += MBB->getAlignment().value() / 2;
16427 
16428     for (const MachineInstr &MI : *MBB) {
16429       LoopSize += TII->getInstSizeInBytes(MI);
16430       if (LoopSize > 192)
16431         return PrefAlign;
16432     }
16433   }
16434 
16435   if (LoopSize <= 64)
16436     return PrefAlign;
16437 
16438   if (LoopSize <= 128)
16439     return CacheLineAlign;
16440 
16441   // If any of parent loops is surrounded by prefetch instructions do not
16442   // insert new for inner loop, which would reset parent's settings.
16443   for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16444     if (MachineBasicBlock *Exit = P->getExitBlock()) {
16445       auto I = Exit->getFirstNonDebugInstr();
16446       if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16447         return CacheLineAlign;
16448     }
16449   }
16450 
16451   MachineBasicBlock *Pre = ML->getLoopPreheader();
16452   MachineBasicBlock *Exit = ML->getExitBlock();
16453 
16454   if (Pre && Exit) {
16455     auto PreTerm = Pre->getFirstTerminator();
16456     if (PreTerm == Pre->begin() ||
16457         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
16458       BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16459           .addImm(1); // prefetch 2 lines behind PC
16460 
16461     auto ExitHead = Exit->getFirstNonDebugInstr();
16462     if (ExitHead == Exit->end() ||
16463         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
16464       BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
16465           .addImm(2); // prefetch 1 line behind PC
16466   }
16467 
16468   return CacheLineAlign;
16469 }
16470 
16471 LLVM_ATTRIBUTE_UNUSED
16472 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16473   assert(N->getOpcode() == ISD::CopyFromReg);
16474   do {
16475     // Follow the chain until we find an INLINEASM node.
16476     N = N->getOperand(0).getNode();
16477     if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
16478       return true;
16479   } while (N->getOpcode() == ISD::CopyFromReg);
16480   return false;
16481 }
16482 
16483 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
16484                                                   FunctionLoweringInfo *FLI,
16485                                                   UniformityInfo *UA) const {
16486   switch (N->getOpcode()) {
16487   case ISD::CopyFromReg: {
16488     const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
16489     const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
16490     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16491     Register Reg = R->getReg();
16492 
16493     // FIXME: Why does this need to consider isLiveIn?
16494     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
16495       return !TRI->isSGPRReg(MRI, Reg);
16496 
16497     if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
16498       return UA->isDivergent(V);
16499 
16500     assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
16501     return !TRI->isSGPRReg(MRI, Reg);
16502   }
16503   case ISD::LOAD: {
16504     const LoadSDNode *L = cast<LoadSDNode>(N);
16505     unsigned AS = L->getAddressSpace();
16506     // A flat load may access private memory.
16507     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
16508   }
16509   case ISD::CALLSEQ_END:
16510     return true;
16511   case ISD::INTRINSIC_WO_CHAIN:
16512     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16513   case ISD::INTRINSIC_W_CHAIN:
16514     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16515   case AMDGPUISD::ATOMIC_CMP_SWAP:
16516   case AMDGPUISD::BUFFER_ATOMIC_SWAP:
16517   case AMDGPUISD::BUFFER_ATOMIC_ADD:
16518   case AMDGPUISD::BUFFER_ATOMIC_SUB:
16519   case AMDGPUISD::BUFFER_ATOMIC_SMIN:
16520   case AMDGPUISD::BUFFER_ATOMIC_UMIN:
16521   case AMDGPUISD::BUFFER_ATOMIC_SMAX:
16522   case AMDGPUISD::BUFFER_ATOMIC_UMAX:
16523   case AMDGPUISD::BUFFER_ATOMIC_AND:
16524   case AMDGPUISD::BUFFER_ATOMIC_OR:
16525   case AMDGPUISD::BUFFER_ATOMIC_XOR:
16526   case AMDGPUISD::BUFFER_ATOMIC_INC:
16527   case AMDGPUISD::BUFFER_ATOMIC_DEC:
16528   case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
16529   case AMDGPUISD::BUFFER_ATOMIC_CSUB:
16530   case AMDGPUISD::BUFFER_ATOMIC_FADD:
16531   case AMDGPUISD::BUFFER_ATOMIC_FMIN:
16532   case AMDGPUISD::BUFFER_ATOMIC_FMAX:
16533     // Target-specific read-modify-write atomics are sources of divergence.
16534     return true;
16535   default:
16536     if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16537       // Generic read-modify-write atomics are sources of divergence.
16538       return A->readMem() && A->writeMem();
16539     }
16540     return false;
16541   }
16542 }
16543 
16544 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16545                                                EVT VT) const {
16546   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16547   case MVT::f32:
16548     return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16549   case MVT::f64:
16550   case MVT::f16:
16551     return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16552   default:
16553     return false;
16554   }
16555 }
16556 
16557 bool SITargetLowering::denormalsEnabledForType(
16558     LLT Ty, const MachineFunction &MF) const {
16559   switch (Ty.getScalarSizeInBits()) {
16560   case 32:
16561     return !denormalModeIsFlushAllF32(MF);
16562   case 64:
16563   case 16:
16564     return !denormalModeIsFlushAllF64F16(MF);
16565   default:
16566     return false;
16567   }
16568 }
16569 
16570 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16571                                                     const SelectionDAG &DAG,
16572                                                     bool SNaN,
16573                                                     unsigned Depth) const {
16574   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16575     const MachineFunction &MF = DAG.getMachineFunction();
16576     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16577 
16578     if (Info->getMode().DX10Clamp)
16579       return true; // Clamped to 0.
16580     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16581   }
16582 
16583   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
16584                                                             Depth);
16585 }
16586 
16587 // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16588 // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16589 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16590   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16591     return true;
16592 
16593   const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16594   auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16595   if (DenormMode == DenormalMode::getPreserveSign())
16596     return true;
16597 
16598   // TODO: Remove this.
16599   return RMW->getFunction()
16600       ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16601       .getValueAsBool();
16602 }
16603 
16604 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16605   LLVMContext &Ctx = RMW->getContext();
16606   StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
16607   StringRef MemScope = SS.empty() ? StringRef("system") : SS;
16608 
16609   return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16610          << "Hardware instruction generated for atomic "
16611          << RMW->getOperationName(RMW->getOperation())
16612          << " operation at memory scope " << MemScope;
16613 }
16614 
16615 static bool isV2F16OrV2BF16(Type *Ty) {
16616   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16617     Type *EltTy = VT->getElementType();
16618     return VT->getNumElements() == 2 &&
16619            (EltTy->isHalfTy() || EltTy->isBFloatTy());
16620   }
16621 
16622   return false;
16623 }
16624 
16625 static bool isV2F16(Type *Ty) {
16626   FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16627   return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16628 }
16629 
16630 static bool isV2BF16(Type *Ty) {
16631   FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16632   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16633 }
16634 
16635 /// \return true if atomicrmw integer ops work for the type.
16636 static bool isAtomicRMWLegalIntTy(Type *Ty) {
16637   if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16638     unsigned BW = IT->getBitWidth();
16639     return BW == 32 || BW == 64;
16640   }
16641 
16642   return false;
16643 }
16644 
16645 /// \return true if this atomicrmw xchg type can be selected.
16646 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16647   Type *Ty = RMW->getType();
16648   if (isAtomicRMWLegalIntTy(Ty))
16649     return true;
16650 
16651   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16652     const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16653     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16654     return BW == 32 || BW == 64;
16655   }
16656 
16657   if (Ty->isFloatTy() || Ty->isDoubleTy())
16658     return true;
16659 
16660   if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16661     return VT->getNumElements() == 2 &&
16662            VT->getElementType()->getPrimitiveSizeInBits() == 16;
16663   }
16664 
16665   return false;
16666 }
16667 
16668 /// \returns true if it's valid to emit a native instruction for \p RMW, based
16669 /// on the properties of the target memory.
16670 static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16671                                         const AtomicRMWInst *RMW,
16672                                         bool HasSystemScope) {
16673   // The remote/fine-grained access logic is different from the integer
16674   // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16675   // fine-grained access does not work, even for a device local allocation.
16676   //
16677   // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16678   // allocations work.
16679   if (HasSystemScope) {
16680     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16681         RMW->hasMetadata("amdgpu.no.remote.memory"))
16682       return true;
16683   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16684     return true;
16685 
16686   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16687 }
16688 
16689 /// \return Action to perform on AtomicRMWInsts for integer operations.
16690 static TargetLowering::AtomicExpansionKind
16691 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16692   return isAtomicRMWLegalIntTy(RMW->getType())
16693              ? TargetLowering::AtomicExpansionKind::None
16694              : TargetLowering::AtomicExpansionKind::CmpXChg;
16695 }
16696 
16697 /// Return if a flat address space atomicrmw can access private memory.
16698 static bool flatInstrMayAccessPrivate(const Instruction *I) {
16699   const MDNode *NoaliasAddrSpaceMD =
16700       I->getMetadata(LLVMContext::MD_noalias_addrspace);
16701   if (!NoaliasAddrSpaceMD)
16702     return true;
16703 
16704   for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16705        ++I) {
16706     auto *Low = mdconst::extract<ConstantInt>(
16707         NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16708     if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16709       auto *High = mdconst::extract<ConstantInt>(
16710           NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16711       return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16712     }
16713   }
16714 
16715   return true;
16716 }
16717 
16718 TargetLowering::AtomicExpansionKind
16719 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16720   unsigned AS = RMW->getPointerAddressSpace();
16721   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16722     return AtomicExpansionKind::NotAtomic;
16723 
16724   // 64-bit flat atomics that dynamically reside in private memory will silently
16725   // be dropped.
16726   //
16727   // Note that we will emit a new copy of the original atomic in the expansion,
16728   // which will be incrementally relegalized.
16729   const DataLayout &DL = RMW->getFunction()->getDataLayout();
16730   if (AS == AMDGPUAS::FLAT_ADDRESS &&
16731       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16732       flatInstrMayAccessPrivate(RMW))
16733     return AtomicExpansionKind::Expand;
16734 
16735   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16736     OptimizationRemarkEmitter ORE(RMW->getFunction());
16737     ORE.emit([=]() {
16738       return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16739     });
16740     return Kind;
16741   };
16742 
16743   auto SSID = RMW->getSyncScopeID();
16744   bool HasSystemScope =
16745       SSID == SyncScope::System ||
16746       SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16747 
16748   auto Op = RMW->getOperation();
16749   switch (Op) {
16750   case AtomicRMWInst::Xchg: {
16751     // PCIe supports add and xchg for system atomics.
16752     return isAtomicRMWLegalXChgTy(RMW)
16753                ? TargetLowering::AtomicExpansionKind::None
16754                : TargetLowering::AtomicExpansionKind::CmpXChg;
16755   }
16756   case AtomicRMWInst::Add:
16757   case AtomicRMWInst::And:
16758   case AtomicRMWInst::UIncWrap:
16759   case AtomicRMWInst::UDecWrap:
16760     return atomicSupportedIfLegalIntType(RMW);
16761   case AtomicRMWInst::Sub:
16762   case AtomicRMWInst::Or:
16763   case AtomicRMWInst::Xor: {
16764     // Atomic sub/or/xor do not work over PCI express, but atomic add
16765     // does. InstCombine transforms these with 0 to or, so undo that.
16766     if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16767       if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16768           ConstVal && ConstVal->isNullValue())
16769         return AtomicExpansionKind::Expand;
16770     }
16771 
16772     return atomicSupportedIfLegalIntType(RMW);
16773   }
16774   case AtomicRMWInst::FAdd: {
16775     Type *Ty = RMW->getType();
16776 
16777     // TODO: Handle REGION_ADDRESS
16778     if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16779       // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16780       // is fixed to round-to-nearest-even.
16781       //
16782       // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16783       // round-to-nearest-even.
16784       //
16785       // We ignore the rounding mode problem, even in strictfp. The C++ standard
16786       // suggests it is OK if the floating-point mode may not match the calling
16787       // thread.
16788       if (Ty->isFloatTy()) {
16789         return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16790                                                  : AtomicExpansionKind::CmpXChg;
16791       }
16792 
16793       if (Ty->isDoubleTy()) {
16794         // Ignores denormal mode, but we don't consider flushing mandatory.
16795         return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16796                                                  : AtomicExpansionKind::CmpXChg;
16797       }
16798 
16799       if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16800         return AtomicExpansionKind::None;
16801 
16802       return AtomicExpansionKind::CmpXChg;
16803     }
16804 
16805     // LDS atomics respect the denormal mode from the mode register.
16806     //
16807     // Traditionally f32 global/buffer memory atomics would unconditionally
16808     // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16809     // flush.
16810     //
16811     // On targets with flat atomic fadd, denormals would flush depending on
16812     // whether the target address resides in LDS or global memory. We consider
16813     // this flat-maybe-flush as will-flush.
16814     if (Ty->isFloatTy() &&
16815         !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16816         !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16817       return AtomicExpansionKind::CmpXChg;
16818 
16819     // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16820     // safe. The message phrasing also should be better.
16821     if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16822       if (AS == AMDGPUAS::FLAT_ADDRESS) {
16823         // gfx940, gfx12
16824         if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16825           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16826       } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16827         // gfx90a, gfx940, gfx12
16828         if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16829           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16830 
16831         // gfx940, gfx12
16832         if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16833           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16834       } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16835         // gfx90a, gfx940, gfx12
16836         if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16837           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16838 
16839         // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16840         // buffer. gfx12 does have the buffer version.
16841         if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16842           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16843       }
16844 
16845       // global and flat atomic fadd f64: gfx90a, gfx940.
16846       if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16847         return ReportUnsafeHWInst(AtomicExpansionKind::None);
16848 
16849       if (AS != AMDGPUAS::FLAT_ADDRESS) {
16850         if (Ty->isFloatTy()) {
16851           // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16852           // gfx11+.
16853           if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16854             return ReportUnsafeHWInst(AtomicExpansionKind::None);
16855           // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16856           if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16857             return ReportUnsafeHWInst(AtomicExpansionKind::None);
16858         } else {
16859           // gfx908
16860           if (RMW->use_empty() &&
16861               Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16862               isV2F16(Ty))
16863             return ReportUnsafeHWInst(AtomicExpansionKind::None);
16864         }
16865       }
16866 
16867       // flat atomic fadd f32: gfx940, gfx11+.
16868       if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16869         if (Subtarget->hasFlatAtomicFaddF32Inst())
16870           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16871 
16872         // If it is in flat address space, and the type is float, we will try to
16873         // expand it, if the target supports global and lds atomic fadd. The
16874         // reason we need that is, in the expansion, we emit the check of
16875         // address space. If it is in global address space, we emit the global
16876         // atomic fadd; if it is in shared address space, we emit the LDS atomic
16877         // fadd.
16878         if (Subtarget->hasLDSFPAtomicAddF32()) {
16879           if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16880             return AtomicExpansionKind::Expand;
16881           if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16882             return AtomicExpansionKind::Expand;
16883         }
16884       }
16885     }
16886 
16887     return AtomicExpansionKind::CmpXChg;
16888   }
16889   case AtomicRMWInst::FMin:
16890   case AtomicRMWInst::FMax: {
16891     Type *Ty = RMW->getType();
16892 
16893     // LDS float and double fmin/fmax were always supported.
16894     if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16895       return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16896                                                  : AtomicExpansionKind::CmpXChg;
16897     }
16898 
16899     if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16900       // For flat and global cases:
16901       // float, double in gfx7. Manual claims denormal support.
16902       // Removed in gfx8.
16903       // float, double restored in gfx10.
16904       // double removed again in gfx11, so only f32 for gfx11/gfx12.
16905       //
16906       // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16907       // no f32.
16908       if (AS == AMDGPUAS::FLAT_ADDRESS) {
16909         if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16910           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16911         if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16912           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16913       } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16914                  AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16915         if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16916           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16917         if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16918           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16919       }
16920     }
16921 
16922     return AtomicExpansionKind::CmpXChg;
16923   }
16924   case AtomicRMWInst::Min:
16925   case AtomicRMWInst::Max:
16926   case AtomicRMWInst::UMin:
16927   case AtomicRMWInst::UMax: {
16928     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16929         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16930       // Always expand system scope min/max atomics.
16931       if (HasSystemScope)
16932         return AtomicExpansionKind::CmpXChg;
16933     }
16934 
16935     return atomicSupportedIfLegalIntType(RMW);
16936   }
16937   case AtomicRMWInst::Nand:
16938   case AtomicRMWInst::FSub:
16939   default:
16940     return AtomicExpansionKind::CmpXChg;
16941   }
16942 
16943   llvm_unreachable("covered atomicrmw op switch");
16944 }
16945 
16946 TargetLowering::AtomicExpansionKind
16947 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16948   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16949              ? AtomicExpansionKind::NotAtomic
16950              : AtomicExpansionKind::None;
16951 }
16952 
16953 TargetLowering::AtomicExpansionKind
16954 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16955   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16956              ? AtomicExpansionKind::NotAtomic
16957              : AtomicExpansionKind::None;
16958 }
16959 
16960 TargetLowering::AtomicExpansionKind
16961 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16962   unsigned AddrSpace = CmpX->getPointerAddressSpace();
16963   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16964     return AtomicExpansionKind::NotAtomic;
16965 
16966   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16967     return AtomicExpansionKind::None;
16968 
16969   const DataLayout &DL = CmpX->getDataLayout();
16970 
16971   Type *ValTy = CmpX->getNewValOperand()->getType();
16972 
16973   // If a 64-bit flat atomic may alias private, we need to avoid using the
16974   // atomic in the private case.
16975   return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16976                                            : AtomicExpansionKind::None;
16977 }
16978 
16979 const TargetRegisterClass *
16980 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16981   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16982   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16983   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16984     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
16985                                  : &AMDGPU::SReg_32RegClass;
16986   if (!TRI->isSGPRClass(RC) && !isDivergent)
16987     return TRI->getEquivalentSGPRClass(RC);
16988   if (TRI->isSGPRClass(RC) && isDivergent)
16989     return TRI->getEquivalentVGPRClass(RC);
16990 
16991   return RC;
16992 }
16993 
16994 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
16995 // uniform values (as produced by the mask results of control flow intrinsics)
16996 // used outside of divergent blocks. The phi users need to also be treated as
16997 // always uniform.
16998 //
16999 // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
17000 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17001                       unsigned WaveSize) {
17002   // FIXME: We assume we never cast the mask results of a control flow
17003   // intrinsic.
17004   // Early exit if the type won't be consistent as a compile time hack.
17005   IntegerType *IT = dyn_cast<IntegerType>(V->getType());
17006   if (!IT || IT->getBitWidth() != WaveSize)
17007     return false;
17008 
17009   if (!isa<Instruction>(V))
17010     return false;
17011   if (!Visited.insert(V).second)
17012     return false;
17013   bool Result = false;
17014   for (const auto *U : V->users()) {
17015     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
17016       if (V == U->getOperand(1)) {
17017         switch (Intrinsic->getIntrinsicID()) {
17018         default:
17019           Result = false;
17020           break;
17021         case Intrinsic::amdgcn_if_break:
17022         case Intrinsic::amdgcn_if:
17023         case Intrinsic::amdgcn_else:
17024           Result = true;
17025           break;
17026         }
17027       }
17028       if (V == U->getOperand(0)) {
17029         switch (Intrinsic->getIntrinsicID()) {
17030         default:
17031           Result = false;
17032           break;
17033         case Intrinsic::amdgcn_end_cf:
17034         case Intrinsic::amdgcn_loop:
17035           Result = true;
17036           break;
17037         }
17038       }
17039     } else {
17040       Result = hasCFUser(U, Visited, WaveSize);
17041     }
17042     if (Result)
17043       break;
17044   }
17045   return Result;
17046 }
17047 
17048 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
17049                                                const Value *V) const {
17050   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
17051     if (CI->isInlineAsm()) {
17052       // FIXME: This cannot give a correct answer. This should only trigger in
17053       // the case where inline asm returns mixed SGPR and VGPR results, used
17054       // outside the defining block. We don't have a specific result to
17055       // consider, so this assumes if any value is SGPR, the overall register
17056       // also needs to be SGPR.
17057       const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17058       TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
17059           MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
17060       for (auto &TC : TargetConstraints) {
17061         if (TC.Type == InlineAsm::isOutput) {
17062           ComputeConstraintToUse(TC, SDValue());
17063           const TargetRegisterClass *RC =
17064               getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
17065                                            TC.ConstraintVT)
17066                   .second;
17067           if (RC && SIRI->isSGPRClass(RC))
17068             return true;
17069         }
17070       }
17071     }
17072   }
17073   SmallPtrSet<const Value *, 16> Visited;
17074   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
17075 }
17076 
17077 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
17078   for (SDUse &Use : N->uses()) {
17079     if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
17080       if (getBasePtrIndex(M) == Use.getOperandNo())
17081         return true;
17082     }
17083   }
17084   return false;
17085 }
17086 
17087 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17088                                            SDValue N1) const {
17089   if (!N0.hasOneUse())
17090     return false;
17091   // Take care of the opportunity to keep N0 uniform
17092   if (N0->isDivergent() || !N1->isDivergent())
17093     return true;
17094   // Check if we have a good chance to form the memory access pattern with the
17095   // base and offset
17096   return (DAG.isBaseWithConstantOffset(N0) &&
17097           hasMemSDNodeUser(*N0->user_begin()));
17098 }
17099 
17100 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17101                                            Register N0, Register N1) const {
17102   return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17103 }
17104 
17105 MachineMemOperand::Flags
17106 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17107   // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17108   MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17109   if (I.getMetadata("amdgpu.noclobber"))
17110     Flags |= MONoClobber;
17111   if (I.getMetadata("amdgpu.last.use"))
17112     Flags |= MOLastUse;
17113   return Flags;
17114 }
17115 
17116 bool SITargetLowering::checkForPhysRegDependency(
17117     SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17118     const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
17119   if (User->getOpcode() != ISD::CopyToReg)
17120     return false;
17121   if (!Def->isMachineOpcode())
17122     return false;
17123   MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17124   if (!MDef)
17125     return false;
17126 
17127   unsigned ResNo = User->getOperand(Op).getResNo();
17128   if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17129     return false;
17130   const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17131   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17132     PhysReg = AMDGPU::SCC;
17133     const TargetRegisterClass *RC =
17134         TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17135     Cost = RC->getCopyCost();
17136     return true;
17137   }
17138   return false;
17139 }
17140 
17141 /// Check if it is profitable to hoist instruction in then/else to if.
17142 bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
17143   if (!I->hasOneUse())
17144     return true;
17145 
17146   Instruction *User = I->user_back();
17147   // TODO: Add more patterns that are not profitable to hoist and
17148   // handle modifiers such as fabs and fneg
17149   switch (I->getOpcode()) {
17150   case Instruction::FMul: {
17151     if (User->getOpcode() != Instruction::FSub &&
17152         User->getOpcode() != Instruction::FAdd)
17153       return true;
17154 
17155     const TargetOptions &Options = getTargetMachine().Options;
17156 
17157     return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
17158             Options.AllowFPOpFusion != FPOpFusion::Fast &&
17159             !Options.UnsafeFPMath) ||
17160            !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
17161   }
17162   default:
17163     return true;
17164   }
17165   return true;
17166 }
17167 
17168 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17169     Instruction *AI) const {
17170   // Given: atomicrmw fadd ptr %addr, float %val ordering
17171   //
17172   // With this expansion we produce the following code:
17173   //   [...]
17174   //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17175   //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17176   //
17177   // atomicrmw.shared:
17178   //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17179   //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17180   //                                   float %val ordering
17181   //   br label %atomicrmw.phi
17182   //
17183   // atomicrmw.check.private:
17184   //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17185   //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17186   //
17187   // atomicrmw.private:
17188   //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17189   //   %loaded.private = load float, ptr addrspace(5) %cast.private
17190   //   %val.new = fadd float %loaded.private, %val
17191   //   store float %val.new, ptr addrspace(5) %cast.private
17192   //   br label %atomicrmw.phi
17193   //
17194   // atomicrmw.global:
17195   //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17196   //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17197   //                                   float %val ordering
17198   //   br label %atomicrmw.phi
17199   //
17200   // atomicrmw.phi:
17201   //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17202   //                           [ %loaded.private, %atomicrmw.private ],
17203   //                           [ %loaded.global, %atomicrmw.global ]
17204   //   br label %atomicrmw.end
17205   //
17206   // atomicrmw.end:
17207   //    [...]
17208   //
17209   //
17210   // For 64-bit atomics which may reside in private memory, we perform a simpler
17211   // version that only inserts the private check, and uses the flat operation.
17212 
17213   IRBuilder<> Builder(AI);
17214   LLVMContext &Ctx = Builder.getContext();
17215 
17216   auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17217   const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17218                                 : AtomicCmpXchgInst::getPointerOperandIndex();
17219   Value *Addr = AI->getOperand(PtrOpIdx);
17220 
17221   /// TODO: Only need to check private, then emit flat-known-not private (no
17222   /// need for shared block, or cast to global).
17223   AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17224 
17225   Align Alignment;
17226   if (RMW)
17227     Alignment = RMW->getAlign();
17228   else if (CX)
17229     Alignment = CX->getAlign();
17230   else
17231     llvm_unreachable("unhandled atomic operation");
17232 
17233   // FullFlatEmulation is true if we need to issue the private, shared, and
17234   // global cases.
17235   //
17236   // If this is false, we are only dealing with the flat-targeting-private case,
17237   // where we only insert a check for private and still use the flat instruction
17238   // for global and shared.
17239 
17240   bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17241                            Subtarget->hasAtomicFaddInsts() &&
17242                            RMW->getType()->isFloatTy();
17243 
17244   // If the return value isn't used, do not introduce a false use in the phi.
17245   bool ReturnValueIsUsed = !AI->use_empty();
17246 
17247   BasicBlock *BB = Builder.GetInsertBlock();
17248   Function *F = BB->getParent();
17249   BasicBlock *ExitBB =
17250       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17251   BasicBlock *SharedBB = nullptr;
17252 
17253   BasicBlock *CheckPrivateBB = BB;
17254   if (FullFlatEmulation) {
17255     SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17256     CheckPrivateBB =
17257         BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17258   }
17259 
17260   BasicBlock *PrivateBB =
17261       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17262   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17263   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17264 
17265   std::prev(BB->end())->eraseFromParent();
17266   Builder.SetInsertPoint(BB);
17267 
17268   Value *LoadedShared = nullptr;
17269   if (FullFlatEmulation) {
17270     CallInst *IsShared = Builder.CreateIntrinsic(
17271         Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
17272     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17273     Builder.SetInsertPoint(SharedBB);
17274     Value *CastToLocal = Builder.CreateAddrSpaceCast(
17275         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17276 
17277     Instruction *Clone = AI->clone();
17278     Clone->insertInto(SharedBB, SharedBB->end());
17279     Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17280     LoadedShared = Clone;
17281 
17282     Builder.CreateBr(PhiBB);
17283     Builder.SetInsertPoint(CheckPrivateBB);
17284   }
17285 
17286   CallInst *IsPrivate = Builder.CreateIntrinsic(
17287       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
17288   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17289 
17290   Builder.SetInsertPoint(PrivateBB);
17291 
17292   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17293       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17294 
17295   Value *LoadedPrivate;
17296   if (RMW) {
17297     LoadedPrivate = Builder.CreateAlignedLoad(
17298         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17299 
17300     Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17301                                         LoadedPrivate, RMW->getValOperand());
17302 
17303     Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17304   } else {
17305     auto [ResultLoad, Equal] =
17306         buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17307                           CX->getNewValOperand(), CX->getAlign());
17308 
17309     Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17310                                               ResultLoad, 0);
17311     LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17312   }
17313 
17314   Builder.CreateBr(PhiBB);
17315 
17316   Builder.SetInsertPoint(GlobalBB);
17317 
17318   // Continue using a flat instruction if we only emitted the check for private.
17319   Instruction *LoadedGlobal = AI;
17320   if (FullFlatEmulation) {
17321     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17322         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17323     AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17324   }
17325 
17326   AI->removeFromParent();
17327   AI->insertInto(GlobalBB, GlobalBB->end());
17328 
17329   // The new atomicrmw may go through another round of legalization later.
17330   if (!FullFlatEmulation) {
17331     // We inserted the runtime check already, make sure we do not try to
17332     // re-expand this.
17333     // TODO: Should union with any existing metadata.
17334     MDBuilder MDB(F->getContext());
17335     MDNode *RangeNotPrivate =
17336         MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17337                         APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17338     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17339                               RangeNotPrivate);
17340   }
17341 
17342   Builder.CreateBr(PhiBB);
17343 
17344   Builder.SetInsertPoint(PhiBB);
17345 
17346   if (ReturnValueIsUsed) {
17347     PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17348     AI->replaceAllUsesWith(Loaded);
17349     if (FullFlatEmulation)
17350       Loaded->addIncoming(LoadedShared, SharedBB);
17351     Loaded->addIncoming(LoadedPrivate, PrivateBB);
17352     Loaded->addIncoming(LoadedGlobal, GlobalBB);
17353     Loaded->takeName(AI);
17354   }
17355 
17356   Builder.CreateBr(ExitBB);
17357 }
17358 
17359 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17360   AtomicRMWInst::BinOp Op = AI->getOperation();
17361 
17362   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17363       Op == AtomicRMWInst::Xor) {
17364     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17365         ConstVal && ConstVal->isNullValue()) {
17366       // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17367       AI->setOperation(AtomicRMWInst::Add);
17368 
17369       // We may still need the private-alias-flat handling below.
17370 
17371       // TODO: Skip this for cases where we cannot access remote memory.
17372     }
17373   }
17374 
17375   // The non-flat expansions should only perform the de-canonicalization of
17376   // identity values.
17377   if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17378     return;
17379 
17380   emitExpandAtomicAddrSpacePredicate(AI);
17381 }
17382 
17383 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17384   emitExpandAtomicAddrSpacePredicate(CI);
17385 }
17386 
17387 LoadInst *
17388 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17389   IRBuilder<> Builder(AI);
17390   auto Order = AI->getOrdering();
17391 
17392   // The optimization removes store aspect of the atomicrmw. Therefore, cache
17393   // must be flushed if the atomic ordering had a release semantics. This is
17394   // not necessary a fence, a release fence just coincides to do that flush.
17395   // Avoid replacing of an atomicrmw with a release semantics.
17396   if (isReleaseOrStronger(Order))
17397     return nullptr;
17398 
17399   LoadInst *LI = Builder.CreateAlignedLoad(
17400       AI->getType(), AI->getPointerOperand(), AI->getAlign());
17401   LI->setAtomic(Order, AI->getSyncScopeID());
17402   LI->copyMetadata(*AI);
17403   LI->takeName(AI);
17404   AI->replaceAllUsesWith(LI);
17405   AI->eraseFromParent();
17406   return LI;
17407 }
17408