xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 4a486e773e0ef1add4515ee47b038c274ced2e76)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUMemoryUtils.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/ScopeExit.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47   "amdgpu-global-isel-new-legality",
48   cl::desc("Use GlobalISel desired legality, rather than try to use "
49            "rules compatible with selection patterns"),
50   cl::init(false),
51   cl::ReallyHidden);
52 
53 static constexpr unsigned MaxRegisterSize = 1024;
54 
55 // Round the number of vector elements up to the next power of two.
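// e.g. a <3 x s16> operand becomes <4 x s16>, and <5 x s32> becomes <8 x s32>.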
56 static LLT getPow2VectorType(LLT Ty) {
57   unsigned NElts = Ty.getNumElements();
58   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
61 
62 // Round the scalar size in bits up to the next power of two.
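// e.g. s24 becomes s32 and s48 becomes s64.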
63 static LLT getPow2ScalarType(LLT Ty) {
64   unsigned Bits = Ty.getSizeInBits();
65   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
66   return LLT::scalar(Pow2Bits);
67 }
68 
69 /// \returns true if this is an odd-sized vector which should be widened by
70 /// adding an additional element. This is mostly to handle <3 x s16> ->
71 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     if (!Ty.isVector())
76       return false;
77 
78     const LLT EltTy = Ty.getElementType();
79     const unsigned EltSize = EltTy.getSizeInBits();
80     return Ty.getNumElements() % 2 != 0 &&
81            EltSize > 1 && EltSize < 32 &&
82            Ty.getSizeInBits() % 32 != 0;
83   };
84 }
85 
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     return Ty.getSizeInBits() % 32 == 0;
90   };
91 }
92 
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getScalarType();
97     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98   };
99 }
100 
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     return std::pair(TypeIdx,
106                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107   };
108 }
109 
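// Reduce the element count so each resulting piece fits in 64 bits; used with
// fewerElementsIf on vectors wider than 64 bits, e.g. <4 x s32> -> <2 x s32>.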
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111   return [=](const LegalityQuery &Query) {
112     const LLT Ty = Query.Types[TypeIdx];
113     const LLT EltTy = Ty.getElementType();
114     unsigned Size = Ty.getSizeInBits();
115     unsigned Pieces = (Size + 63) / 64;
116     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117     return std::pair(TypeIdx, LLT::scalarOrVector(
118                                   ElementCount::getFixed(NewNumElts), EltTy));
119   };
120 }
121 
122 // Increase the number of vector elements so that the total size reaches the
123 // next multiple of 32 bits.
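// e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and <5 x s16> (80 bits)
// becomes <6 x s16> (96 bits).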
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125   return [=](const LegalityQuery &Query) {
126     const LLT Ty = Query.Types[TypeIdx];
127 
128     const LLT EltTy = Ty.getElementType();
129     const int Size = Ty.getSizeInBits();
130     const int EltSize = EltTy.getSizeInBits();
131     const int NextMul32 = (Size + 31) / 32;
132 
133     assert(EltSize < 32);
134 
135     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137   };
138 }
139 
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142   return [=](const LegalityQuery &Query) {
143     const LLT Ty = Query.Types[TypeIdx];
144     const unsigned NumElts = Ty.getNumElements();
145     const unsigned EltSize = Ty.getElementType().getSizeInBits();
146     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147 
148     assert(EltSize == 32 || EltSize == 64);
149     assert(Ty.getSizeInBits() < MaxRegisterSize);
150 
151     unsigned NewNumElts;
152     // Find the nearest legal RegClass that is larger than the current type.
153     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155         break;
156     }
157     return std::pair(TypeIdx,
158                      LLT::fixed_vector(NewNumElts, Ty.getElementType()));
159   };
160 }
161 
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163   if (!Ty.isVector())
164     return LLT::scalar(128);
165   const ElementCount NumElems = Ty.getElementCount();
166   return LLT::vector(NumElems, LLT::scalar(128));
167 }
168 
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170   if (!Ty.isVector())
171     return LLT::fixed_vector(4, LLT::scalar(32));
172   const unsigned NumElems = Ty.getElementCount().getFixedValue();
173   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
175 
176 static LLT getBitcastRegisterType(const LLT Ty) {
177   const unsigned Size = Ty.getSizeInBits();
178 
179   if (Size <= 32) {
180     // <2 x s8> -> s16
181     // <4 x s8> -> s32
182     return LLT::scalar(Size);
183   }
184 
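  // Larger register-sized types become a vector of 32-bit pieces,
  // e.g. <6 x s16> (96 bits) -> <3 x s32> and <8 x s8> (64 bits) -> <2 x s32>.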
185   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
187 
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189   return [=](const LegalityQuery &Query) {
190     const LLT Ty = Query.Types[TypeIdx];
191     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192   };
193 }
194 
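// Bitcast a type whose size is a multiple of 32 bits to 32-bit elements,
// e.g. s96 and <6 x s16> both become <3 x s32>, while s32 stays s32.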
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196   return [=](const LegalityQuery &Query) {
197     const LLT Ty = Query.Types[TypeIdx];
198     unsigned Size = Ty.getSizeInBits();
199     assert(Size % 32 == 0);
200     return std::pair(
201         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202   };
203 }
204 
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206   return [=](const LegalityQuery &Query) {
207     const LLT QueryTy = Query.Types[TypeIdx];
208     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209   };
210 }
211 
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213   return [=](const LegalityQuery &Query) {
214     const LLT QueryTy = Query.Types[TypeIdx];
215     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216   };
217 }
218 
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223   };
224 }
225 
226 static bool isRegisterSize(unsigned Size) {
227   return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229 
230 static bool isRegisterVectorElementType(LLT EltTy) {
231   const int EltSize = EltTy.getSizeInBits();
232   return EltSize == 16 || EltSize % 32 == 0;
233 }
234 
235 static bool isRegisterVectorType(LLT Ty) {
236   const int EltSize = Ty.getElementType().getSizeInBits();
237   return EltSize == 32 || EltSize == 64 ||
238          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239          EltSize == 128 || EltSize == 256;
240 }
241 
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
243 static bool isRegisterType(LLT Ty) {
244   if (!isRegisterSize(Ty.getSizeInBits()))
245     return false;
246 
247   if (Ty.isVector())
248     return isRegisterVectorType(Ty);
249 
250   return true;
251 }
252 
253 // Any combination of 32 or 64-bit elements up to the maximum register size,
254 // and multiples of v2s16.
255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256   return [=](const LegalityQuery &Query) {
257     return isRegisterType(Query.Types[TypeIdx]);
258   };
259 }
260 
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263 // should be removed.
264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265   return [=](const LegalityQuery &Query) {
266     LLT Ty = Query.Types[TypeIdx];
267     return isRegisterType(Ty) &&
268            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269   };
270 }
271 
272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273   return [=](const LegalityQuery &Query) {
274     const LLT QueryTy = Query.Types[TypeIdx];
275     if (!QueryTy.isVector())
276       return false;
277     const LLT EltTy = QueryTy.getElementType();
278     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279   };
280 }
281 
282 static const LLT S1 = LLT::scalar(1);
283 static const LLT S8 = LLT::scalar(8);
284 static const LLT S16 = LLT::scalar(16);
285 static const LLT S32 = LLT::scalar(32);
286 static const LLT F32 = LLT::float32();
287 static const LLT S64 = LLT::scalar(64);
288 static const LLT F64 = LLT::float64();
289 static const LLT S96 = LLT::scalar(96);
290 static const LLT S128 = LLT::scalar(128);
291 static const LLT S160 = LLT::scalar(160);
292 static const LLT S192 = LLT::scalar(192);
293 static const LLT S224 = LLT::scalar(224);
294 static const LLT S256 = LLT::scalar(256);
295 static const LLT S512 = LLT::scalar(512);
296 static const LLT S1024 = LLT::scalar(1024);
297 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
298 
299 static const LLT V2S8 = LLT::fixed_vector(2, 8);
300 static const LLT V2S16 = LLT::fixed_vector(2, 16);
301 static const LLT V4S16 = LLT::fixed_vector(4, 16);
302 static const LLT V6S16 = LLT::fixed_vector(6, 16);
303 static const LLT V8S16 = LLT::fixed_vector(8, 16);
304 static const LLT V10S16 = LLT::fixed_vector(10, 16);
305 static const LLT V12S16 = LLT::fixed_vector(12, 16);
306 static const LLT V16S16 = LLT::fixed_vector(16, 16);
307 
308 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
309 static const LLT V2BF16 = V2F16; // FIXME
310 
311 static const LLT V2S32 = LLT::fixed_vector(2, 32);
312 static const LLT V3S32 = LLT::fixed_vector(3, 32);
313 static const LLT V4S32 = LLT::fixed_vector(4, 32);
314 static const LLT V5S32 = LLT::fixed_vector(5, 32);
315 static const LLT V6S32 = LLT::fixed_vector(6, 32);
316 static const LLT V7S32 = LLT::fixed_vector(7, 32);
317 static const LLT V8S32 = LLT::fixed_vector(8, 32);
318 static const LLT V9S32 = LLT::fixed_vector(9, 32);
319 static const LLT V10S32 = LLT::fixed_vector(10, 32);
320 static const LLT V11S32 = LLT::fixed_vector(11, 32);
321 static const LLT V12S32 = LLT::fixed_vector(12, 32);
322 static const LLT V16S32 = LLT::fixed_vector(16, 32);
323 static const LLT V32S32 = LLT::fixed_vector(32, 32);
324 
325 static const LLT V2S64 = LLT::fixed_vector(2, 64);
326 static const LLT V3S64 = LLT::fixed_vector(3, 64);
327 static const LLT V4S64 = LLT::fixed_vector(4, 64);
328 static const LLT V5S64 = LLT::fixed_vector(5, 64);
329 static const LLT V6S64 = LLT::fixed_vector(6, 64);
330 static const LLT V7S64 = LLT::fixed_vector(7, 64);
331 static const LLT V8S64 = LLT::fixed_vector(8, 64);
332 static const LLT V16S64 = LLT::fixed_vector(16, 64);
333 
334 static const LLT V2S128 = LLT::fixed_vector(2, 128);
335 static const LLT V4S128 = LLT::fixed_vector(4, 128);
336 
337 static std::initializer_list<LLT> AllScalarTypes = {
338     S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
339 
340 static std::initializer_list<LLT> AllS16Vectors{
341     V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
342 
343 static std::initializer_list<LLT> AllS32Vectors = {
344     V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32, V8S32,
345     V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
346 
347 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
348                                                    V6S64, V7S64, V8S64, V16S64};
349 
350 // Checks whether a type is in the list of legal register types.
351 static bool isRegisterClassType(LLT Ty) {
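  // Pointers and pointer vectors are checked as integers of the same total
  // size, e.g. a 64-bit pointer is treated as s64 here.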
352   if (Ty.isPointerOrPointerVector())
353     Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
354 
355   return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
356          is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
357 }
358 
359 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
360   return [TypeIdx](const LegalityQuery &Query) {
361     return isRegisterClassType(Query.Types[TypeIdx]);
362   };
363 }
364 
365 // If we have a truncating store or an extending load with a data size larger
366 // than 32-bits, we need to reduce to a 32-bit type.
367 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
368   return [=](const LegalityQuery &Query) {
369     const LLT Ty = Query.Types[TypeIdx];
370     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
371            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
372   };
373 }
374 
375 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
376 // handle some operations by just promoting the register during
377 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
378 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
379                                     bool IsLoad, bool IsAtomic) {
380   switch (AS) {
381   case AMDGPUAS::PRIVATE_ADDRESS:
382     // FIXME: Private element size.
383     return ST.enableFlatScratch() ? 128 : 32;
384   case AMDGPUAS::LOCAL_ADDRESS:
385     return ST.useDS128() ? 128 : 64;
386   case AMDGPUAS::GLOBAL_ADDRESS:
387   case AMDGPUAS::CONSTANT_ADDRESS:
388   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
389   case AMDGPUAS::BUFFER_RESOURCE:
390     // Treat constant and global as identical. SMRD loads are sometimes usable for
391     // global loads (ideally constant address space should be eliminated)
392     // depending on the context. Legality cannot be context dependent, but
393     // RegBankSelect can split the load as necessary depending on the pointer
394     // register bank/uniformity and if the memory is invariant or not written in a
395     // kernel.
396     return IsLoad ? 512 : 128;
397   default:
398     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
399     // if they may alias scratch depending on the subtarget.  This needs to be
400     // moved to custom handling to use addressMayBeAccessedAsPrivate
401     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
402   }
403 }
404 
405 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
406                                  const LegalityQuery &Query) {
407   const LLT Ty = Query.Types[0];
408 
409   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
410   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
411 
412   unsigned RegSize = Ty.getSizeInBits();
413   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
414   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
415   unsigned AS = Query.Types[1].getAddressSpace();
416 
417   // All of these need to be custom lowered to cast the pointer operand.
418   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
419     return false;
420 
421   // Do not handle extending vector loads.
422   if (Ty.isVector() && MemSize != RegSize)
423     return false;
424 
425   // TODO: We should be able to widen loads if the alignment is high enough, but
426   // we also need to modify the memory access size.
427 #if 0
428   // Accept widening loads based on alignment.
429   if (IsLoad && MemSize < Size)
430     MemSize = std::max(MemSize, Align);
431 #endif
432 
433   // Only 1-byte and 2-byte to 32-bit extloads are valid.
434   if (MemSize != RegSize && RegSize != 32)
435     return false;
436 
437   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
438                                     Query.MMODescrs[0].Ordering !=
439                                         AtomicOrdering::NotAtomic))
440     return false;
441 
442   switch (MemSize) {
443   case 8:
444   case 16:
445   case 32:
446   case 64:
447   case 128:
448     break;
449   case 96:
450     if (!ST.hasDwordx3LoadStores())
451       return false;
452     break;
453   case 256:
454   case 512:
455     // These may contextually need to be broken down.
456     break;
457   default:
458     return false;
459   }
460 
461   assert(RegSize >= MemSize);
462 
463   if (AlignBits < MemSize) {
464     const SITargetLowering *TLI = ST.getTargetLowering();
465     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
466                                                  Align(AlignBits / 8)))
467       return false;
468   }
469 
470   return true;
471 }
472 
473 // The newer buffer intrinsic forms take their resource arguments as
474 // pointers in address space 8, aka s128 values. However, in order to not break
475 // SelectionDAG, the underlying operations have to continue to take v4i32
476 // arguments. Therefore, we convert resource pointers (or vectors of them)
477 // to integer values here.
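// e.g. a p8 operand is handled as <4 x s32> and a <2 x p8> operand as <8 x s32>.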
478 static bool hasBufferRsrcWorkaround(const LLT Ty) {
479   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
480     return true;
481   if (Ty.isVector()) {
482     const LLT ElemTy = Ty.getElementType();
483     return hasBufferRsrcWorkaround(ElemTy);
484   }
485   return false;
486 }
487 
488 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
489 // work around this. Eventually it should ignore the type for loads and only
490 // care about the size. Return true in cases where we will work around this
491 // for now by bitcasting.
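// e.g. an <8 x s16> load is bitcast to <4 x s32> before selection.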
492 static bool loadStoreBitcastWorkaround(const LLT Ty) {
493   if (EnableNewLegality)
494     return false;
495 
496   const unsigned Size = Ty.getSizeInBits();
497   if (Ty.isPointerVector())
498     return true;
499   if (Size <= 64)
500     return false;
501   // Address space 8 pointers get their own workaround.
502   if (hasBufferRsrcWorkaround(Ty))
503     return false;
504   if (!Ty.isVector())
505     return true;
506 
507   unsigned EltSize = Ty.getScalarSizeInBits();
508   return EltSize != 32 && EltSize != 64;
509 }
510 
511 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
512   const LLT Ty = Query.Types[0];
513   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
514          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
515 }
516 
517 /// Return true if a load or store of the type should be lowered with a bitcast
518 /// to a different type.
519 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
520                                        const LLT MemTy) {
521   const unsigned MemSizeInBits = MemTy.getSizeInBits();
522   const unsigned Size = Ty.getSizeInBits();
523   if (Size != MemSizeInBits)
524     return Size <= 32 && Ty.isVector();
525 
526   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
527     return true;
528 
529   // Don't try to handle bitcasting vector ext loads for now.
530   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
531          (Size <= 32 || isRegisterSize(Size)) &&
532          !isRegisterVectorElementType(Ty.getElementType());
533 }
534 
535 /// Return true if we should legalize a load by widening an odd-sized memory
536 /// access up to the alignment. Note that in this case the memory access itself
537 /// changes, not the size of the result register.
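/// e.g. a 96-bit load known to be 128-bit aligned may be widened to a 128-bit
/// load on subtargets without 96-bit (dwordx3) memory operations.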
538 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
539                             uint64_t AlignInBits, unsigned AddrSpace,
540                             unsigned Opcode) {
541   unsigned SizeInBits = MemoryTy.getSizeInBits();
542   // We don't want to widen cases that are naturally legal.
543   if (isPowerOf2_32(SizeInBits))
544     return false;
545 
546   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
547   // end up widening these for a scalar load during RegBankSelect, if we don't
548   // have 96-bit scalar loads.
549   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
550     return false;
551 
552   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
553     return false;
554 
555   // A load is known dereferenceable up to the alignment, so it's legal to widen
556   // to it.
557   //
558   // TODO: Could check dereferenceable for less aligned cases.
559   unsigned RoundedSize = NextPowerOf2(SizeInBits);
560   if (AlignInBits < RoundedSize)
561     return false;
562 
563   // Do not widen if it would introduce a slow unaligned load.
564   const SITargetLowering *TLI = ST.getTargetLowering();
565   unsigned Fast = 0;
566   return TLI->allowsMisalignedMemoryAccessesImpl(
567              RoundedSize, AddrSpace, Align(AlignInBits / 8),
568              MachineMemOperand::MOLoad, &Fast) &&
569          Fast;
570 }
571 
572 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
573                             unsigned Opcode) {
574   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
575     return false;
576 
577   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
578                          Query.MMODescrs[0].AlignInBits,
579                          Query.Types[1].getAddressSpace(), Opcode);
580 }
581 
582 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
583 /// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
584 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
585 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
586                                    MachineRegisterInfo &MRI, unsigned Idx) {
587   MachineOperand &MO = MI.getOperand(Idx);
588 
589   const LLT PointerTy = MRI.getType(MO.getReg());
590 
591   // Paranoidly prevent us from doing this multiple times.
592   if (!hasBufferRsrcWorkaround(PointerTy))
593     return PointerTy;
594 
595   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
596   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
597   if (!PointerTy.isVector()) {
598     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
599     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
600     const LLT S32 = LLT::scalar(32);
601 
602     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
603     std::array<Register, 4> VectorElems;
604     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
605     for (unsigned I = 0; I < NumParts; ++I)
606       VectorElems[I] =
607           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
608     B.buildMergeValues(MO, VectorElems);
609     MO.setReg(VectorReg);
610     return VectorTy;
611   }
612   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
613   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
614   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
615   B.buildIntToPtr(MO, Scalar);
616   MO.setReg(BitcastReg);
617 
618   return VectorTy;
619 }
620 
621 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
622 /// the form in which the value must be in order to be passed to the low-level
623 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
624 /// needed in order to account for the fact that we can't define a register
625 /// class for s128 without breaking SelectionDAG.
626 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
627   MachineRegisterInfo &MRI = *B.getMRI();
628   const LLT PointerTy = MRI.getType(Pointer);
629   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 
632   if (!PointerTy.isVector()) {
633     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
634     SmallVector<Register, 4> PointerParts;
635     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
636     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
637     for (unsigned I = 0; I < NumParts; ++I)
638       PointerParts.push_back(Unmerged.getReg(I));
639     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
640   }
641   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
642   return B.buildBitcast(VectorTy, Scalar).getReg(0);
643 }
644 
645 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
646                                      unsigned Idx) {
647   MachineOperand &MO = MI.getOperand(Idx);
648 
649   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
650   // Paranoidly prevent us from doing this multiple times.
651   if (!hasBufferRsrcWorkaround(PointerTy))
652     return;
653   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
654 }
655 
656 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
657                                          const GCNTargetMachine &TM)
658   :  ST(ST_) {
659   using namespace TargetOpcode;
660 
661   auto GetAddrSpacePtr = [&TM](unsigned AS) {
662     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
663   };
664 
665   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
666   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
667   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
668   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
669   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
670   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
671   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
672   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
673   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
674   const LLT BufferStridedPtr =
675       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
676 
677   const LLT CodePtr = FlatPtr;
678 
679   const std::initializer_list<LLT> AddrSpaces64 = {
680     GlobalPtr, ConstantPtr, FlatPtr
681   };
682 
683   const std::initializer_list<LLT> AddrSpaces32 = {
684     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
685   };
686 
687   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
688 
689   const std::initializer_list<LLT> FPTypesBase = {
690     S32, S64
691   };
692 
693   const std::initializer_list<LLT> FPTypes16 = {
694     S32, S64, S16
695   };
696 
697   const std::initializer_list<LLT> FPTypesPK16 = {
698     S32, S64, S16, V2S16
699   };
700 
701   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
702 
703   // s1 for VCC branches, s32 for SCC branches.
704   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
705 
706   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
707   // elements for v3s16
708   getActionDefinitionsBuilder(G_PHI)
709       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
710       .legalFor(AllS32Vectors)
711       .legalFor(AllS64Vectors)
712       .legalFor(AddrSpaces64)
713       .legalFor(AddrSpaces32)
714       .legalFor(AddrSpaces128)
715       .legalIf(isPointer(0))
716       .clampScalar(0, S16, S256)
717       .widenScalarToNextPow2(0, 32)
718       .clampMaxNumElements(0, S32, 16)
719       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
720       .scalarize(0);
721 
722   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
723     // Full set of gfx9 features.
724     if (ST.hasScalarAddSub64()) {
725       getActionDefinitionsBuilder({G_ADD, G_SUB})
726           .legalFor({S64, S32, S16, V2S16})
727           .clampMaxNumElementsStrict(0, S16, 2)
728           .scalarize(0)
729           .minScalar(0, S16)
730           .widenScalarToNextMultipleOf(0, 32)
731           .maxScalar(0, S32);
732     } else {
733       getActionDefinitionsBuilder({G_ADD, G_SUB})
734           .legalFor({S32, S16, V2S16})
735           .clampMaxNumElementsStrict(0, S16, 2)
736           .scalarize(0)
737           .minScalar(0, S16)
738           .widenScalarToNextMultipleOf(0, 32)
739           .maxScalar(0, S32);
740     }
741 
742     if (ST.hasScalarSMulU64()) {
743       getActionDefinitionsBuilder(G_MUL)
744           .legalFor({S64, S32, S16, V2S16})
745           .clampMaxNumElementsStrict(0, S16, 2)
746           .scalarize(0)
747           .minScalar(0, S16)
748           .widenScalarToNextMultipleOf(0, 32)
749           .custom();
750     } else {
751       getActionDefinitionsBuilder(G_MUL)
752           .legalFor({S32, S16, V2S16})
753           .clampMaxNumElementsStrict(0, S16, 2)
754           .scalarize(0)
755           .minScalar(0, S16)
756           .widenScalarToNextMultipleOf(0, 32)
757           .custom();
758     }
759     assert(ST.hasMad64_32());
760 
761     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
762       .legalFor({S32, S16, V2S16}) // Clamp modifier
763       .minScalarOrElt(0, S16)
764       .clampMaxNumElementsStrict(0, S16, 2)
765       .scalarize(0)
766       .widenScalarToNextPow2(0, 32)
767       .lower();
768   } else if (ST.has16BitInsts()) {
769     getActionDefinitionsBuilder({G_ADD, G_SUB})
770       .legalFor({S32, S16})
771       .minScalar(0, S16)
772       .widenScalarToNextMultipleOf(0, 32)
773       .maxScalar(0, S32)
774       .scalarize(0);
775 
776     getActionDefinitionsBuilder(G_MUL)
777       .legalFor({S32, S16})
778       .scalarize(0)
779       .minScalar(0, S16)
780       .widenScalarToNextMultipleOf(0, 32)
781       .custom();
782     assert(ST.hasMad64_32());
783 
784     // Technically the saturating operations require clamp bit support, but this
785     // was introduced at the same time as 16-bit operations.
786     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
787       .legalFor({S32, S16}) // Clamp modifier
788       .minScalar(0, S16)
789       .scalarize(0)
790       .widenScalarToNextPow2(0, 16)
791       .lower();
792 
793     // We're just lowering this, but it helps get a better result to try to
794     // coerce to the desired type first.
795     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796       .minScalar(0, S16)
797       .scalarize(0)
798       .lower();
799   } else {
800     getActionDefinitionsBuilder({G_ADD, G_SUB})
801       .legalFor({S32})
802       .widenScalarToNextMultipleOf(0, 32)
803       .clampScalar(0, S32, S32)
804       .scalarize(0);
805 
806     auto &Mul = getActionDefinitionsBuilder(G_MUL)
807       .legalFor({S32})
808       .scalarize(0)
809       .minScalar(0, S32)
810       .widenScalarToNextMultipleOf(0, 32);
811 
812     if (ST.hasMad64_32())
813       Mul.custom();
814     else
815       Mul.maxScalar(0, S32);
816 
817     if (ST.hasIntClamp()) {
818       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
819         .legalFor({S32}) // Clamp modifier.
820         .scalarize(0)
821         .minScalarOrElt(0, S32)
822         .lower();
823     } else {
824       // Clamp bit support was added in VI, along with 16-bit operations.
825       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
826         .minScalar(0, S32)
827         .scalarize(0)
828         .lower();
829     }
830 
831     // FIXME: DAG expansion gets better results. The widening uses the smaller
832     // range values and goes for the min/max lowering directly.
833     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
834       .minScalar(0, S32)
835       .scalarize(0)
836       .lower();
837   }
838 
839   getActionDefinitionsBuilder(
840       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
841       .customFor({S32, S64})
842       .clampScalar(0, S32, S64)
843       .widenScalarToNextPow2(0, 32)
844       .scalarize(0);
845 
846   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
847                    .legalFor({S32})
848                    .maxScalar(0, S32);
849 
850   if (ST.hasVOP3PInsts()) {
851     Mulh
852       .clampMaxNumElements(0, S8, 2)
853       .lowerFor({V2S8});
854   }
855 
856   Mulh
857     .scalarize(0)
858     .lower();
859 
860   // Report legal for any types we can handle anywhere. For the cases only legal
861   // on the SALU, RegBankSelect will be able to re-legalize.
862   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
863     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
864     .clampScalar(0, S32, S64)
865     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
866     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
867     .widenScalarToNextPow2(0)
868     .scalarize(0);
869 
870   getActionDefinitionsBuilder(
871       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
872       .legalFor({{S32, S1}, {S32, S32}})
873       .clampScalar(0, S32, S32)
874       .scalarize(0);
875 
876   getActionDefinitionsBuilder(G_BITCAST)
877       // Don't worry about the size constraint.
878       .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
879       .lower();
880 
881   getActionDefinitionsBuilder(G_CONSTANT)
882     .legalFor({S1, S32, S64, S16, GlobalPtr,
883                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
884     .legalIf(isPointer(0))
885     .clampScalar(0, S32, S64)
886     .widenScalarToNextPow2(0);
887 
888   getActionDefinitionsBuilder(G_FCONSTANT)
889     .legalFor({S32, S64, S16})
890     .clampScalar(0, S16, S64);
891 
892   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
893       .legalIf(isRegisterClassType(0))
894       // s1 and s16 are special cases because they have legal operations on
895       // them, but don't really occupy registers in the normal way.
896       .legalFor({S1, S16})
897       .clampNumElements(0, V16S32, V32S32)
898       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
899       .clampScalarOrElt(0, S32, MaxScalar)
900       .widenScalarToNextPow2(0, 32)
901       .clampMaxNumElements(0, S32, 16);
902 
903   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
904 
905   // If the amount is divergent, we have to do a wave reduction to get the
906   // maximum value, so this is expanded during RegBankSelect.
907   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
908     .legalFor({{PrivatePtr, S32}});
909 
910   getActionDefinitionsBuilder(G_STACKSAVE)
911     .customFor({PrivatePtr});
912   getActionDefinitionsBuilder(G_STACKRESTORE)
913     .legalFor({PrivatePtr});
914 
915   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
916 
917   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
918     .customIf(typeIsNot(0, PrivatePtr));
919 
920   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
921 
922   auto &FPOpActions = getActionDefinitionsBuilder(
923     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
924       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
925     .legalFor({S32, S64});
926   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
927     .customFor({S32, S64});
928   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
929     .customFor({S32, S64});
930 
931   if (ST.has16BitInsts()) {
932     if (ST.hasVOP3PInsts())
933       FPOpActions.legalFor({S16, V2S16});
934     else
935       FPOpActions.legalFor({S16});
936 
937     TrigActions.customFor({S16});
938     FDIVActions.customFor({S16});
939   }
940 
941   if (ST.hasPackedFP32Ops()) {
942     FPOpActions.legalFor({V2S32});
943     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
944   }
945 
946   auto &MinNumMaxNum = getActionDefinitionsBuilder({
947       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
948 
949   if (ST.hasVOP3PInsts()) {
950     MinNumMaxNum.customFor(FPTypesPK16)
951       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
952       .clampMaxNumElements(0, S16, 2)
953       .clampScalar(0, S16, S64)
954       .scalarize(0);
955   } else if (ST.has16BitInsts()) {
956     MinNumMaxNum.customFor(FPTypes16)
957       .clampScalar(0, S16, S64)
958       .scalarize(0);
959   } else {
960     MinNumMaxNum.customFor(FPTypesBase)
961       .clampScalar(0, S32, S64)
962       .scalarize(0);
963   }
964 
965   if (ST.hasVOP3PInsts())
966     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
967 
968   FPOpActions
969     .scalarize(0)
970     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
971 
972   TrigActions
973     .scalarize(0)
974     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
975 
976   FDIVActions
977     .scalarize(0)
978     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
979 
980   getActionDefinitionsBuilder({G_FNEG, G_FABS})
981     .legalFor(FPTypesPK16)
982     .clampMaxNumElementsStrict(0, S16, 2)
983     .scalarize(0)
984     .clampScalar(0, S16, S64);
985 
986   if (ST.has16BitInsts()) {
987     getActionDefinitionsBuilder(G_FSQRT)
988       .legalFor({S16})
989       .customFor({S32, S64})
990       .scalarize(0)
991       .unsupported();
992     getActionDefinitionsBuilder(G_FFLOOR)
993       .legalFor({S32, S64, S16})
994       .scalarize(0)
995       .clampScalar(0, S16, S64);
996 
997     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
998       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
999       .scalarize(0)
1000       .maxScalarIf(typeIs(0, S16), 1, S16)
1001       .clampScalar(1, S32, S32)
1002       .lower();
1003 
1004     getActionDefinitionsBuilder(G_FFREXP)
1005       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1006       .scalarize(0)
1007       .lower();
1008   } else {
1009     getActionDefinitionsBuilder(G_FSQRT)
1010       .customFor({S32, S64, S16})
1011       .scalarize(0)
1012       .unsupported();
1013 
1014 
1015     if (ST.hasFractBug()) {
1016       getActionDefinitionsBuilder(G_FFLOOR)
1017         .customFor({S64})
1018         .legalFor({S32, S64})
1019         .scalarize(0)
1020         .clampScalar(0, S32, S64);
1021     } else {
1022       getActionDefinitionsBuilder(G_FFLOOR)
1023         .legalFor({S32, S64})
1024         .scalarize(0)
1025         .clampScalar(0, S32, S64);
1026     }
1027 
1028     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1029       .legalFor({{S32, S32}, {S64, S32}})
1030       .scalarize(0)
1031       .clampScalar(0, S32, S64)
1032       .clampScalar(1, S32, S32)
1033       .lower();
1034 
1035     getActionDefinitionsBuilder(G_FFREXP)
1036       .customFor({{S32, S32}, {S64, S32}})
1037       .scalarize(0)
1038       .minScalar(0, S32)
1039       .clampScalar(1, S32, S32)
1040       .lower();
1041   }
1042 
1043   auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1044   if (ST.hasCvtPkF16F32Inst())
1045     FPTruncActions.legalFor(
1046         {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
1047   else
1048     FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1049   FPTruncActions.scalarize(0).lower();
1050 
1051   getActionDefinitionsBuilder(G_FPEXT)
1052     .legalFor({{S64, S32}, {S32, S16}})
1053     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1054     .scalarize(0);
1055 
1056   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1057   if (ST.has16BitInsts()) {
1058     FSubActions
1059       // Use actual fsub instruction
1060       .legalFor({S32, S16})
1061       // Must use fadd + fneg
1062       .lowerFor({S64, V2S16});
1063   } else {
1064     FSubActions
1065       // Use actual fsub instruction
1066       .legalFor({S32})
1067       // Must use fadd + fneg
1068       .lowerFor({S64, S16, V2S16});
1069   }
1070 
1071   FSubActions
1072     .scalarize(0)
1073     .clampScalar(0, S32, S64);
1074 
1075   // Whether this is legal depends on the floating point mode for the function.
1076   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1077   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1078     FMad.customFor({S32, S16});
1079   else if (ST.hasMadMacF32Insts())
1080     FMad.customFor({S32});
1081   else if (ST.hasMadF16())
1082     FMad.customFor({S16});
1083   FMad.scalarize(0)
1084       .lower();
1085 
1086   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1087   if (ST.has16BitInsts()) {
1088     FRem.customFor({S16, S32, S64});
1089   } else {
1090     FRem.minScalar(0, S32)
1091         .customFor({S32, S64});
1092   }
1093   FRem.scalarize(0);
1094 
1095   // TODO: Do we need to clamp maximum bitwidth?
1096   getActionDefinitionsBuilder(G_TRUNC)
1097     .legalIf(isScalar(0))
1098     .legalFor({{V2S16, V2S32}})
1099     .clampMaxNumElements(0, S16, 2)
1100     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1101     // situations (like an invalid implicit use), we don't want to infinite loop
1102     // in the legalizer.
1103     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1104     .alwaysLegal();
1105 
1106   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1107     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1108                {S32, S1}, {S64, S1}, {S16, S1}})
1109     .scalarize(0)
1110     .clampScalar(0, S32, S64)
1111     .widenScalarToNextPow2(1, 32);
1112 
1113   // TODO: Split s1->s64 during regbankselect for VALU.
1114   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1115                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1116                     .lowerIf(typeIs(1, S1))
1117                     .customFor({{S32, S64}, {S64, S64}});
1118   if (ST.has16BitInsts())
1119     IToFP.legalFor({{S16, S16}});
1120   IToFP.clampScalar(1, S32, S64)
1121        .minScalar(0, S32)
1122        .scalarize(0)
1123        .widenScalarToNextPow2(1);
1124 
1125   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1126     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1127     .customFor({{S64, S32}, {S64, S64}})
1128     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1129   if (ST.has16BitInsts())
1130     FPToI.legalFor({{S16, S16}});
1131   else
1132     FPToI.minScalar(1, S32);
1133 
1134   FPToI.minScalar(0, S32)
1135        .widenScalarToNextPow2(0, 32)
1136        .scalarize(0)
1137        .lower();
1138 
1139   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1140       .clampScalar(0, S16, S64)
1141       .scalarize(0)
1142       .lower();
1143 
1144   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1145       .legalFor({S16, S32})
1146       .scalarize(0)
1147       .lower();
1148 
1149   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1150   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1151       .scalarize(0)
1152       .lower();
1153 
1154   getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1155       .clampScalar(0, S16, S64)
1156       .scalarize(0)
1157       .lower();
1158 
1159   if (ST.has16BitInsts()) {
1160     getActionDefinitionsBuilder(
1161         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1162         .legalFor({S16, S32, S64})
1163         .clampScalar(0, S16, S64)
1164         .scalarize(0);
1165   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1166     getActionDefinitionsBuilder(
1167         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1168         .legalFor({S32, S64})
1169         .clampScalar(0, S32, S64)
1170         .scalarize(0);
1171   } else {
1172     getActionDefinitionsBuilder(
1173         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1174         .legalFor({S32})
1175         .customFor({S64})
1176         .clampScalar(0, S32, S64)
1177         .scalarize(0);
1178   }
1179 
1180   getActionDefinitionsBuilder(G_PTR_ADD)
1181       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1182       .legalIf(all(isPointer(0), sameSize(0, 1)))
1183       .scalarize(0)
1184       .scalarSameSizeAs(1, 0);
1185 
1186   getActionDefinitionsBuilder(G_PTRMASK)
1187     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1188     .scalarSameSizeAs(1, 0)
1189     .scalarize(0);
1190 
1191   auto &CmpBuilder =
1192     getActionDefinitionsBuilder(G_ICMP)
1193     // The compare output type differs based on the register bank of the output,
1194     // so make both s1 and s32 legal.
1195     //
1196     // Scalar compares producing output in scc will be promoted to s32, as that
1197     // is the allocatable register type that will be needed for the copy from
1198     // scc. This will be promoted during RegBankSelect, and we assume something
1199     // before that won't try to use s32 result types.
1200     //
1201     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1202     // bank.
1203     .legalForCartesianProduct(
1204       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1205     .legalForCartesianProduct(
1206       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1207   if (ST.has16BitInsts()) {
1208     CmpBuilder.legalFor({{S1, S16}});
1209   }
1210 
1211   CmpBuilder
1212     .widenScalarToNextPow2(1)
1213     .clampScalar(1, S32, S64)
1214     .scalarize(0)
1215     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1216 
1217   auto &FCmpBuilder =
1218       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1219           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1220 
1221   if (ST.hasSALUFloatInsts())
1222     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1223 
1224   FCmpBuilder
1225     .widenScalarToNextPow2(1)
1226     .clampScalar(1, S32, S64)
1227     .scalarize(0);
1228 
1229   // FIXME: fpow has a selection pattern that should move to custom lowering.
1230   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1231   if (ST.has16BitInsts())
1232     ExpOps.customFor({{S32}, {S16}});
1233   else
1234     ExpOps.customFor({S32});
1235   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1236         .scalarize(0);
1237 
1238   getActionDefinitionsBuilder(G_FPOWI)
1239     .clampScalar(0, MinScalarFPTy, S32)
1240     .lower();
1241 
1242   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1243   Log2Ops.customFor({S32});
1244   if (ST.has16BitInsts())
1245     Log2Ops.legalFor({S16});
1246   else
1247     Log2Ops.customFor({S16});
1248   Log2Ops.scalarize(0)
1249     .lower();
1250 
1251   auto &LogOps =
1252       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1253   LogOps.customFor({S32, S16});
1254   LogOps.clampScalar(0, MinScalarFPTy, S32)
1255         .scalarize(0);
1256 
1257   // The 64-bit versions produce 32-bit results, but only on the SALU.
1258   getActionDefinitionsBuilder(G_CTPOP)
1259     .legalFor({{S32, S32}, {S32, S64}})
1260     .clampScalar(0, S32, S32)
1261     .widenScalarToNextPow2(1, 32)
1262     .clampScalar(1, S32, S64)
1263     .scalarize(0)
1264     .widenScalarToNextPow2(0, 32);
1265 
1266   // If no 16-bit instr is available, lower into different instructions.
1267   if (ST.has16BitInsts())
1268     getActionDefinitionsBuilder(G_IS_FPCLASS)
1269         .legalForCartesianProduct({S1}, FPTypes16)
1270         .widenScalarToNextPow2(1)
1271         .scalarize(0)
1272         .lower();
1273   else
1274     getActionDefinitionsBuilder(G_IS_FPCLASS)
1275         .legalForCartesianProduct({S1}, FPTypesBase)
1276         .lowerFor({S1, S16})
1277         .widenScalarToNextPow2(1)
1278         .scalarize(0)
1279         .lower();
1280 
1281   // The hardware instructions return a different result on 0 than the generic
1282   // instructions expect. The hardware produces -1, but these produce the
1283   // bitwidth.
1284   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1285     .scalarize(0)
1286     .clampScalar(0, S32, S32)
1287     .clampScalar(1, S32, S64)
1288     .widenScalarToNextPow2(0, 32)
1289     .widenScalarToNextPow2(1, 32)
1290     .custom();
1291 
1292   // The 64-bit versions produce 32-bit results, but only on the SALU.
1293   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1294       .legalFor({{S32, S32}, {S32, S64}})
1295       .customIf(scalarNarrowerThan(1, 32))
1296       .clampScalar(0, S32, S32)
1297       .clampScalar(1, S32, S64)
1298       .scalarize(0)
1299       .widenScalarToNextPow2(0, 32)
1300       .widenScalarToNextPow2(1, 32);
1301 
1302   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1303       .legalFor({{S32, S32}, {S32, S64}})
1304       .clampScalar(0, S32, S32)
1305       .clampScalar(1, S32, S64)
1306       .scalarize(0)
1307       .widenScalarToNextPow2(0, 32)
1308       .widenScalarToNextPow2(1, 32);
1309 
1310   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1311   // RegBankSelect.
1312   getActionDefinitionsBuilder(G_BITREVERSE)
1313     .legalFor({S32, S64})
1314     .clampScalar(0, S32, S64)
1315     .scalarize(0)
1316     .widenScalarToNextPow2(0);
1317 
1318   if (ST.has16BitInsts()) {
1319     getActionDefinitionsBuilder(G_BSWAP)
1320       .legalFor({S16, S32, V2S16})
1321       .clampMaxNumElementsStrict(0, S16, 2)
1322       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1323       // narrowScalar limitation.
1324       .widenScalarToNextPow2(0)
1325       .clampScalar(0, S16, S32)
1326       .scalarize(0);
1327 
1328     if (ST.hasVOP3PInsts()) {
1329       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1330         .legalFor({S32, S16, V2S16})
1331         .clampMaxNumElements(0, S16, 2)
1332         .minScalar(0, S16)
1333         .widenScalarToNextPow2(0)
1334         .scalarize(0)
1335         .lower();
1336     } else {
1337       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1338         .legalFor({S32, S16})
1339         .widenScalarToNextPow2(0)
1340         .minScalar(0, S16)
1341         .scalarize(0)
1342         .lower();
1343     }
1344   } else {
1345     // TODO: Should have same legality without v_perm_b32
1346     getActionDefinitionsBuilder(G_BSWAP)
1347       .legalFor({S32})
1348       .lowerIf(scalarNarrowerThan(0, 32))
1349       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1350       // narrowScalar limitation.
1351       .widenScalarToNextPow2(0)
1352       .maxScalar(0, S32)
1353       .scalarize(0)
1354       .lower();
1355 
1356     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1357       .legalFor({S32})
1358       .minScalar(0, S32)
1359       .widenScalarToNextPow2(0)
1360       .scalarize(0)
1361       .lower();
1362   }
1363 
1364   getActionDefinitionsBuilder(G_INTTOPTR)
1365       // List the common cases
1366       .legalForCartesianProduct(AddrSpaces64, {S64})
1367       .legalForCartesianProduct(AddrSpaces32, {S32})
1368       .scalarize(0)
1369       // Accept any address space as long as the size matches
1370       .legalIf(sameSize(0, 1))
1371       .widenScalarIf(smallerThan(1, 0),
1372                      [](const LegalityQuery &Query) {
1373                        return std::pair(
1374                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1375                      })
1376       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1377         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1378       });
1379 
1380   getActionDefinitionsBuilder(G_PTRTOINT)
1381       // List the common cases
1382       .legalForCartesianProduct(AddrSpaces64, {S64})
1383       .legalForCartesianProduct(AddrSpaces32, {S32})
1384       .scalarize(0)
1385       // Accept any address space as long as the size matches
1386       .legalIf(sameSize(0, 1))
1387       .widenScalarIf(smallerThan(0, 1),
1388                      [](const LegalityQuery &Query) {
1389                        return std::pair(
1390                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1391                      })
1392       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1393         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1394       });
1395 
1396   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1397     .scalarize(0)
1398     .custom();
1399 
1400   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1401                                     bool IsLoad) -> bool {
1402     const LLT DstTy = Query.Types[0];
1403 
1404     // Split vector extloads.
1405     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1406 
1407     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1408       return true;
1409 
1410     const LLT PtrTy = Query.Types[1];
1411     unsigned AS = PtrTy.getAddressSpace();
1412     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1413                                       Query.MMODescrs[0].Ordering !=
1414                                           AtomicOrdering::NotAtomic))
1415       return true;
1416 
1417     // Catch weird sized loads that don't evenly divide into the access sizes
1418     // TODO: May be able to widen depending on alignment etc.
1419     unsigned NumRegs = (MemSize + 31) / 32;
1420     if (NumRegs == 3) {
1421       if (!ST.hasDwordx3LoadStores())
1422         return true;
1423     } else {
1424       // If the alignment allows, these should have been widened.
1425       if (!isPowerOf2_32(NumRegs))
1426         return true;
1427     }
1428 
1429     return false;
1430   };
1431 
1432   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1433   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1434   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1435 
1436   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1437   // LDS
1438   // TODO: Unsupported flat for SI.
1439 
1440   for (unsigned Op : {G_LOAD, G_STORE}) {
1441     const bool IsStore = Op == G_STORE;
1442 
1443     auto &Actions = getActionDefinitionsBuilder(Op);
1444     // Explicitly list some common cases.
1445     // TODO: Does this help compile time at all?
1446     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1447                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1448                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1449                                       {S64, GlobalPtr, S64, GlobalAlign32},
1450                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1451                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1452                                       {S32, GlobalPtr, S8, GlobalAlign8},
1453                                       {S32, GlobalPtr, S16, GlobalAlign16},
1454 
1455                                       {S32, LocalPtr, S32, 32},
1456                                       {S64, LocalPtr, S64, 32},
1457                                       {V2S32, LocalPtr, V2S32, 32},
1458                                       {S32, LocalPtr, S8, 8},
1459                                       {S32, LocalPtr, S16, 16},
1460                                       {V2S16, LocalPtr, S32, 32},
1461 
1462                                       {S32, PrivatePtr, S32, 32},
1463                                       {S32, PrivatePtr, S8, 8},
1464                                       {S32, PrivatePtr, S16, 16},
1465                                       {V2S16, PrivatePtr, S32, 32},
1466 
1467                                       {S32, ConstantPtr, S32, GlobalAlign32},
1468                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1469                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1470                                       {S64, ConstantPtr, S64, GlobalAlign32},
1471                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1472     Actions.legalIf(
1473       [=](const LegalityQuery &Query) -> bool {
1474         return isLoadStoreLegal(ST, Query);
1475       });
1476 
1477     // The custom pointers (fat pointers, buffer resources) don't work with load
1478     // and store at this level. Fat pointers should have been lowered to
1479     // intrinsics before the translation to MIR.
1480     Actions.unsupportedIf(
1481         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1482 
1483     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1484     // ptrtoint. This is needed to account for the fact that we can't have i128
1485     // as a register class for SelectionDAG reasons.
1486     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1487       return hasBufferRsrcWorkaround(Query.Types[0]);
1488     });
1489 
1490     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1491     // 64-bits.
1492     //
1493     // TODO: Should generalize bitcast action into coerce, which will also cover
1494     // inserting addrspacecasts.
1495     Actions.customIf(typeIs(1, Constant32Ptr));
1496 
1497     // Turn any illegal element vectors into something easier to deal
1498     // with. These will ultimately produce 32-bit scalar shifts to extract the
1499     // parts anyway.
1500     //
1501     // For odd 16-bit element vectors, prefer to split those into pieces with
1502     // 16-bit vector parts.
1503     Actions.bitcastIf(
1504       [=](const LegalityQuery &Query) -> bool {
1505         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1506                                           Query.MMODescrs[0].MemoryTy);
1507       }, bitcastToRegisterType(0));
1508 
1509     if (!IsStore) {
1510       // Widen suitably aligned loads by loading extra bytes. The standard
1511       // legalization actions can't properly express widening memory operands.
1512       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1513         return shouldWidenLoad(ST, Query, G_LOAD);
1514       });
1515     }
1516 
1517     // FIXME: load/store narrowing should be moved to lower action
1518     Actions
1519         .narrowScalarIf(
1520             [=](const LegalityQuery &Query) -> bool {
1521               return !Query.Types[0].isVector() &&
1522                      needToSplitMemOp(Query, Op == G_LOAD);
1523             },
1524             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1525               const LLT DstTy = Query.Types[0];
1526               const LLT PtrTy = Query.Types[1];
1527 
1528               const unsigned DstSize = DstTy.getSizeInBits();
1529               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1530 
1531               // Split extloads.
1532               if (DstSize > MemSize)
1533                 return std::pair(0, LLT::scalar(MemSize));
1534 
1535               unsigned MaxSize = maxSizeForAddrSpace(
1536                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1537                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1538               if (MemSize > MaxSize)
1539                 return std::pair(0, LLT::scalar(MaxSize));
1540 
1541               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1542               return std::pair(0, LLT::scalar(Align));
1543             })
1544         .fewerElementsIf(
1545             [=](const LegalityQuery &Query) -> bool {
1546               return Query.Types[0].isVector() &&
1547                      needToSplitMemOp(Query, Op == G_LOAD);
1548             },
1549             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1550               const LLT DstTy = Query.Types[0];
1551               const LLT PtrTy = Query.Types[1];
1552 
1553               LLT EltTy = DstTy.getElementType();
1554               unsigned MaxSize = maxSizeForAddrSpace(
1555                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1556                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1557 
1558               // FIXME: Handle widened to power of 2 results better. This ends
1559               // up scalarizing.
1560               // FIXME: 3 element stores scalarized on SI
1561 
1562               // Split if it's too large for the address space.
1563               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1564               if (MemSize > MaxSize) {
1565                 unsigned NumElts = DstTy.getNumElements();
1566                 unsigned EltSize = EltTy.getSizeInBits();
1567 
1568                 if (MaxSize % EltSize == 0) {
1569                   return std::pair(
1570                       0, LLT::scalarOrVector(
1571                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1572                 }
1573 
1574                 unsigned NumPieces = MemSize / MaxSize;
1575 
1576                 // FIXME: Refine when odd breakdowns handled
1577                 // The scalars will need to be re-legalized.
1578                 if (NumPieces == 1 || NumPieces >= NumElts ||
1579                     NumElts % NumPieces != 0)
1580                   return std::pair(0, EltTy);
1581 
1582                 return std::pair(0,
1583                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1584               }
1585 
1586               // FIXME: We could probably handle weird extending loads better.
1587               if (DstTy.getSizeInBits() > MemSize)
1588                 return std::pair(0, EltTy);
1589 
1590               unsigned EltSize = EltTy.getSizeInBits();
1591               unsigned DstSize = DstTy.getSizeInBits();
1592               if (!isPowerOf2_32(DstSize)) {
1593                 // We're probably decomposing an odd sized store. Try to split
1594                 // to the widest type. TODO: Account for alignment. As-is it
1595                 // should be OK, since the new parts will be further legalized.
1596                 unsigned FloorSize = llvm::bit_floor(DstSize);
1597                 return std::pair(
1598                     0, LLT::scalarOrVector(
1599                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1600               }
1601 
1602               // May need relegalization for the scalars.
1603               return std::pair(0, EltTy);
1604             })
1605     .minScalar(0, S32)
1606     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1607     .widenScalarToNextPow2(0)
1608     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1609     .lower();
1610   }
1611 
1612   // FIXME: Unaligned accesses not lowered.
1613   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1614                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1615                                                   {S32, GlobalPtr, S16, 2 * 8},
1616                                                   {S32, LocalPtr, S8, 8},
1617                                                   {S32, LocalPtr, S16, 16},
1618                                                   {S32, PrivatePtr, S8, 8},
1619                                                   {S32, PrivatePtr, S16, 16},
1620                                                   {S32, ConstantPtr, S8, 8},
1621                                                   {S32, ConstantPtr, S16, 2 * 8}})
1622                        .legalIf(
1623                          [=](const LegalityQuery &Query) -> bool {
1624                            return isLoadStoreLegal(ST, Query);
1625                          });
1626 
1627   if (ST.hasFlatAddressSpace()) {
1628     ExtLoads.legalForTypesWithMemDesc(
1629         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1630   }
1631 
1632   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1633   // 64-bits.
1634   //
1635   // TODO: Should generalize bitcast action into coerce, which will also cover
1636   // inserting addrspacecasts.
1637   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1638 
1639   ExtLoads.clampScalar(0, S32, S32)
1640           .widenScalarToNextPow2(0)
1641           .lower();
1642 
1643   auto &Atomics = getActionDefinitionsBuilder(
1644     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1645      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1646      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1647      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1648     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1649                {S64, GlobalPtr}, {S64, LocalPtr},
1650                {S32, RegionPtr}, {S64, RegionPtr}});
1651   if (ST.hasFlatAddressSpace()) {
1652     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1653   }
1654 
1655   // TODO: v2bf16 operations, and fat buffer pointer support.
1656   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1657   if (ST.hasLDSFPAtomicAddF32()) {
1658     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1659     if (ST.hasLdsAtomicAddF64())
1660       Atomic.legalFor({{S64, LocalPtr}});
1661     if (ST.hasAtomicDsPkAdd16Insts())
1662       Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1663   }
1664   if (ST.hasAtomicFaddInsts())
1665     Atomic.legalFor({{S32, GlobalPtr}});
1666   if (ST.hasFlatAtomicFaddF32Inst())
1667     Atomic.legalFor({{S32, FlatPtr}});
1668 
1669   if (ST.hasGFX90AInsts()) {
1670     // These are legal with some caveats and should have undergone expansion in
1671     // the IR in most situations.
1672     // TODO: Move atomic expansion into legalizer
1673     Atomic.legalFor({
1674         {S32, GlobalPtr},
1675         {S64, GlobalPtr},
1676         {S64, FlatPtr}
1677       });
1678   }
1679 
1680   if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1681       ST.hasAtomicBufferGlobalPkAddF16Insts())
1682     Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1683   if (ST.hasAtomicGlobalPkAddBF16Inst())
1684     Atomic.legalFor({{V2BF16, GlobalPtr}});
1685   if (ST.hasAtomicFlatPkAdd16Insts())
1686     Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1687 
1688 
1689   // Most of the legalization work here is done by AtomicExpand. We could
1690   // probably use a simpler legality rule that just assumes anything is OK.
1691   auto &AtomicFMinFMax =
1692     getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1693     .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1694 
1695   if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1696     AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
1697   if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1698     AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1699   if (ST.hasAtomicFMinFMaxF32FlatInsts())
1700     AtomicFMinFMax.legalFor({{F32, FlatPtr}});
1701   if (ST.hasAtomicFMinFMaxF64FlatInsts())
1702     AtomicFMinFMax.legalFor({{F64, FlatPtr}});
1703 
1704   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1705   // demarshalling.
1706   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1707     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1708                 {S32, FlatPtr}, {S64, FlatPtr}})
1709     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1710                {S32, RegionPtr}, {S64, RegionPtr}});
1711   // TODO: Pointer types, any 32-bit or 64-bit vector
1712 
1713   // Condition should be s32 for scalar, s1 for vector.
1714   getActionDefinitionsBuilder(G_SELECT)
1715       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1716                                  LocalPtr, FlatPtr, PrivatePtr,
1717                                  LLT::fixed_vector(2, LocalPtr),
1718                                  LLT::fixed_vector(2, PrivatePtr)},
1719                                 {S1, S32})
1720       .clampScalar(0, S16, S64)
1721       .scalarize(1)
1722       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1723       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1724       .clampMaxNumElements(0, S32, 2)
1725       .clampMaxNumElements(0, LocalPtr, 2)
1726       .clampMaxNumElements(0, PrivatePtr, 2)
1727       .scalarize(0)
1728       .widenScalarToNextPow2(0)
1729       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1730 
1731   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1732   // be more flexible with the shift amount type.
1733   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1734     .legalFor({{S32, S32}, {S64, S32}});
1735   if (ST.has16BitInsts()) {
1736     if (ST.hasVOP3PInsts()) {
1737       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1738             .clampMaxNumElements(0, S16, 2);
1739     } else
1740       Shifts.legalFor({{S16, S16}});
1741 
1742     // TODO: Support 16-bit shift amounts for all types
1743     Shifts.widenScalarIf(
1744       [=](const LegalityQuery &Query) {
1745         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1746         // 32-bit amount.
1747         const LLT ValTy = Query.Types[0];
1748         const LLT AmountTy = Query.Types[1];
1749         return ValTy.getSizeInBits() <= 16 &&
1750                AmountTy.getSizeInBits() < 16;
1751       }, changeTo(1, S16));
1752     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1753     Shifts.clampScalar(1, S32, S32);
1754     Shifts.widenScalarToNextPow2(0, 16);
1755     Shifts.clampScalar(0, S16, S64);
1756 
1757     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1758       .minScalar(0, S16)
1759       .scalarize(0)
1760       .lower();
1761   } else {
1762     // Make sure we legalize the shift amount type first, as the general
1763     // expansion for the shifted type will produce much worse code if it hasn't
1764     // been truncated already.
1765     Shifts.clampScalar(1, S32, S32);
1766     Shifts.widenScalarToNextPow2(0, 32);
1767     Shifts.clampScalar(0, S32, S64);
1768 
1769     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1770       .minScalar(0, S32)
1771       .scalarize(0)
1772       .lower();
1773   }
1774   Shifts.scalarize(0);
1775 
1776   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1777     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1778     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1779     unsigned IdxTypeIdx = 2;
1780 
1781     getActionDefinitionsBuilder(Op)
1782       .customIf([=](const LegalityQuery &Query) {
1783           const LLT EltTy = Query.Types[EltTypeIdx];
1784           const LLT VecTy = Query.Types[VecTypeIdx];
1785           const LLT IdxTy = Query.Types[IdxTypeIdx];
1786           const unsigned EltSize = EltTy.getSizeInBits();
1787           const bool isLegalVecType =
1788               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1789           // Address space 8 pointers are 128-bit wide values, but the logic
1790           // below will try to bitcast them to 2N x s64, which will fail.
1791           // Therefore, as an intermediate step, ptrtoint the vector and scalar
1792           // arguments (and inttoptr the extraction result) so that the
1793           // extract/insert becomes a vector operation on integers that can
1794           // be handled by the logic below.
1795           if (EltTy.isPointer() && EltSize > 64)
1796             return true;
1797           return (EltSize == 32 || EltSize == 64) &&
1798                   VecTy.getSizeInBits() % 32 == 0 &&
1799                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1800                   IdxTy.getSizeInBits() == 32 &&
1801                   isLegalVecType;
1802         })
1803       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1804                  bitcastToVectorElement32(VecTypeIdx))
1805       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1806       .bitcastIf(
1807         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1808         [=](const LegalityQuery &Query) {
1809           // For > 64-bit element types, try to turn this into a 64-bit
1810           // element vector since we may be able to do better indexing
1811           // if this is scalar. If not, fall back to 32.
1812           const LLT EltTy = Query.Types[EltTypeIdx];
1813           const LLT VecTy = Query.Types[VecTypeIdx];
1814           const unsigned DstEltSize = EltTy.getSizeInBits();
1815           const unsigned VecSize = VecTy.getSizeInBits();
1816 
1817           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1818           return std::pair(
1819               VecTypeIdx,
1820               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1821         })
1822       .clampScalar(EltTypeIdx, S32, S64)
1823       .clampScalar(VecTypeIdx, S32, S64)
1824       .clampScalar(IdxTypeIdx, S32, S32)
1825       .clampMaxNumElements(VecTypeIdx, S32, 32)
1826       // TODO: Clamp elements for 64-bit vectors?
1827       .moreElementsIf(
1828         isIllegalRegisterType(VecTypeIdx),
1829         moreElementsToNextExistingRegClass(VecTypeIdx))
1830       // It should only be necessary with variable indexes.
1831       // As a last resort, lower to the stack
1832       .lower();
1833   }
1834 
1835   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1836     .unsupportedIf([=](const LegalityQuery &Query) {
1837         const LLT &EltTy = Query.Types[1].getElementType();
1838         return Query.Types[0] != EltTy;
1839       });
1840 
1841   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1842     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1843     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1844 
1845     // FIXME: Doesn't handle extract of illegal sizes.
1846     getActionDefinitionsBuilder(Op)
1847       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1848       .lowerIf([=](const LegalityQuery &Query) {
1849           // Sub-vector (or single-element) insert and extract.
1850           // TODO: verify immediate offset here since lower only works with
1851           // whole elements.
1852           const LLT BigTy = Query.Types[BigTyIdx];
1853           return BigTy.isVector();
1854         })
1855       // FIXME: Multiples of 16 should not be legal.
1856       .legalIf([=](const LegalityQuery &Query) {
1857           const LLT BigTy = Query.Types[BigTyIdx];
1858           const LLT LitTy = Query.Types[LitTyIdx];
1859           return (BigTy.getSizeInBits() % 32 == 0) &&
1860                  (LitTy.getSizeInBits() % 16 == 0);
1861         })
1862       .widenScalarIf(
1863         [=](const LegalityQuery &Query) {
1864           const LLT BigTy = Query.Types[BigTyIdx];
1865           return (BigTy.getScalarSizeInBits() < 16);
1866         },
1867         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1868       .widenScalarIf(
1869         [=](const LegalityQuery &Query) {
1870           const LLT LitTy = Query.Types[LitTyIdx];
1871           return (LitTy.getScalarSizeInBits() < 16);
1872         },
1873         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1874       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1875       .widenScalarToNextPow2(BigTyIdx, 32);
1876 
1877   }
1878 
1879   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1880     .legalForCartesianProduct(AllS32Vectors, {S32})
1881     .legalForCartesianProduct(AllS64Vectors, {S64})
1882     .clampNumElements(0, V16S32, V32S32)
1883     .clampNumElements(0, V2S64, V16S64)
1884     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1885     .moreElementsIf(
1886       isIllegalRegisterType(0),
1887       moreElementsToNextExistingRegClass(0));
1888 
1889   if (ST.hasScalarPackInsts()) {
1890     BuildVector
1891       // FIXME: Should probably widen s1 vectors straight to s32
1892       .minScalarOrElt(0, S16)
1893       .minScalar(1, S16);
1894 
1895     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1896       .legalFor({V2S16, S32})
1897       .lower();
1898   } else {
1899     BuildVector.customFor({V2S16, S16});
1900     BuildVector.minScalarOrElt(0, S32);
1901 
1902     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1903       .customFor({V2S16, S32})
1904       .lower();
1905   }
1906 
1907   BuildVector.legalIf(isRegisterType(0));
1908 
1909   // FIXME: Clamp maximum size
1910   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1911     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1912     .clampMaxNumElements(0, S32, 32)
1913     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1914     .clampMaxNumElements(0, S16, 64);
1915 
1916   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1917 
1918   // Merge/Unmerge
1919   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1920     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1921     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1922 
1923     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1924       const LLT Ty = Query.Types[TypeIdx];
1925       if (Ty.isVector()) {
1926         const LLT &EltTy = Ty.getElementType();
1927         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1928           return true;
1929         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1930           return true;
1931       }
1932       return false;
1933     };
1934 
1935     auto &Builder = getActionDefinitionsBuilder(Op)
1936       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1937       .lowerFor({{S16, V2S16}})
1938       .lowerIf([=](const LegalityQuery &Query) {
1939           const LLT BigTy = Query.Types[BigTyIdx];
1940           return BigTy.getSizeInBits() == 32;
1941         })
1942       // Try to widen to s16 first for small types.
1943       // TODO: Only do this on targets with legal s16 shifts
1944       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1945       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1946       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1947       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1948                            elementTypeIs(1, S16)),
1949                        changeTo(1, V2S16))
1950       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1951       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1952       // valid.
1953       .clampScalar(LitTyIdx, S32, S512)
1954       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1955       // Break up vectors with weird elements into scalars
1956       .fewerElementsIf(
1957         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1958         scalarize(0))
1959       .fewerElementsIf(
1960         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1961         scalarize(1))
1962       .clampScalar(BigTyIdx, S32, MaxScalar);
1963 
1964     if (Op == G_MERGE_VALUES) {
1965       Builder.widenScalarIf(
1966         // TODO: Use 16-bit shifts if legal for 8-bit values?
1967         [=](const LegalityQuery &Query) {
1968           const LLT Ty = Query.Types[LitTyIdx];
1969           return Ty.getSizeInBits() < 32;
1970         },
1971         changeTo(LitTyIdx, S32));
1972     }
1973 
1974     Builder.widenScalarIf(
1975       [=](const LegalityQuery &Query) {
1976         const LLT Ty = Query.Types[BigTyIdx];
1977         return Ty.getSizeInBits() % 16 != 0;
1978       },
1979       [=](const LegalityQuery &Query) {
1980         // Pick the next power of 2, or a multiple of 64 over 128, whichever is
1981         // smaller.
1982         const LLT &Ty = Query.Types[BigTyIdx];
1983         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1984         if (NewSizeInBits >= 256) {
1985           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1986           if (RoundedTo < NewSizeInBits)
1987             NewSizeInBits = RoundedTo;
1988         }
1989         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1990       })
1991       // Any vectors left are the wrong size. Scalarize them.
1992       .scalarize(0)
1993       .scalarize(1);
1994   }
1995 
1996   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1997   // RegBankSelect.
1998   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1999     .legalFor({{S32}, {S64}});
2000 
2001   if (ST.hasVOP3PInsts()) {
2002     SextInReg.lowerFor({{V2S16}})
2003       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2004       // get more vector shift opportunities, since we'll get those when
2005       // expanded.
2006       .clampMaxNumElementsStrict(0, S16, 2);
2007   } else if (ST.has16BitInsts()) {
2008     SextInReg.lowerFor({{S32}, {S64}, {S16}});
2009   } else {
2010     // Prefer to promote to s32 before lowering if we don't have 16-bit
2011     // shifts. This avoids a lot of intermediate truncate and extend operations.
2012     SextInReg.lowerFor({{S32}, {S64}});
2013   }
2014 
2015   SextInReg
2016     .scalarize(0)
2017     .clampScalar(0, S32, S64)
2018     .lower();
2019 
2020   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2021     .scalarize(0)
2022     .lower();
2023 
2024   // TODO: Only try to form v2s16 with legal packed instructions.
2025   getActionDefinitionsBuilder(G_FSHR)
2026     .legalFor({{S32, S32}})
2027     .lowerFor({{V2S16, V2S16}})
2028     .clampMaxNumElementsStrict(0, S16, 2)
2029     .scalarize(0)
2030     .lower();
2031 
2032   if (ST.hasVOP3PInsts()) {
2033     getActionDefinitionsBuilder(G_FSHL)
2034       .lowerFor({{V2S16, V2S16}})
2035       .clampMaxNumElementsStrict(0, S16, 2)
2036       .scalarize(0)
2037       .lower();
2038   } else {
2039     getActionDefinitionsBuilder(G_FSHL)
2040       .scalarize(0)
2041       .lower();
2042   }
2043 
2044   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2045     .legalFor({S64});
2046 
2047   getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2048 
2049   getActionDefinitionsBuilder(G_FENCE)
2050     .alwaysLegal();
2051 
2052   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2053       .scalarize(0)
2054       .minScalar(0, S32)
2055       .lower();
2056 
2057   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2058       .legalFor({{S32, S32}, {S64, S32}})
2059       .clampScalar(1, S32, S32)
2060       .clampScalar(0, S32, S64)
2061       .widenScalarToNextPow2(0)
2062       .scalarize(0);
2063 
2064   getActionDefinitionsBuilder(
2065       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2066        G_FCOPYSIGN,
2067 
2068        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2069        G_READ_REGISTER, G_WRITE_REGISTER,
2070 
2071        G_SADDO, G_SSUBO})
2072       .lower();
2073 
2074   if (ST.hasIEEEMinMax()) {
2075     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2076         .legalFor(FPTypesPK16)
2077         .clampMaxNumElements(0, S16, 2)
2078         .scalarize(0);
2079   } else {
2080     // TODO: Implement
2081     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2082   }
2083 
2084   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2085       .lower();
2086 
2087   getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2088 
2089   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2090         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2091         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2092     .unsupported();
2093 
2094   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2095 
2096   getLegacyLegalizerInfo().computeTables();
2097   verify(*ST.getInstrInfo());
2098 }
2099 
2100 bool AMDGPULegalizerInfo::legalizeCustom(
2101     LegalizerHelper &Helper, MachineInstr &MI,
2102     LostDebugLocObserver &LocObserver) const {
2103   MachineIRBuilder &B = Helper.MIRBuilder;
2104   MachineRegisterInfo &MRI = *B.getMRI();
2105 
2106   switch (MI.getOpcode()) {
2107   case TargetOpcode::G_ADDRSPACE_CAST:
2108     return legalizeAddrSpaceCast(MI, MRI, B);
2109   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2110     return legalizeFroundeven(MI, MRI, B);
2111   case TargetOpcode::G_FCEIL:
2112     return legalizeFceil(MI, MRI, B);
2113   case TargetOpcode::G_FREM:
2114     return legalizeFrem(MI, MRI, B);
2115   case TargetOpcode::G_INTRINSIC_TRUNC:
2116     return legalizeIntrinsicTrunc(MI, MRI, B);
2117   case TargetOpcode::G_SITOFP:
2118     return legalizeITOFP(MI, MRI, B, true);
2119   case TargetOpcode::G_UITOFP:
2120     return legalizeITOFP(MI, MRI, B, false);
2121   case TargetOpcode::G_FPTOSI:
2122     return legalizeFPTOI(MI, MRI, B, true);
2123   case TargetOpcode::G_FPTOUI:
2124     return legalizeFPTOI(MI, MRI, B, false);
2125   case TargetOpcode::G_FMINNUM:
2126   case TargetOpcode::G_FMAXNUM:
2127   case TargetOpcode::G_FMINNUM_IEEE:
2128   case TargetOpcode::G_FMAXNUM_IEEE:
2129     return legalizeMinNumMaxNum(Helper, MI);
2130   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2131     return legalizeExtractVectorElt(MI, MRI, B);
2132   case TargetOpcode::G_INSERT_VECTOR_ELT:
2133     return legalizeInsertVectorElt(MI, MRI, B);
2134   case TargetOpcode::G_FSIN:
2135   case TargetOpcode::G_FCOS:
2136     return legalizeSinCos(MI, MRI, B);
2137   case TargetOpcode::G_GLOBAL_VALUE:
2138     return legalizeGlobalValue(MI, MRI, B);
2139   case TargetOpcode::G_LOAD:
2140   case TargetOpcode::G_SEXTLOAD:
2141   case TargetOpcode::G_ZEXTLOAD:
2142     return legalizeLoad(Helper, MI);
2143   case TargetOpcode::G_STORE:
2144     return legalizeStore(Helper, MI);
2145   case TargetOpcode::G_FMAD:
2146     return legalizeFMad(MI, MRI, B);
2147   case TargetOpcode::G_FDIV:
2148     return legalizeFDIV(MI, MRI, B);
2149   case TargetOpcode::G_FFREXP:
2150     return legalizeFFREXP(MI, MRI, B);
2151   case TargetOpcode::G_FSQRT:
2152     return legalizeFSQRT(MI, MRI, B);
2153   case TargetOpcode::G_UDIV:
2154   case TargetOpcode::G_UREM:
2155   case TargetOpcode::G_UDIVREM:
2156     return legalizeUnsignedDIV_REM(MI, MRI, B);
2157   case TargetOpcode::G_SDIV:
2158   case TargetOpcode::G_SREM:
2159   case TargetOpcode::G_SDIVREM:
2160     return legalizeSignedDIV_REM(MI, MRI, B);
2161   case TargetOpcode::G_ATOMIC_CMPXCHG:
2162     return legalizeAtomicCmpXChg(MI, MRI, B);
2163   case TargetOpcode::G_FLOG2:
2164     return legalizeFlog2(MI, B);
2165   case TargetOpcode::G_FLOG:
2166   case TargetOpcode::G_FLOG10:
2167     return legalizeFlogCommon(MI, B);
2168   case TargetOpcode::G_FEXP2:
2169     return legalizeFExp2(MI, B);
2170   case TargetOpcode::G_FEXP:
2171   case TargetOpcode::G_FEXP10:
2172     return legalizeFExp(MI, B);
2173   case TargetOpcode::G_FPOW:
2174     return legalizeFPow(MI, B);
2175   case TargetOpcode::G_FFLOOR:
2176     return legalizeFFloor(MI, MRI, B);
2177   case TargetOpcode::G_BUILD_VECTOR:
2178   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2179     return legalizeBuildVector(MI, MRI, B);
2180   case TargetOpcode::G_MUL:
2181     return legalizeMul(Helper, MI);
2182   case TargetOpcode::G_CTLZ:
2183   case TargetOpcode::G_CTTZ:
2184     return legalizeCTLZ_CTTZ(MI, MRI, B);
2185   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2186     return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2187   case TargetOpcode::G_STACKSAVE:
2188     return legalizeStackSave(MI, B);
2189   case TargetOpcode::G_GET_FPENV:
2190     return legalizeGetFPEnv(MI, MRI, B);
2191   case TargetOpcode::G_SET_FPENV:
2192     return legalizeSetFPEnv(MI, MRI, B);
2193   case TargetOpcode::G_TRAP:
2194     return legalizeTrap(MI, MRI, B);
2195   case TargetOpcode::G_DEBUGTRAP:
2196     return legalizeDebugTrap(MI, MRI, B);
2197   default:
2198     return false;
2199   }
2200 
2201   llvm_unreachable("expected switch to return");
2202 }
2203 
2204 Register AMDGPULegalizerInfo::getSegmentAperture(
2205   unsigned AS,
2206   MachineRegisterInfo &MRI,
2207   MachineIRBuilder &B) const {
2208   MachineFunction &MF = B.getMF();
2209   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2210   const LLT S32 = LLT::scalar(32);
2211   const LLT S64 = LLT::scalar(64);
2212 
2213   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2214 
2215   if (ST.hasApertureRegs()) {
2216     // Note: this register is somewhat broken. When used as a 32-bit operand,
2217     // it only returns zeroes. The real value is in the upper 32 bits.
2218     // Thus, we must extract the high 32 bits.
2219     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2220                                        ? AMDGPU::SRC_SHARED_BASE
2221                                        : AMDGPU::SRC_PRIVATE_BASE;
2222     // FIXME: It would be more natural to emit a COPY here, but then copy
2223     // coalescing would kick in and it would think it's okay to use the "HI"
2224     // subregister (instead of extracting the HI 32 bits) which is an artificial
2225     // (unusable) register.
2226     //  Register TableGen definitions would need an overhaul to get rid of the
2227     //  artificial "HI" aperture registers and prevent this kind of issue from
2228     //  happening.
2229     Register Dst = MRI.createGenericVirtualRegister(S64);
2230     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2231     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2232     return B.buildUnmerge(S32, Dst).getReg(1);
2233   }
2234 
2235   // TODO: can we be smarter about machine pointer info?
2236   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2237   Register LoadAddr = MRI.createGenericVirtualRegister(
2238     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2239   // For code object version 5, private_base and shared_base are passed through
2240   // implicit kernargs.
2241   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2242       AMDGPU::AMDHSA_COV5) {
2243     AMDGPUTargetLowering::ImplicitParameter Param =
2244         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2245                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2246     uint64_t Offset =
2247         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2248 
2249     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2250         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2251 
2252     if (!loadInputValue(KernargPtrReg, B,
2253                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2254       return Register();
2255 
2256     MachineMemOperand *MMO = MF.getMachineMemOperand(
2257         PtrInfo,
2258         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2259             MachineMemOperand::MOInvariant,
2260         LLT::scalar(32), commonAlignment(Align(64), Offset));
2261 
2262     // Pointer address
2263     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2264                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2265     // Load address
2266     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2267   }
2268 
2269   Register QueuePtr = MRI.createGenericVirtualRegister(
2270     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2271 
2272   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2273     return Register();
2274 
2275   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2276   // private_segment_aperture_base_hi.
2277   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2278 
2279   MachineMemOperand *MMO = MF.getMachineMemOperand(
2280       PtrInfo,
2281       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2282           MachineMemOperand::MOInvariant,
2283       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2284 
2285   B.buildPtrAdd(LoadAddr, QueuePtr,
2286                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2287   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2288 }
2289 
2290 /// Return true if the value is a known valid address, such that a null check is
2291 /// not necessary.
2292 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2293                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2294   MachineInstr *Def = MRI.getVRegDef(Val);
2295   switch (Def->getOpcode()) {
2296   case AMDGPU::G_FRAME_INDEX:
2297   case AMDGPU::G_GLOBAL_VALUE:
2298   case AMDGPU::G_BLOCK_ADDR:
2299     return true;
2300   case AMDGPU::G_CONSTANT: {
2301     const ConstantInt *CI = Def->getOperand(1).getCImm();
2302     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2303   }
2304   default:
2305     return false;
2306   }
2307 
2308   return false;
2309 }
2310 
2311 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2312   MachineInstr &MI, MachineRegisterInfo &MRI,
2313   MachineIRBuilder &B) const {
2314   MachineFunction &MF = B.getMF();
2315 
2316   // MI can either be a G_ADDRSPACE_CAST or a
2317   // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2318   assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2319          (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2320                                      Intrinsic::amdgcn_addrspacecast_nonnull));
2321 
2322   const LLT S32 = LLT::scalar(32);
2323   Register Dst = MI.getOperand(0).getReg();
2324   Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2325                                      : MI.getOperand(1).getReg();
2326   LLT DstTy = MRI.getType(Dst);
2327   LLT SrcTy = MRI.getType(Src);
2328   unsigned DestAS = DstTy.getAddressSpace();
2329   unsigned SrcAS = SrcTy.getAddressSpace();
2330 
2331   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2332   // vector element.
2333   assert(!DstTy.isVector());
2334 
2335   const AMDGPUTargetMachine &TM
2336     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2337 
2338   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2339     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2340     return true;
2341   }
2342 
2343   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2344       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2345        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2346     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2347     // G_ADDRSPACE_CAST we need to guess.
2348     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2349       // Extract low 32-bits of the pointer.
2350       B.buildExtract(Dst, Src, 0);
2351       MI.eraseFromParent();
2352       return true;
2353     }
2354 
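         // Otherwise compare the source against the flat null value and select the
         // segment null value when it is null, and the low 32 bits of the pointer
         // otherwise.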
2355     unsigned NullVal = TM.getNullPointerValue(DestAS);
2356 
2357     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2358     auto FlatNull = B.buildConstant(SrcTy, 0);
2359 
2360     // Extract low 32-bits of the pointer.
2361     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2362 
2363     auto CmpRes =
2364         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2365     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2366 
2367     MI.eraseFromParent();
2368     return true;
2369   }
2370 
2371   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2372       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2373        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2374     auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2375       Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2376       if (!ApertureReg.isValid())
2377         return Register();
2378 
2379       // Coerce the type of the low half of the result so we can use
2380       // merge_values.
2381       Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2382 
2383       // TODO: Should we allow mismatched types but matching sizes in merges to
2384       // avoid the ptrtoint?
2385       return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2386     };
2387 
2388     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2389     // G_ADDRSPACE_CAST we need to guess.
2390     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2391       castLocalOrPrivateToFlat(Dst);
2392       MI.eraseFromParent();
2393       return true;
2394     }
2395 
2396     Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2397 
2398     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2399     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2400 
2401     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2402                               SegmentNull.getReg(0));
2403 
2404     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2405 
2406     MI.eraseFromParent();
2407     return true;
2408   }
2409 
2410   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2411       SrcTy.getSizeInBits() == 64) {
2412     // Truncate.
2413     B.buildExtract(Dst, Src, 0);
2414     MI.eraseFromParent();
2415     return true;
2416   }
2417 
2418   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2419       DstTy.getSizeInBits() == 64) {
2420     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2421     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2422     auto PtrLo = B.buildPtrToInt(S32, Src);
2423     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2424     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2425     MI.eraseFromParent();
2426     return true;
2427   }
2428 
2429   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2430       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2431 
2432   LLVMContext &Ctx = MF.getFunction().getContext();
2433   Ctx.diagnose(InvalidAddrSpaceCast);
2434   B.buildUndef(Dst);
2435   MI.eraseFromParent();
2436   return true;
2437 }
2438 
2439 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2440                                              MachineRegisterInfo &MRI,
2441                                              MachineIRBuilder &B) const {
2442   Register Src = MI.getOperand(1).getReg();
2443   LLT Ty = MRI.getType(Src);
2444   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2445 
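       // Round to nearest even by adding and then subtracting a copy-signed 2^52;
       // the f64 addition discards the fractional bits. Inputs whose magnitude
       // exceeds 0x1.fffffffffffffp+51 are already integral, so the original source
       // is selected for those.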
2446   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2447   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2448 
2449   auto C1 = B.buildFConstant(Ty, C1Val);
2450   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2451 
2452   // TODO: Should this propagate fast-math-flags?
2453   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2454   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2455 
2456   auto C2 = B.buildFConstant(Ty, C2Val);
2457   auto Fabs = B.buildFAbs(Ty, Src);
2458 
2459   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2460   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2461   MI.eraseFromParent();
2462   return true;
2463 }
2464 
2465 bool AMDGPULegalizerInfo::legalizeFceil(
2466   MachineInstr &MI, MachineRegisterInfo &MRI,
2467   MachineIRBuilder &B) const {
2468 
2469   const LLT S1 = LLT::scalar(1);
2470   const LLT S64 = LLT::scalar(64);
2471 
2472   Register Src = MI.getOperand(1).getReg();
2473   assert(MRI.getType(Src) == S64);
2474 
2475   // result = trunc(src)
2476   // if (src > 0.0 && src != result)
2477   //   result += 1.0
2478 
2479   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2480 
2481   const auto Zero = B.buildFConstant(S64, 0.0);
2482   const auto One = B.buildFConstant(S64, 1.0);
2483   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2484   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2485   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2486   auto Add = B.buildSelect(S64, And, One, Zero);
2487 
2488   // TODO: Should this propagate fast-math-flags?
2489   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2490   MI.eraseFromParent();
2491   return true;
2492 }
2493 
2494 bool AMDGPULegalizerInfo::legalizeFrem(
2495   MachineInstr &MI, MachineRegisterInfo &MRI,
2496   MachineIRBuilder &B) const {
2497     Register DstReg = MI.getOperand(0).getReg();
2498     Register Src0Reg = MI.getOperand(1).getReg();
2499     Register Src1Reg = MI.getOperand(2).getReg();
2500     auto Flags = MI.getFlags();
2501     LLT Ty = MRI.getType(DstReg);
2502 
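         // frem(x, y) == x - trunc(x / y) * y, computed here as
         // fma(-trunc(x / y), y, x).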
2503     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2504     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2505     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2506     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2507     MI.eraseFromParent();
2508     return true;
2509 }
2510 
2511 static MachineInstrBuilder extractF64Exponent(Register Hi,
2512                                               MachineIRBuilder &B) {
2513   const unsigned FractBits = 52;
2514   const unsigned ExpBits = 11;
2515   LLT S32 = LLT::scalar(32);
2516 
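       // The 11-bit exponent field of an f64 occupies bits [20, 30] of the high
       // 32-bit word. Extract it with ubfe and subtract the bias (1023) to get the
       // unbiased exponent.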
2517   auto Const0 = B.buildConstant(S32, FractBits - 32);
2518   auto Const1 = B.buildConstant(S32, ExpBits);
2519 
2520   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2521                      .addUse(Hi)
2522                      .addUse(Const0.getReg(0))
2523                      .addUse(Const1.getReg(0));
2524 
2525   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2526 }
2527 
2528 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2529   MachineInstr &MI, MachineRegisterInfo &MRI,
2530   MachineIRBuilder &B) const {
2531   const LLT S1 = LLT::scalar(1);
2532   const LLT S32 = LLT::scalar(32);
2533   const LLT S64 = LLT::scalar(64);
2534 
2535   Register Src = MI.getOperand(1).getReg();
2536   assert(MRI.getType(Src) == S64);
2537 
2538   // TODO: Should this use extract since the low half is unused?
2539   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2540   Register Hi = Unmerge.getReg(1);
2541 
2542   // Extract the upper half, since this is where we will find the sign and
2543   // exponent.
2544   auto Exp = extractF64Exponent(Hi, B);
2545 
2546   const unsigned FractBits = 52;
2547 
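       // Truncate by clearing the fraction bits below the binary point: shift the
       // 52-bit fraction mask right by the unbiased exponent and mask those bits
       // out of the source. An exponent < 0 means |src| < 1, so a signed zero is
       // selected instead; an exponent > 51 means src is already an integer and is
       // passed through unchanged.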
2548   // Extract the sign bit.
2549   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2550   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2551 
2552   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2553 
2554   const auto Zero32 = B.buildConstant(S32, 0);
2555 
2556   // Extend back to 64-bits.
2557   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2558 
2559   auto Shr = B.buildAShr(S64, FractMask, Exp);
2560   auto Not = B.buildNot(S64, Shr);
2561   auto Tmp0 = B.buildAnd(S64, Src, Not);
2562   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2563 
2564   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2565   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2566 
2567   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2568   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2569   MI.eraseFromParent();
2570   return true;
2571 }
2572 
2573 bool AMDGPULegalizerInfo::legalizeITOFP(
2574   MachineInstr &MI, MachineRegisterInfo &MRI,
2575   MachineIRBuilder &B, bool Signed) const {
2576 
2577   Register Dst = MI.getOperand(0).getReg();
2578   Register Src = MI.getOperand(1).getReg();
2579 
2580   const LLT S64 = LLT::scalar(64);
2581   const LLT S32 = LLT::scalar(32);
2582 
2583   assert(MRI.getType(Src) == S64);
2584 
2585   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2586   auto ThirtyTwo = B.buildConstant(S32, 32);
2587 
2588   if (MRI.getType(Dst) == S64) {
2589     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2590                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2591 
2592     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2593     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2594 
2595     // TODO: Should this propagate fast-math-flags?
2596     B.buildFAdd(Dst, LdExp, CvtLo);
2597     MI.eraseFromParent();
2598     return true;
2599   }
2600 
2601   assert(MRI.getType(Dst) == S32);
2602 
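       // For a 32-bit result, shift the 64-bit source left so its significant bits
       // start in the high word, convert that word, OR any nonzero discarded low
       // bits into bit 0 as a sticky bit so rounding remains correct, and scale the
       // result back with ldexp.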
2603   auto One = B.buildConstant(S32, 1);
2604 
2605   MachineInstrBuilder ShAmt;
2606   if (Signed) {
2607     auto ThirtyOne = B.buildConstant(S32, 31);
2608     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2609     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2610     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2611     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2612                   .addUse(Unmerge.getReg(1));
2613     auto LS2 = B.buildSub(S32, LS, One);
2614     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2615   } else
2616     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2617   auto Norm = B.buildShl(S64, Src, ShAmt);
2618   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2619   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2620   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2621   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2622   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2623   B.buildFLdexp(Dst, FVal, Scale);
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
2628 // TODO: Copied from DAG implementation. Verify logic and document how this
2629 // actually works.
2630 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2631                                         MachineRegisterInfo &MRI,
2632                                         MachineIRBuilder &B,
2633                                         bool Signed) const {
2634 
2635   Register Dst = MI.getOperand(0).getReg();
2636   Register Src = MI.getOperand(1).getReg();
2637 
2638   const LLT S64 = LLT::scalar(64);
2639   const LLT S32 = LLT::scalar(32);
2640 
2641   const LLT SrcLT = MRI.getType(Src);
2642   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2643 
2644   unsigned Flags = MI.getFlags();
2645 
2646   // The basic idea of converting a floating point number into a pair of 32-bit
2647   // integers is illustrated as follows:
2648   //
2649   //     tf := trunc(val);
2650   //    hif := floor(tf * 2^-32);
2651   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2652   //     hi := fptoi(hif);
2653   //     lo := fptoi(lof);
2654   //
2655   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2656   MachineInstrBuilder Sign;
2657   if (Signed && SrcLT == S32) {
2658     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2659     // is not enough to hold all the significant bits of `lof` if val is
2660     // negative. To avoid the loss of precision, we need to take the absolute
2661     // value after truncating and flip the result back based on the original
2662     // signedness.
2663     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2664     Trunc = B.buildFAbs(S32, Trunc, Flags);
2665   }
2666   MachineInstrBuilder K0, K1;
2667   if (SrcLT == S64) {
2668     K0 = B.buildFConstant(
2669         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2670     K1 = B.buildFConstant(
2671         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2672   } else {
2673     K0 = B.buildFConstant(
2674         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2675     K1 = B.buildFConstant(
2676         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2677   }
2678 
2679   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2680   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2681   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2682 
2683   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2684                                      : B.buildFPTOUI(S32, FloorMul);
2685   auto Lo = B.buildFPTOUI(S32, Fma);
2686 
2687   if (Signed && SrcLT == S32) {
2688     // Flip the result based on the signedness, which is either all 0s or 1s.
2689     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2690     // r := xor({lo, hi}, sign) - sign;
2691     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2692                Sign);
2693   } else
2694     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2695   MI.eraseFromParent();
2696 
2697   return true;
2698 }
2699 
2700 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2701                                                MachineInstr &MI) const {
2702   MachineFunction &MF = Helper.MIRBuilder.getMF();
2703   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2704 
2705   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2706                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2707 
2708   // With ieee_mode disabled, the instructions have the correct behavior
2709   // already for G_FMINNUM/G_FMAXNUM
2710   if (!MFI->getMode().IEEE)
2711     return !IsIEEEOp;
2712 
2713   if (IsIEEEOp)
2714     return true;
2715 
2716   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2717 }
2718 
2719 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2720   MachineInstr &MI, MachineRegisterInfo &MRI,
2721   MachineIRBuilder &B) const {
2722   // TODO: Should move some of this into LegalizerHelper.
2723 
2724   // TODO: Promote dynamic indexing of s16 to s32
2725 
2726   Register Dst = MI.getOperand(0).getReg();
2727   Register Vec = MI.getOperand(1).getReg();
2728 
2729   LLT VecTy = MRI.getType(Vec);
2730   LLT EltTy = VecTy.getElementType();
2731   assert(EltTy == MRI.getType(Dst));
2732 
2733   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2734   // but we can't go directly to that logic because you can't bitcast a vector
2735   // of pointers to a vector of integers. Therefore, introduce an intermediate
2736   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2737   // drive the legalization forward.
2738   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2739     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2740     LLT IntVecTy = VecTy.changeElementType(IntTy);
2741 
2742     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2743     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2744     B.buildIntToPtr(Dst, IntElt);
2745 
2746     MI.eraseFromParent();
2747     return true;
2748   }
2749 
2750   // FIXME: Artifact combiner probably should have replaced the truncated
2751   // constant before this, so we shouldn't need
2752   // getIConstantVRegValWithLookThrough.
2753   std::optional<ValueAndVReg> MaybeIdxVal =
2754       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2755   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2756     return true;
2757   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2758 
2759   if (IdxVal < VecTy.getNumElements()) {
2760     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2761     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2762   } else {
2763     B.buildUndef(Dst);
2764   }
2765 
2766   MI.eraseFromParent();
2767   return true;
2768 }
2769 
2770 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2771   MachineInstr &MI, MachineRegisterInfo &MRI,
2772   MachineIRBuilder &B) const {
2773   // TODO: Should move some of this into LegalizerHelper.
2774 
2775   // TODO: Promote dynamic indexing of s16 to s32
2776 
2777   Register Dst = MI.getOperand(0).getReg();
2778   Register Vec = MI.getOperand(1).getReg();
2779   Register Ins = MI.getOperand(2).getReg();
2780 
2781   LLT VecTy = MRI.getType(Vec);
2782   LLT EltTy = VecTy.getElementType();
2783   assert(EltTy == MRI.getType(Ins));
2784 
2785   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2786   // but we can't go directly to that logic because you can't bitcast a vector
2787   // of pointers to a vector of integers. Therefore, make the pointer vector
2788   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2789   // new value, and then inttoptr the result vector back. This will then allow
2790   // the rest of legalization to take over.
2791   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2792     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2793     LLT IntVecTy = VecTy.changeElementType(IntTy);
2794 
2795     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2796     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2797     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2798                                                  MI.getOperand(3));
2799     B.buildIntToPtr(Dst, IntVecDest);
2800     MI.eraseFromParent();
2801     return true;
2802   }
2803 
2804   // FIXME: Artifact combiner probably should have replaced the truncated
2805   // constant before this, so we shouldn't need
2806   // getIConstantVRegValWithLookThrough.
2807   std::optional<ValueAndVReg> MaybeIdxVal =
2808       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2809   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2810     return true;
2811 
2812   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2813 
2814   unsigned NumElts = VecTy.getNumElements();
2815   if (IdxVal < NumElts) {
2816     SmallVector<Register, 8> SrcRegs;
2817     for (unsigned i = 0; i < NumElts; ++i)
2818       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2819     B.buildUnmerge(SrcRegs, Vec);
2820 
2821     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2822     B.buildMergeLikeInstr(Dst, SrcRegs);
2823   } else {
2824     B.buildUndef(Dst);
2825   }
2826 
2827   MI.eraseFromParent();
2828   return true;
2829 }
2830 
2831 bool AMDGPULegalizerInfo::legalizeSinCos(
2832   MachineInstr &MI, MachineRegisterInfo &MRI,
2833   MachineIRBuilder &B) const {
2834 
2835   Register DstReg = MI.getOperand(0).getReg();
2836   Register SrcReg = MI.getOperand(1).getReg();
2837   LLT Ty = MRI.getType(DstReg);
2838   unsigned Flags = MI.getFlags();
2839 
2840   Register TrigVal;
2841   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2842   if (ST.hasTrigReducedRange()) {
2843     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2844     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2845                   .addUse(MulVal.getReg(0))
2846                   .setMIFlags(Flags)
2847                   .getReg(0);
2848   } else
2849     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2850 
2851   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2852     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2853   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2854       .addUse(TrigVal)
2855       .setMIFlags(Flags);
2856   MI.eraseFromParent();
2857   return true;
2858 }
2859 
2860 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2861                                                   MachineIRBuilder &B,
2862                                                   const GlobalValue *GV,
2863                                                   int64_t Offset,
2864                                                   unsigned GAFlags) const {
2865   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2866   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2867   // to the following code sequence:
2868   //
2869   // For constant address space:
2870   //   s_getpc_b64 s[0:1]
2871   //   s_add_u32 s0, s0, $symbol
2872   //   s_addc_u32 s1, s1, 0
2873   //
2874   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2875   //   a fixup or relocation is emitted to replace $symbol with a literal
2876   //   constant, which is a pc-relative offset from the encoding of the $symbol
2877   //   operand to the global variable.
2878   //
2879   // For global address space:
2880   //   s_getpc_b64 s[0:1]
2881   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2882   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2883   //
2884   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2885   //   fixups or relocations are emitted to replace $symbol@*@lo and
2886   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2887   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2888   //   operand to the global variable.
2889 
2890   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2891 
2892   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2893     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2894 
2895   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2896     .addDef(PCReg);
2897 
2898   MIB.addGlobalAddress(GV, Offset, GAFlags);
2899   if (GAFlags == SIInstrInfo::MO_NONE)
2900     MIB.addImm(0);
2901   else
2902     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2903 
2904   if (!B.getMRI()->getRegClassOrNull(PCReg))
2905     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2906 
2907   if (PtrTy.getSizeInBits() == 32)
2908     B.buildExtract(DstReg, PCReg, 0);
2909   return true;
2910 }
2911 
2912 // Emit an ABS32_LO / ABS32_HI relocation stub.
2913 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2914     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2915     MachineRegisterInfo &MRI) const {
2916   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2917 
2918   LLT S32 = LLT::scalar(32);
2919 
2920   // Use the destination register directly if we only store the lower address
2921   // part and no register class has been set on it.
2922   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2923                         ? DstReg
2924                         : MRI.createGenericVirtualRegister(S32);
2925 
2926   if (!MRI.getRegClassOrNull(AddrLo))
2927     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2928 
2929   // Write the lower half.
2930   B.buildInstr(AMDGPU::S_MOV_B32)
2931       .addDef(AddrLo)
2932       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2933 
2934   // If required, write the upper half as well.
2935   if (RequiresHighHalf) {
2936     assert(PtrTy.getSizeInBits() == 64 &&
2937            "Must provide a 64-bit pointer type!");
2938 
2939     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2940     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2941 
2942     B.buildInstr(AMDGPU::S_MOV_B32)
2943         .addDef(AddrHi)
2944         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2945 
2946     // Use the destination directly, if and only if we don't have a register
2947     // class being set.
2948     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2949                            ? DstReg
2950                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2951 
2952     if (!MRI.getRegClassOrNull(AddrDst))
2953       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2954 
2955     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2956 
2957     // If we created a new register for the destination, cast the result into
2958     // the final output.
2959     if (AddrDst != DstReg)
2960       B.buildCast(DstReg, AddrDst);
2961   } else if (AddrLo != DstReg) {
2962     // If we created a new register for the destination, cast the result into
2963     // the final output.
2964     B.buildCast(DstReg, AddrLo);
2965   }
2966 }
2967 
2968 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2969   MachineInstr &MI, MachineRegisterInfo &MRI,
2970   MachineIRBuilder &B) const {
2971   Register DstReg = MI.getOperand(0).getReg();
2972   LLT Ty = MRI.getType(DstReg);
2973   unsigned AS = Ty.getAddressSpace();
2974 
2975   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2976   MachineFunction &MF = B.getMF();
2977   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2978 
2979   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2980     if (!MFI->isModuleEntryFunction() &&
2981         GV->getName() != "llvm.amdgcn.module.lds" &&
2982         !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
2983       const Function &Fn = MF.getFunction();
2984       DiagnosticInfoUnsupported BadLDSDecl(
2985         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2986         DS_Warning);
2987       Fn.getContext().diagnose(BadLDSDecl);
2988 
2989       // We currently don't have a way to correctly allocate LDS objects that
2990       // aren't directly associated with a kernel. We do force inlining of
2991       // functions that use local objects. However, if these dead functions are
2992       // not eliminated, we don't want a compile time error. Just emit a warning
2993       // and a trap, since there should be no callable path here.
2994       B.buildTrap();
2995       B.buildUndef(DstReg);
2996       MI.eraseFromParent();
2997       return true;
2998     }
2999 
3000     // TODO: We could emit code to handle the initialization somewhere.
3001     // We ignore the initializer for now and legalize it to allow selection.
3002     // The initializer is diagnosed as an error at assembly emission anyway.
3003     const SITargetLowering *TLI = ST.getTargetLowering();
3004     if (!TLI->shouldUseLDSConstAddress(GV)) {
3005       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3006       return true; // Leave in place;
3007     }
3008 
3009     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3010       Type *Ty = GV->getValueType();
3011       // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
3012       // zero-sized type in other languages) to declare dynamic shared
3013       // memory whose size is not known at compile time. It is allocated by
3014       // the runtime and placed directly after the statically allocated
3015       // memory, so every such declaration resolves to the same offset.
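      //
      // For example (illustrative numbers): if the kernel has 256 bytes of
      // statically allocated LDS, llvm.amdgcn.groupstaticsize evaluates to
      // 256 and every dynamic shared declaration resolves to that offset.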
3016       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3017         // Adjust alignment for that dynamic shared memory array.
3018         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3019         LLT S32 = LLT::scalar(32);
3020         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3021         B.buildIntToPtr(DstReg, Sz);
3022         MI.eraseFromParent();
3023         return true;
3024       }
3025     }
3026 
3027     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3028                                                    *cast<GlobalVariable>(GV)));
3029     MI.eraseFromParent();
3030     return true;
3031   }
3032 
3033   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3034     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3035     MI.eraseFromParent();
3036     return true;
3037   }
3038 
3039   const SITargetLowering *TLI = ST.getTargetLowering();
3040 
3041   if (TLI->shouldEmitFixup(GV)) {
3042     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3043     MI.eraseFromParent();
3044     return true;
3045   }
3046 
3047   if (TLI->shouldEmitPCReloc(GV)) {
3048     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3049     MI.eraseFromParent();
3050     return true;
3051   }
3052 
3053   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3054   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3055 
3056   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3057   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3058       MachinePointerInfo::getGOT(MF),
3059       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3060           MachineMemOperand::MOInvariant,
3061       LoadTy, Align(8));
3062 
3063   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3064 
3065   if (Ty.getSizeInBits() == 32) {
3066     // Truncate if this is a 32-bit constant address.
3067     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3068     B.buildExtract(DstReg, Load, 0);
3069   } else
3070     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3071 
3072   MI.eraseFromParent();
3073   return true;
3074 }
3075 
3076 static LLT widenToNextPowerOf2(LLT Ty) {
3077   if (Ty.isVector())
3078     return Ty.changeElementCount(
3079         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3080   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3081 }
3082 
3083 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3084                                        MachineInstr &MI) const {
3085   MachineIRBuilder &B = Helper.MIRBuilder;
3086   MachineRegisterInfo &MRI = *B.getMRI();
3087   GISelChangeObserver &Observer = Helper.Observer;
3088 
3089   Register PtrReg = MI.getOperand(1).getReg();
3090   LLT PtrTy = MRI.getType(PtrReg);
3091   unsigned AddrSpace = PtrTy.getAddressSpace();
3092 
3093   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3094     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3095     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3096     Observer.changingInstr(MI);
3097     MI.getOperand(1).setReg(Cast.getReg(0));
3098     Observer.changedInstr(MI);
3099     return true;
3100   }
3101 
3102   if (MI.getOpcode() != AMDGPU::G_LOAD)
3103     return false;
3104 
3105   Register ValReg = MI.getOperand(0).getReg();
3106   LLT ValTy = MRI.getType(ValReg);
3107 
3108   if (hasBufferRsrcWorkaround(ValTy)) {
3109     Observer.changingInstr(MI);
3110     castBufferRsrcFromV4I32(MI, B, MRI, 0);
3111     Observer.changedInstr(MI);
3112     return true;
3113   }
3114 
3115   MachineMemOperand *MMO = *MI.memoperands_begin();
3116   const unsigned ValSize = ValTy.getSizeInBits();
3117   const LLT MemTy = MMO->getMemoryType();
3118   const Align MemAlign = MMO->getAlign();
3119   const unsigned MemSize = MemTy.getSizeInBits();
3120   const uint64_t AlignInBits = 8 * MemAlign.value();
3121 
3122   // Widen non-power-of-2 loads to the alignment if needed
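  // (For example, an s96 load that is known to be 16-byte aligned may be
  // widened to an s128 load here; the extra bits are dropped again below.
  // Illustrative only; the exact decision is made by shouldWidenLoad.)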
3123   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3124     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3125 
3126     // This was already the correct extending load result type, so just adjust
3127     // the memory type.
3128     if (WideMemSize == ValSize) {
3129       MachineFunction &MF = B.getMF();
3130 
3131       MachineMemOperand *WideMMO =
3132           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3133       Observer.changingInstr(MI);
3134       MI.setMemRefs(MF, {WideMMO});
3135       Observer.changedInstr(MI);
3136       return true;
3137     }
3138 
3139     // Don't bother with an edge case that should probably never be produced.
3140     if (ValSize > WideMemSize)
3141       return false;
3142 
3143     LLT WideTy = widenToNextPowerOf2(ValTy);
3144 
3145     Register WideLoad;
3146     if (!WideTy.isVector()) {
3147       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3148       B.buildTrunc(ValReg, WideLoad).getReg(0);
3149     } else {
3150       // Extract the subvector.
3151 
3152       if (isRegisterType(ValTy)) {
3153         // If this is a case where G_EXTRACT is legal, use it.
3154         // (e.g. <3 x s32> -> <4 x s32>)
3155         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3156         B.buildExtract(ValReg, WideLoad, 0);
3157       } else {
3158         // For cases where the widened type isn't a nice register value, unmerge
3159         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3160         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3161         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3162       }
3163     }
3164 
3165     MI.eraseFromParent();
3166     return true;
3167   }
3168 
3169   return false;
3170 }
3171 
3172 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3173                                         MachineInstr &MI) const {
3174   MachineIRBuilder &B = Helper.MIRBuilder;
3175   MachineRegisterInfo &MRI = *B.getMRI();
3176   GISelChangeObserver &Observer = Helper.Observer;
3177 
3178   Register DataReg = MI.getOperand(0).getReg();
3179   LLT DataTy = MRI.getType(DataReg);
3180 
3181   if (hasBufferRsrcWorkaround(DataTy)) {
3182     Observer.changingInstr(MI);
3183     castBufferRsrcArgToV4I32(MI, B, 0);
3184     Observer.changedInstr(MI);
3185     return true;
3186   }
3187   return false;
3188 }
3189 
3190 bool AMDGPULegalizerInfo::legalizeFMad(
3191   MachineInstr &MI, MachineRegisterInfo &MRI,
3192   MachineIRBuilder &B) const {
3193   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3194   assert(Ty.isScalar());
3195 
3196   MachineFunction &MF = B.getMF();
3197   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3198 
3199   // TODO: Always legal with future ftz flag.
3200   // FIXME: Do we need just output?
3201   if (Ty == LLT::float32() &&
3202       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3203     return true;
3204   if (Ty == LLT::float16() &&
3205       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3206     return true;
3207 
3208   MachineIRBuilder HelperBuilder(MI);
3209   GISelObserverWrapper DummyObserver;
3210   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3211   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3212 }
3213 
3214 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3215   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3216   Register DstReg = MI.getOperand(0).getReg();
3217   Register PtrReg = MI.getOperand(1).getReg();
3218   Register CmpVal = MI.getOperand(2).getReg();
3219   Register NewVal = MI.getOperand(3).getReg();
3220 
3221   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3222          "this should not have been custom lowered");
3223 
3224   LLT ValTy = MRI.getType(CmpVal);
3225   LLT VecTy = LLT::fixed_vector(2, ValTy);
3226 
3227   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3228 
3229   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3230     .addDef(DstReg)
3231     .addUse(PtrReg)
3232     .addUse(PackedVal)
3233     .setMemRefs(MI.memoperands());
3234 
3235   MI.eraseFromParent();
3236   return true;
3237 }
3238 
3239 /// Return true if it's known that \p Src can never be an f32 denormal value.
3240 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3241                                        Register Src) {
3242   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3243   switch (DefMI->getOpcode()) {
3244   case TargetOpcode::G_INTRINSIC: {
3245     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3246     case Intrinsic::amdgcn_frexp_mant:
3247       return true;
3248     default:
3249       break;
3250     }
3251 
3252     break;
3253   }
3254   case TargetOpcode::G_FFREXP: {
3255     if (DefMI->getOperand(0).getReg() == Src)
3256       return true;
3257     break;
3258   }
3259   case TargetOpcode::G_FPEXT: {
3260     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3261   }
3262   default:
3263     return false;
3264   }
3265 
3266   return false;
3267 }
3268 
3269 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3270   if (Flags & MachineInstr::FmAfn)
3271     return true;
3272   const auto &Options = MF.getTarget().Options;
3273   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3274 }
3275 
3276 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3277                                    unsigned Flags) {
3278   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3279          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3280              DenormalMode::PreserveSign;
3281 }
3282 
3283 std::pair<Register, Register>
3284 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3285                                        unsigned Flags) const {
3286   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3287     return {};
3288 
3289   const LLT F32 = LLT::scalar(32);
3290   auto SmallestNormal = B.buildFConstant(
3291       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3292   auto IsLtSmallestNormal =
3293       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3294 
3295   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3296   auto One = B.buildFConstant(F32, 1.0);
3297   auto ScaleFactor =
3298       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3299   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3300 
3301   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3302 }
3303 
3304 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3305                                         MachineIRBuilder &B) const {
3306   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3307   // If we have to handle denormals, scale up the input and adjust the result.
3308 
3309   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3310   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
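  //
  // This works because log2(x * 2^32) == log2(x) + 32: scaling a denormal
  // input by 2^32 makes it a normal value, and subtracting 32.0 from the
  // hardware log2 of the scaled value recovers the desired result.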
3311 
3312   Register Dst = MI.getOperand(0).getReg();
3313   Register Src = MI.getOperand(1).getReg();
3314   LLT Ty = B.getMRI()->getType(Dst);
3315   unsigned Flags = MI.getFlags();
3316 
3317   if (Ty == LLT::scalar(16)) {
3318     const LLT F32 = LLT::scalar(32);
3319     // Nothing in half is a denormal when promoted to f32.
3320     auto Ext = B.buildFPExt(F32, Src, Flags);
3321     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3322                     .addUse(Ext.getReg(0))
3323                     .setMIFlags(Flags);
3324     B.buildFPTrunc(Dst, Log2, Flags);
3325     MI.eraseFromParent();
3326     return true;
3327   }
3328 
3329   assert(Ty == LLT::scalar(32));
3330 
3331   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3332   if (!ScaledInput) {
3333     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3334         .addUse(Src)
3335         .setMIFlags(Flags);
3336     MI.eraseFromParent();
3337     return true;
3338   }
3339 
3340   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3341                   .addUse(ScaledInput)
3342                   .setMIFlags(Flags);
3343 
3344   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3345   auto Zero = B.buildFConstant(Ty, 0.0);
3346   auto ResultOffset =
3347       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3348   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3349 
3350   MI.eraseFromParent();
3351   return true;
3352 }
3353 
3354 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3355                        Register Z, unsigned Flags) {
3356   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3357   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3358 }
3359 
3360 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3361                                              MachineIRBuilder &B) const {
3362   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3363   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3364 
3365   MachineRegisterInfo &MRI = *B.getMRI();
3366   Register Dst = MI.getOperand(0).getReg();
3367   Register X = MI.getOperand(1).getReg();
3368   unsigned Flags = MI.getFlags();
3369   const LLT Ty = MRI.getType(X);
3370   MachineFunction &MF = B.getMF();
3371 
3372   const LLT F32 = LLT::scalar(32);
3373   const LLT F16 = LLT::scalar(16);
3374 
3375   const AMDGPUTargetMachine &TM =
3376       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3377 
3378   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3379       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3380     if (Ty == F16 && !ST.has16BitInsts()) {
3381       Register LogVal = MRI.createGenericVirtualRegister(F32);
3382       auto PromoteSrc = B.buildFPExt(F32, X);
3383       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3384       B.buildFPTrunc(Dst, LogVal);
3385     } else {
3386       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3387     }
3388 
3389     MI.eraseFromParent();
3390     return true;
3391   }
3392 
3393   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3394   if (ScaledInput)
3395     X = ScaledInput;
3396 
3397   auto Y =
3398       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3399 
3400   Register R;
3401   if (ST.hasFastFMAF32()) {
3402     // c+cc are ln(2)/ln(10) to more than 49 bits
3403     const float c_log10 = 0x1.344134p-2f;
3404     const float cc_log10 = 0x1.09f79ep-26f;
3405 
3406     // c + cc is ln(2) to more than 49 bits
3407     const float c_log = 0x1.62e42ep-1f;
3408     const float cc_log = 0x1.efa39ep-25f;
3409 
3410     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3411     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3412 
3413     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3414     auto NegR = B.buildFNeg(Ty, R, Flags);
3415     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3416     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3417     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3418   } else {
3419     // ch+ct is ln(2)/ln(10) to more than 36 bits
3420     const float ch_log10 = 0x1.344000p-2f;
3421     const float ct_log10 = 0x1.3509f6p-18f;
3422 
3423     // ch + ct is ln(2) to more than 36 bits
3424     const float ch_log = 0x1.62e000p-1f;
3425     const float ct_log = 0x1.0bfbe8p-15f;
3426 
3427     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3428     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3429 
3430     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3431     auto YH = B.buildAnd(Ty, Y, MaskConst);
3432     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3433     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3434 
3435     Register Mad0 =
3436         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3437     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3438     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3439   }
3440 
3441   const bool IsFiniteOnly =
3442       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3443       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3444 
3445   if (!IsFiniteOnly) {
3446     // Expand isfinite(x) => fabs(x) < inf
3447     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3448     auto Fabs = B.buildFAbs(Ty, Y);
3449     auto IsFinite =
3450         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3451     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3452   }
3453 
3454   if (ScaledInput) {
3455     auto Zero = B.buildFConstant(Ty, 0.0);
3456     auto ShiftK =
3457         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3458     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3459     B.buildFSub(Dst, R, Shift, Flags);
3460   } else {
3461     B.buildCopy(Dst, R);
3462   }
3463 
3464   MI.eraseFromParent();
3465   return true;
3466 }
3467 
3468 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3469                                              Register Src, bool IsLog10,
3470                                              unsigned Flags) const {
3471   const double Log2BaseInverted =
3472       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3473 
3474   LLT Ty = B.getMRI()->getType(Dst);
3475 
3476   if (Ty == LLT::scalar(32)) {
3477     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3478     if (ScaledInput) {
3479       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3480                         .addUse(Src)
3481                         .setMIFlags(Flags);
3482       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3483       auto Zero = B.buildFConstant(Ty, 0.0);
3484       auto ResultOffset =
3485           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3486       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3487 
3488       if (ST.hasFastFMAF32())
3489         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3490       else {
3491         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3492         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3493       }
3494 
3495       return true;
3496     }
3497   }
3498 
3499   auto Log2Operand = Ty == LLT::scalar(16)
3500                          ? B.buildFLog2(Ty, Src, Flags)
3501                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3502                                .addUse(Src)
3503                                .setMIFlags(Flags);
3504   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3505   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3506   return true;
3507 }
3508 
3509 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3510                                         MachineIRBuilder &B) const {
3511   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3512   // If we have to handle denormals, scale up the input and adjust the result.
3513 
3514   Register Dst = MI.getOperand(0).getReg();
3515   Register Src = MI.getOperand(1).getReg();
3516   unsigned Flags = MI.getFlags();
3517   LLT Ty = B.getMRI()->getType(Dst);
3518   const LLT F16 = LLT::scalar(16);
3519   const LLT F32 = LLT::scalar(32);
3520 
3521   if (Ty == F16) {
3522     // Nothing in half is a denormal when promoted to f32.
3523     auto Ext = B.buildFPExt(F32, Src, Flags);
3524     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3525                     .addUse(Ext.getReg(0))
3526                     .setMIFlags(Flags);
3527     B.buildFPTrunc(Dst, Log2, Flags);
3528     MI.eraseFromParent();
3529     return true;
3530   }
3531 
3532   assert(Ty == F32);
3533 
3534   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3535     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3536         .addUse(Src)
3537         .setMIFlags(Flags);
3538     MI.eraseFromParent();
3539     return true;
3540   }
3541 
3542   // bool needs_scaling = x < -0x1.f80000p+6f;
3543   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
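  //
  // This relies on the identity exp2(x + 64) * 2^-64 == exp2(x): adding 64
  // before the hardware exp2 keeps the intermediate result away from the
  // denormal range for very negative x, and the final multiply undoes it.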
3544 
3545   // -nextafter(128.0, -1)
3546   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3547   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3548                                   RangeCheckConst, Flags);
3549 
3550   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3551   auto Zero = B.buildFConstant(Ty, 0.0);
3552   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3553   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3554 
3555   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3556                   .addUse(AddInput.getReg(0))
3557                   .setMIFlags(Flags);
3558 
3559   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3560   auto One = B.buildFConstant(Ty, 1.0);
3561   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3562   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3563   MI.eraseFromParent();
3564   return true;
3565 }
3566 
3567 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3568                                              Register X, unsigned Flags) const {
3569   LLT Ty = B.getMRI()->getType(Dst);
3570   LLT F32 = LLT::scalar(32);
3571 
3572   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3573     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3574     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3575 
3576     if (Ty == F32) {
3577       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3578         .addUse(Mul.getReg(0))
3579         .setMIFlags(Flags);
3580     } else {
3581       B.buildFExp2(Dst, Mul.getReg(0), Flags);
3582     }
3583 
3584     return true;
3585   }
3586 
3587   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3588   auto NeedsScaling =
3589       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3590   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3591   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3592   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3593 
3594   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3595   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3596 
3597   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3598     .addUse(ExpInput.getReg(0))
3599     .setMIFlags(Flags);
3600 
3601   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3602   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3603   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3604   return true;
3605 }
3606 
3607 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3608                                        MachineIRBuilder &B) const {
3609   Register Dst = MI.getOperand(0).getReg();
3610   Register X = MI.getOperand(1).getReg();
3611   const unsigned Flags = MI.getFlags();
3612   MachineFunction &MF = B.getMF();
3613   MachineRegisterInfo &MRI = *B.getMRI();
3614   LLT Ty = MRI.getType(Dst);
3615   const LLT F16 = LLT::scalar(16);
3616   const LLT F32 = LLT::scalar(32);
3617   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3618 
3619   if (Ty == F16) {
3620     // v_exp_f16 (fmul x, log2e)
3621     if (allowApproxFunc(MF, Flags)) {
3622       // TODO: Does this really require fast?
3623       legalizeFExpUnsafe(B, Dst, X, Flags);
3624       MI.eraseFromParent();
3625       return true;
3626     }
3627 
3628     // exp(f16 x) ->
3629     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3630 
3631     // Nothing in half is a denormal when promoted to f32.
3632     auto Ext = B.buildFPExt(F32, X, Flags);
3633     Register Lowered = MRI.createGenericVirtualRegister(F32);
3634     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3635     B.buildFPTrunc(Dst, Lowered, Flags);
3636     MI.eraseFromParent();
3637     return true;
3638   }
3639 
3640   assert(Ty == F32);
3641 
3642   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3643   // library behavior. Also, is known-not-daz source sufficient?
3644   if (allowApproxFunc(MF, Flags)) {
3645     legalizeFExpUnsafe(B, Dst, X, Flags);
3646     MI.eraseFromParent();
3647     return true;
3648   }
3649 
3650   //    Algorithm:
3651   //
3652   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3653   //
3654   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3655   //    n = 64*m + j,   0 <= j < 64
3656   //
3657   //    e^x = 2^((64*m + j + f)/64)
3658   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3659   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3660   //
3661   //    f = x*(64/ln(2)) - n
3662   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3663   //
3664   //    e^x = (2^m) * (2^(j/64)) * e^r
3665   //
3666   //    (2^(j/64)) is precomputed
3667   //
3668   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3669   //    e^r = 1 + q
3670   //
3671   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3672   //
3673   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3674   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3675   Register PH, PL;
3676 
3677   if (ST.hasFastFMAF32()) {
3678     const float c_exp = numbers::log2ef;
3679     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3680     const float c_exp10 = 0x1.a934f0p+1f;
3681     const float cc_exp10 = 0x1.2f346ep-24f;
3682 
3683     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3684     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3685     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3686     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3687 
3688     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3689     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3690   } else {
3691     const float ch_exp = 0x1.714000p+0f;
3692     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3693 
3694     const float ch_exp10 = 0x1.a92000p+1f;
3695     const float cl_exp10 = 0x1.4f0978p-11f;
3696 
3697     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3698     auto XH = B.buildAnd(Ty, X, MaskConst);
3699     auto XL = B.buildFSub(Ty, X, XH, Flags);
3700 
3701     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3702     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3703 
3704     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3705     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3706 
3707     Register Mad0 =
3708         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3709     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3710   }
3711 
3712   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3713 
3714   // It is unsafe to contract this fsub into the PH multiply.
3715   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3716   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3717   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3718 
3719   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3720                   .addUse(A.getReg(0))
3721                   .setMIFlags(Flags);
3722   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3723 
3724   auto UnderflowCheckConst =
3725       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3726   auto Zero = B.buildFConstant(Ty, 0.0);
3727   auto Underflow =
3728       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3729 
3730   R = B.buildSelect(Ty, Underflow, Zero, R);
3731 
3732   const auto &Options = MF.getTarget().Options;
3733 
3734   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3735     auto OverflowCheckConst =
3736         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3737 
3738     auto Overflow =
3739         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3740     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3741     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3742   }
3743 
3744   B.buildCopy(Dst, R);
3745   MI.eraseFromParent();
3746   return true;
3747 }
3748 
3749 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3750                                        MachineIRBuilder &B) const {
3751   Register Dst = MI.getOperand(0).getReg();
3752   Register Src0 = MI.getOperand(1).getReg();
3753   Register Src1 = MI.getOperand(2).getReg();
3754   unsigned Flags = MI.getFlags();
3755   LLT Ty = B.getMRI()->getType(Dst);
3756   const LLT F16 = LLT::float16();
3757   const LLT F32 = LLT::float32();
3758 
3759   if (Ty == F32) {
3760     auto Log = B.buildFLog2(F32, Src0, Flags);
3761     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3762                    .addUse(Log.getReg(0))
3763                    .addUse(Src1)
3764                    .setMIFlags(Flags);
3765     B.buildFExp2(Dst, Mul, Flags);
3766   } else if (Ty == F16) {
3767     // There's no f16 fmul_legacy, so we need to convert for it.
3768     auto Log = B.buildFLog2(F16, Src0, Flags);
3769     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3770     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3771     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3772                    .addUse(Ext0.getReg(0))
3773                    .addUse(Ext1.getReg(0))
3774                    .setMIFlags(Flags);
3775     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3776   } else
3777     return false;
3778 
3779   MI.eraseFromParent();
3780   return true;
3781 }
3782 
3783 // Find a source register, ignoring any possible source modifiers.
3784 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3785   Register ModSrc = OrigSrc;
3786   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3787     ModSrc = SrcFNeg->getOperand(1).getReg();
3788     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3789       ModSrc = SrcFAbs->getOperand(1).getReg();
3790   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3791     ModSrc = SrcFAbs->getOperand(1).getReg();
3792   return ModSrc;
3793 }
3794 
3795 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3796                                          MachineRegisterInfo &MRI,
3797                                          MachineIRBuilder &B) const {
3798 
3799   const LLT S1 = LLT::scalar(1);
3800   const LLT F64 = LLT::float64();
3801   Register Dst = MI.getOperand(0).getReg();
3802   Register OrigSrc = MI.getOperand(1).getReg();
3803   unsigned Flags = MI.getFlags();
3804   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3805          "this should not have been custom lowered");
3806 
3807   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3808   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3809   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3810   // V_FRACT bug is:
3811   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3812   //
3813   // Convert floor(x) to (x - fract(x))
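  //
  // Note that 0x3fefffffffffffff (used below) is the largest double strictly
  // less than 1.0, so the clamp keeps fract(x) < 1.0 even if the buggy
  // V_FRACT returns exactly 1.0, which the x - fract(x) expansion relies on.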
3814 
3815   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3816                    .addUse(OrigSrc)
3817                    .setMIFlags(Flags);
3818 
3819   // Give source modifier matching some assistance before obscuring a foldable
3820   // pattern.
3821 
3822   // TODO: We can avoid the neg on the fract? The input sign to fract
3823   // shouldn't matter?
3824   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3825 
3826   auto Const =
3827       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3828 
3829   Register Min = MRI.createGenericVirtualRegister(F64);
3830 
3831   // We don't need to concern ourselves with the snan handling difference, so
3832   // use the one which will directly select.
3833   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3834   if (MFI->getMode().IEEE)
3835     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3836   else
3837     B.buildFMinNum(Min, Fract, Const, Flags);
3838 
3839   Register CorrectedFract = Min;
3840   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3841     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3842     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3843   }
3844 
3845   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3846   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3847 
3848   MI.eraseFromParent();
3849   return true;
3850 }
3851 
3852 // Turn an illegal packed v2s16 build vector into bit operations.
3853 // TODO: This should probably be a bitcast action in LegalizerHelper.
3854 bool AMDGPULegalizerInfo::legalizeBuildVector(
3855   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3856   Register Dst = MI.getOperand(0).getReg();
3857   const LLT S32 = LLT::scalar(32);
3858   const LLT S16 = LLT::scalar(16);
3859   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3860 
3861   Register Src0 = MI.getOperand(1).getReg();
3862   Register Src1 = MI.getOperand(2).getReg();
3863 
3864   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3865     assert(MRI.getType(Src0) == S32);
3866     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3867     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3868   }
3869 
3870   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3871   B.buildBitcast(Dst, Merge);
3872 
3873   MI.eraseFromParent();
3874   return true;
3875 }
3876 
3877 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3878 //
3879 // Source and accumulation registers must all be 32 bits wide.
3880 //
3881 // TODO: When the multiply is uniform, we should produce a code sequence
3882 // that is better suited to instruction selection on the SALU. Instead of
3883 // the outer loop going over parts of the result, the outer loop should go
3884 // over parts of one of the factors. This should result in instruction
3885 // selection that makes full use of S_ADDC_U32 instructions.
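//
// As a small illustration (not the exact sequence that gets emitted), a
// 64 x 64 -> 64 bit multiply with 32-bit parts a = a1:a0 and b = b1:b0
// computes:
//   Accum[0] = lo(a0*b0)
//   Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)
// where the lo/hi pair of a0*b0 can come from a single MAD_64_32.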
3886 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3887                                         MutableArrayRef<Register> Accum,
3888                                         ArrayRef<Register> Src0,
3889                                         ArrayRef<Register> Src1,
3890                                         bool UsePartialMad64_32,
3891                                         bool SeparateOddAlignedProducts) const {
3892   // Use (possibly empty) vectors of S1 registers to represent the set of
3893   // carries from one pair of positions to the next.
3894   using Carry = SmallVector<Register, 2>;
3895 
3896   MachineIRBuilder &B = Helper.MIRBuilder;
3897   GISelKnownBits &KB = *Helper.getKnownBits();
3898 
3899   const LLT S1 = LLT::scalar(1);
3900   const LLT S32 = LLT::scalar(32);
3901   const LLT S64 = LLT::scalar(64);
3902 
3903   Register Zero32;
3904   Register Zero64;
3905 
3906   auto getZero32 = [&]() -> Register {
3907     if (!Zero32)
3908       Zero32 = B.buildConstant(S32, 0).getReg(0);
3909     return Zero32;
3910   };
3911   auto getZero64 = [&]() -> Register {
3912     if (!Zero64)
3913       Zero64 = B.buildConstant(S64, 0).getReg(0);
3914     return Zero64;
3915   };
3916 
3917   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3918   for (unsigned i = 0; i < Src0.size(); ++i) {
3919     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3920     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3921   }
3922 
3923   // Merge the given carries into the 32-bit LocalAccum, which is modified
3924   // in-place.
3925   //
3926   // Returns the carry-out, which is a single S1 register or null.
3927   auto mergeCarry =
3928       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3929         if (CarryIn.empty())
3930           return Register();
3931 
3932         bool HaveCarryOut = true;
3933         Register CarryAccum;
3934         if (CarryIn.size() == 1) {
3935           if (!LocalAccum) {
3936             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3937             return Register();
3938           }
3939 
3940           CarryAccum = getZero32();
3941         } else {
3942           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3943           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3944             CarryAccum =
3945                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3946                     .getReg(0);
3947           }
3948 
3949           if (!LocalAccum) {
3950             LocalAccum = getZero32();
3951             HaveCarryOut = false;
3952           }
3953         }
3954 
3955         auto Add =
3956             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3957         LocalAccum = Add.getReg(0);
3958         return HaveCarryOut ? Add.getReg(1) : Register();
3959       };
3960 
3961   // Build a multiply-add chain to compute
3962   //
3963   //   LocalAccum + (partial products at DstIndex)
3964   //       + (opportunistic subset of CarryIn)
3965   //
3966   // LocalAccum is an array of one or two 32-bit registers that are updated
3967   // in-place. The incoming registers may be null.
3968   //
3969   // In some edge cases, carry-ins can be consumed "for free". In that case,
3970   // the consumed carry bits are removed from CarryIn in-place.
3971   auto buildMadChain =
3972       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3973           -> Carry {
3974         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3975                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3976 
3977         Carry CarryOut;
3978         unsigned j0 = 0;
3979 
3980         // Use plain 32-bit multiplication for the most significant part of the
3981         // result by default.
3982         if (LocalAccum.size() == 1 &&
3983             (!UsePartialMad64_32 || !CarryIn.empty())) {
3984           do {
3985             // Skip multiplication if one of the operands is 0
3986             unsigned j1 = DstIndex - j0;
3987             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3988               ++j0;
3989               continue;
3990             }
3991             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3992             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3993               LocalAccum[0] = Mul.getReg(0);
3994             } else {
3995               if (CarryIn.empty()) {
3996                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3997               } else {
3998                 LocalAccum[0] =
3999                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4000                         .getReg(0);
4001                 CarryIn.pop_back();
4002               }
4003             }
4004             ++j0;
4005           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4006         }
4007 
4008         // Build full 64-bit multiplies.
4009         if (j0 <= DstIndex) {
4010           bool HaveSmallAccum = false;
4011           Register Tmp;
4012 
4013           if (LocalAccum[0]) {
4014             if (LocalAccum.size() == 1) {
4015               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4016               HaveSmallAccum = true;
4017             } else if (LocalAccum[1]) {
4018               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4019               HaveSmallAccum = false;
4020             } else {
4021               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4022               HaveSmallAccum = true;
4023             }
4024           } else {
4025             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4026             Tmp = getZero64();
4027             HaveSmallAccum = true;
4028           }
4029 
4030           do {
4031             unsigned j1 = DstIndex - j0;
4032             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4033               ++j0;
4034               continue;
4035             }
4036             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4037                                     {Src0[j0], Src1[j1], Tmp});
4038             Tmp = Mad.getReg(0);
4039             if (!HaveSmallAccum)
4040               CarryOut.push_back(Mad.getReg(1));
4041             HaveSmallAccum = false;
4042 
4043             ++j0;
4044           } while (j0 <= DstIndex);
4045 
4046           auto Unmerge = B.buildUnmerge(S32, Tmp);
4047           LocalAccum[0] = Unmerge.getReg(0);
4048           if (LocalAccum.size() > 1)
4049             LocalAccum[1] = Unmerge.getReg(1);
4050         }
4051 
4052         return CarryOut;
4053       };
4054 
4055   // Outer multiply loop, iterating over destination parts from least
4056   // significant to most significant parts.
4057   //
4058   // The columns of the following diagram correspond to the destination parts
4059   // affected by one iteration of the outer loop (ignoring boundary
4060   // conditions).
4061   //
4062   //   Dest index relative to 2 * i:      1 0 -1
4063   //                                      ------
4064   //   Carries from previous iteration:     e o
4065   //   Even-aligned partial product sum:  E E .
4066   //   Odd-aligned partial product sum:     O O
4067   //
4068   // 'o' is OddCarry, 'e' is EvenCarry.
4069   // EE and OO are computed from partial products via buildMadChain and use
4070   // accumulation where possible and appropriate.
4071   //
4072   Register SeparateOddCarry;
4073   Carry EvenCarry;
4074   Carry OddCarry;
4075 
4076   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4077     Carry OddCarryIn = std::move(OddCarry);
4078     Carry EvenCarryIn = std::move(EvenCarry);
4079     OddCarry.clear();
4080     EvenCarry.clear();
4081 
4082     // Partial products at offset 2 * i.
4083     if (2 * i < Accum.size()) {
4084       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4085       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4086     }
4087 
4088     // Partial products at offset 2 * i - 1.
4089     if (i > 0) {
4090       if (!SeparateOddAlignedProducts) {
4091         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4092         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4093       } else {
4094         bool IsHighest = 2 * i >= Accum.size();
4095         Register SeparateOddOut[2];
4096         auto LocalAccum = MutableArrayRef(SeparateOddOut)
4097                               .take_front(IsHighest ? 1 : 2);
4098         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4099 
4100         MachineInstr *Lo;
4101 
4102         if (i == 1) {
4103           if (!IsHighest)
4104             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4105           else
4106             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4107         } else {
4108           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4109                             SeparateOddCarry);
4110         }
4111         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4112 
4113         if (!IsHighest) {
4114           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4115                                 Lo->getOperand(1).getReg());
4116           Accum[2 * i] = Hi.getReg(0);
4117           SeparateOddCarry = Hi.getReg(1);
4118         }
4119       }
4120     }
4121 
4122     // Add in the carries from the previous iteration
4123     if (i > 0) {
4124       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4125         EvenCarryIn.push_back(CarryOut);
4126 
4127       if (2 * i < Accum.size()) {
4128         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4129           OddCarry.push_back(CarryOut);
4130       }
4131     }
4132   }
4133 }
4134 
4135 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4136 //
4137 // TODO: If the multiply is followed by an addition, we should attempt to
4138 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4139 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4140                                       MachineInstr &MI) const {
4141   assert(ST.hasMad64_32());
4142   assert(MI.getOpcode() == TargetOpcode::G_MUL);
4143 
4144   MachineIRBuilder &B = Helper.MIRBuilder;
4145   MachineRegisterInfo &MRI = *B.getMRI();
4146 
4147   Register DstReg = MI.getOperand(0).getReg();
4148   Register Src0 = MI.getOperand(1).getReg();
4149   Register Src1 = MI.getOperand(2).getReg();
4150 
4151   LLT Ty = MRI.getType(DstReg);
4152   assert(Ty.isScalar());
4153 
4154   unsigned Size = Ty.getSizeInBits();
4155   unsigned NumParts = Size / 32;
4156   assert((Size % 32) == 0);
4157   assert(NumParts >= 2);
4158 
4159   // Whether to use MAD_64_32 for partial products whose high half is
4160   // discarded. This avoids some ADD instructions but risks false dependency
4161   // stalls on some subtargets in some cases.
4162   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4163 
4164   // Whether to compute odd-aligned partial products separately. This is
4165   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4166   // in an even-aligned VGPR.
4167   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4168 
4169   LLT S32 = LLT::scalar(32);
4170   SmallVector<Register, 2> Src0Parts, Src1Parts;
4171   for (unsigned i = 0; i < NumParts; ++i) {
4172     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4173     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4174   }
4175   B.buildUnmerge(Src0Parts, Src0);
4176   B.buildUnmerge(Src1Parts, Src1);
4177 
4178   SmallVector<Register, 2> AccumRegs(NumParts);
4179   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4180                 SeparateOddAlignedProducts);
4181 
4182   B.buildMergeLikeInstr(DstReg, AccumRegs);
4183   MI.eraseFromParent();
4184   return true;
4185 }
4186 
4187 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4188 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4189 // case with a single min instruction instead of a compare+select.
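//
// For example, assuming the hardware convention that ffbh returns -1 for a
// zero input: umin(ffbh(x), 32) yields 32 when x == 0, matching the defined
// G_CTLZ result for an s32 source, and leaves nonzero results (<= 31)
// unchanged.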
4190 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4191                                             MachineRegisterInfo &MRI,
4192                                             MachineIRBuilder &B) const {
4193   Register Dst = MI.getOperand(0).getReg();
4194   Register Src = MI.getOperand(1).getReg();
4195   LLT DstTy = MRI.getType(Dst);
4196   LLT SrcTy = MRI.getType(Src);
4197 
4198   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4199                         ? AMDGPU::G_AMDGPU_FFBH_U32
4200                         : AMDGPU::G_AMDGPU_FFBL_B32;
4201   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4202   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4203 
4204   MI.eraseFromParent();
4205   return true;
4206 }
4207 
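// Lower a narrow (sub-32-bit) G_CTLZ_ZERO_UNDEF: any-extend the source to 32
// bits and shift it left so its top bit lands in bit 31; FFBH on the shifted
// value then gives the same leading-zero count as the original width. Zero
// input is undefined here, so no fixup is needed.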
4208 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4209                                                   MachineRegisterInfo &MRI,
4210                                                   MachineIRBuilder &B) const {
4211   Register Dst = MI.getOperand(0).getReg();
4212   Register Src = MI.getOperand(1).getReg();
4213   LLT SrcTy = MRI.getType(Src);
4214   TypeSize NumBits = SrcTy.getSizeInBits();
4215 
4216   assert(NumBits < 32u);
4217 
4218   auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4219   auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4220   auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4221   auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4222   B.buildTrunc(Dst, Ctlz);
4223   MI.eraseFromParent();
4224   return true;
4225 }
4226 
4227 // Check that this is a G_XOR x, -1
4228 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4229   if (MI.getOpcode() != TargetOpcode::G_XOR)
4230     return false;
4231   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4232   return ConstVal && *ConstVal == -1;
4233 }
4234 
4235 // Return the use branch instruction, or null if the usage is invalid.
4236 static MachineInstr *
4237 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4238                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4239   Register CondDef = MI.getOperand(0).getReg();
4240   if (!MRI.hasOneNonDBGUse(CondDef))
4241     return nullptr;
4242 
4243   MachineBasicBlock *Parent = MI.getParent();
4244   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4245 
4246   if (isNot(MRI, *UseMI)) {
4247     Register NegatedCond = UseMI->getOperand(0).getReg();
4248     if (!MRI.hasOneNonDBGUse(NegatedCond))
4249       return nullptr;
4250 
4251     // We're deleting the def of this value, so we need to remove it.
4252     eraseInstr(*UseMI, MRI);
4253 
4254     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4255     Negated = true;
4256   }
4257 
4258   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4259     return nullptr;
4260 
4261   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4262   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4263   if (Next == Parent->end()) {
4264     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4265     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4266       return nullptr;
4267     UncondBrTarget = &*NextMBB;
4268   } else {
4269     if (Next->getOpcode() != AMDGPU::G_BR)
4270       return nullptr;
4271     Br = &*Next;
4272     UncondBrTarget = Br->getOperand(0).getMBB();
4273   }
4274 
4275   return UseMI;
4276 }
4277 
4278 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4279                                          const ArgDescriptor *Arg,
4280                                          const TargetRegisterClass *ArgRC,
4281                                          LLT ArgTy) const {
4282   MCRegister SrcReg = Arg->getRegister();
4283   assert(SrcReg.isPhysical() && "Physical register expected");
4284   assert(DstReg.isVirtual() && "Virtual register expected");
4285 
4286   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4287                                              *ArgRC, B.getDebugLoc(), ArgTy);
4288   if (Arg->isMasked()) {
4289     // TODO: Should we try to emit this once in the entry block?
4290     const LLT S32 = LLT::scalar(32);
4291     const unsigned Mask = Arg->getMask();
4292     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
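    // E.g. for a workitem ID packed into bits [19:10], Mask == 0x3FF << 10 and
    // Shift == 10, so we shift right by 10 and mask with
    // (Mask >> Shift) == 0x3FF.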
4293 
4294     Register AndMaskSrc = LiveIn;
4295 
4296     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4297     // 0.
4298     if (Shift != 0) {
4299       auto ShiftAmt = B.buildConstant(S32, Shift);
4300       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4301     }
4302 
4303     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4304   } else {
4305     B.buildCopy(DstReg, LiveIn);
4306   }
4307 
4308   return true;
4309 }
4310 
4311 bool AMDGPULegalizerInfo::loadInputValue(
4312     Register DstReg, MachineIRBuilder &B,
4313     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4314   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4315   const ArgDescriptor *Arg = nullptr;
4316   const TargetRegisterClass *ArgRC;
4317   LLT ArgTy;
4318 
4319   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4320   const ArgDescriptor WorkGroupIDX =
4321       ArgDescriptor::createRegister(AMDGPU::TTMP9);
4322   // If GridZ is not programmed in an entry function then the hardware will set
4323   // it to all zeros, so there is no need to mask the GridY value in the low
4324   // order bits.
4325   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4326       AMDGPU::TTMP7,
4327       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4328   const ArgDescriptor WorkGroupIDZ =
4329       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4330   if (ST.hasArchitectedSGPRs() &&
4331       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4332     switch (ArgType) {
4333     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4334       Arg = &WorkGroupIDX;
4335       ArgRC = &AMDGPU::SReg_32RegClass;
4336       ArgTy = LLT::scalar(32);
4337       break;
4338     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4339       Arg = &WorkGroupIDY;
4340       ArgRC = &AMDGPU::SReg_32RegClass;
4341       ArgTy = LLT::scalar(32);
4342       break;
4343     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4344       Arg = &WorkGroupIDZ;
4345       ArgRC = &AMDGPU::SReg_32RegClass;
4346       ArgTy = LLT::scalar(32);
4347       break;
4348     default:
4349       break;
4350     }
4351   }
4352 
4353   if (!Arg)
4354     std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4355 
4356   if (!Arg) {
4357     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4358       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4359       // case the pointer argument may be missing and we use null.
4360       B.buildConstant(DstReg, 0);
4361       return true;
4362     }
4363 
4364     // It's undefined behavior if a function marked with the amdgpu-no-*
4365     // attributes uses the corresponding intrinsic.
4366     B.buildUndef(DstReg);
4367     return true;
4368   }
4369 
4370   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4371     return false; // TODO: Handle these
4372   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4373 }
4374 
4375 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4376     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4377     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4378   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4379     return false;
4380 
4381   MI.eraseFromParent();
4382   return true;
4383 }
4384 
4385 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4386                                 int64_t C) {
4387   B.buildConstant(MI.getOperand(0).getReg(), C);
4388   MI.eraseFromParent();
4389   return true;
4390 }
4391 
4392 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4393     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4394     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4395   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4396   if (MaxID == 0)
4397     return replaceWithConstant(B, MI, 0);
4398 
4399   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4400   const ArgDescriptor *Arg;
4401   const TargetRegisterClass *ArgRC;
4402   LLT ArgTy;
4403   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4404 
4405   Register DstReg = MI.getOperand(0).getReg();
4406   if (!Arg) {
4407     // It's undefined behavior if a function marked with the amdgpu-no-*
4408     // attributes uses the corresponding intrinsic.
4409     B.buildUndef(DstReg);
4410     MI.eraseFromParent();
4411     return true;
4412   }
4413 
4414   if (Arg->isMasked()) {
4415     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4416     // masking operations anyway.
4417     //
4418     // TODO: We could assert the top bit is 0 for the source copy.
4419     if (!loadInputValue(DstReg, B, ArgType))
4420       return false;
4421   } else {
4422     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4423     if (!loadInputValue(TmpReg, B, ArgType))
4424       return false;
4425     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4426   }
4427 
4428   MI.eraseFromParent();
4429   return true;
4430 }
4431 
4432 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4433                                                      int64_t Offset) const {
4434   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4435   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4436 
4437   // TODO: If we passed in the base kernel offset we could have a better
4438   // alignment than 4, but we don't really need it.
4439   if (!loadInputValue(KernArgReg, B,
4440                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4441     llvm_unreachable("failed to find kernarg segment ptr");
4442 
4443   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4444   // TODO: Should get nuw
4445   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4446 }
4447 
4448 /// Legalize a value that's loaded from kernel arguments. This is only used by
4449 /// legacy intrinsics.
4450 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4451                                                       MachineIRBuilder &B,
4452                                                       uint64_t Offset,
4453                                                       Align Alignment) const {
4454   Register DstReg = MI.getOperand(0).getReg();
4455 
4456   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4457          "unexpected kernarg parameter type");
4458 
4459   Register Ptr = getKernargParameterPtr(B, Offset);
4460   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4461   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4462               MachineMemOperand::MODereferenceable |
4463                   MachineMemOperand::MOInvariant);
4464   MI.eraseFromParent();
4465   return true;
4466 }
4467 
4468 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4469                                        MachineRegisterInfo &MRI,
4470                                        MachineIRBuilder &B) const {
4471   Register Dst = MI.getOperand(0).getReg();
4472   LLT DstTy = MRI.getType(Dst);
4473   LLT S16 = LLT::scalar(16);
4474   LLT S32 = LLT::scalar(32);
4475   LLT S64 = LLT::scalar(64);
4476 
4477   if (DstTy == S16)
4478     return legalizeFDIV16(MI, MRI, B);
4479   if (DstTy == S32)
4480     return legalizeFDIV32(MI, MRI, B);
4481   if (DstTy == S64)
4482     return legalizeFDIV64(MI, MRI, B);
4483 
4484   return false;
4485 }
4486 
4487 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4488                                                         Register DstDivReg,
4489                                                         Register DstRemReg,
4490                                                         Register X,
4491                                                         Register Y) const {
4492   const LLT S1 = LLT::scalar(1);
4493   const LLT S32 = LLT::scalar(32);
4494 
4495   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4496   // algorithm used here.
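  //
  // In short (a sketch of the steps below):
  //   z = estimate of 2^32 / y        ; from v_rcp_iflag_f32 on (float)y
  //   z += umulh(z, z * -y)           ; one Newton-Raphson refinement
  //   q = umulh(x, z); r = x - q * y  ; initial quotient/remainder
  //   then refine twice: if (r >= y) { q += 1; r -= y; }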
4497 
4498   // Initial estimate of inv(y).
4499   auto FloatY = B.buildUITOFP(S32, Y);
4500   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4501   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4502   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4503   auto Z = B.buildFPTOUI(S32, ScaledY);
4504 
4505   // One round of UNR.
4506   // One round of UNR (a Newton-Raphson refinement of the reciprocal estimate).
4507   auto NegYZ = B.buildMul(S32, NegY, Z);
4508   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4509 
4510   // Quotient/remainder estimate.
4511   auto Q = B.buildUMulH(S32, X, Z);
4512   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4513 
4514   // First quotient/remainder refinement.
4515   auto One = B.buildConstant(S32, 1);
4516   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4517   if (DstDivReg)
4518     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4519   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4520 
4521   // Second quotient/remainder refinement.
4522   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4523   if (DstDivReg)
4524     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4525 
4526   if (DstRemReg)
4527     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4528 }
4529 
4530 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4531 //
4532 // Return lo, hi of result
4533 //
4534 // %cvt.lo = G_UITOFP Val.lo
4535 // %cvt.hi = G_UITOFP Val.hi
4536 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4537 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4538 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4539 // %mul2 = G_FMUL %mul1, 2**(-32)
4540 // %trunc = G_INTRINSIC_TRUNC %mul2
4541 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4542 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4543 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4544                                                        Register Val) {
4545   const LLT S32 = LLT::scalar(32);
4546   auto Unmerge = B.buildUnmerge(S32, Val);
4547 
4548   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4549   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4550 
4551   auto Mad = B.buildFMAD(
4552       S32, CvtHi, // 2**32
4553       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4554 
4555   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4556   auto Mul1 = B.buildFMul(
4557       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4558 
4559   // 2**(-32)
4560   auto Mul2 = B.buildFMul(
4561       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4562   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4563 
4564   // -(2**32)
4565   auto Mad2 = B.buildFMAD(
4566       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4567       Mul1);
4568 
4569   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4570   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4571 
4572   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4573 }
4574 
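// 64-bit expansion of the same scheme: refine the ~2^64/denom estimate from
// emitReciprocalU64 with two Newton-Raphson rounds in 64-bit fixed point, take
// the quotient as umulh(numer, rcp), and then apply up to two conditional
// quotient/remainder corrections.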
4575 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4576                                                         Register DstDivReg,
4577                                                         Register DstRemReg,
4578                                                         Register Numer,
4579                                                         Register Denom) const {
4580   const LLT S32 = LLT::scalar(32);
4581   const LLT S64 = LLT::scalar(64);
4582   const LLT S1 = LLT::scalar(1);
4583   Register RcpLo, RcpHi;
4584 
4585   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4586 
4587   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4588 
4589   auto Zero64 = B.buildConstant(S64, 0);
4590   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4591 
4592   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4593   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4594 
4595   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4596   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4597   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4598 
4599   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4600   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4601   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4602 
4603   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4604   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4605   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4606   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4607   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4608 
4609   auto Zero32 = B.buildConstant(S32, 0);
4610   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4611   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4612   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4613 
4614   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4615   Register NumerLo = UnmergeNumer.getReg(0);
4616   Register NumerHi = UnmergeNumer.getReg(1);
4617 
4618   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4619   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4620   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4621   Register Mul3_Lo = UnmergeMul3.getReg(0);
4622   Register Mul3_Hi = UnmergeMul3.getReg(1);
4623   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4624   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4625   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4626   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4627 
4628   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4629   Register DenomLo = UnmergeDenom.getReg(0);
4630   Register DenomHi = UnmergeDenom.getReg(1);
4631 
4632   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4633   auto C1 = B.buildSExt(S32, CmpHi);
4634 
4635   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4636   auto C2 = B.buildSExt(S32, CmpLo);
4637 
4638   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4639   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4640 
4641   // TODO: Here and below, portions of the code could be enclosed in if/endif.
4642   // Currently control flow is unconditional and we have 4 selects after the
4643   // potential endif that substitute for PHIs.
4644 
4645   // if C3 != 0 ...
4646   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4647   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4648   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4649   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4650 
4651   auto One64 = B.buildConstant(S64, 1);
4652   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4653 
4654   auto C4 =
4655       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4656   auto C5 =
4657       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4658   auto C6 = B.buildSelect(
4659       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4660 
4661   // if (C6 != 0)
4662   auto Add4 = B.buildAdd(S64, Add3, One64);
4663   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4664 
4665   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4666   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4667   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4668 
4669   // endif C6
4670   // endif C3
4671 
4672   if (DstDivReg) {
4673     auto Sel1 = B.buildSelect(
4674         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4675     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4676                   Sel1, MulHi3);
4677   }
4678 
4679   if (DstRemReg) {
4680     auto Sel2 = B.buildSelect(
4681         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4682     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4683                   Sel2, Sub1);
4684   }
4685 }
4686 
4687 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4688                                                   MachineRegisterInfo &MRI,
4689                                                   MachineIRBuilder &B) const {
4690   Register DstDivReg, DstRemReg;
4691   switch (MI.getOpcode()) {
4692   default:
4693     llvm_unreachable("Unexpected opcode!");
4694   case AMDGPU::G_UDIV: {
4695     DstDivReg = MI.getOperand(0).getReg();
4696     break;
4697   }
4698   case AMDGPU::G_UREM: {
4699     DstRemReg = MI.getOperand(0).getReg();
4700     break;
4701   }
4702   case AMDGPU::G_UDIVREM: {
4703     DstDivReg = MI.getOperand(0).getReg();
4704     DstRemReg = MI.getOperand(1).getReg();
4705     break;
4706   }
4707   }
4708 
4709   const LLT S64 = LLT::scalar(64);
4710   const LLT S32 = LLT::scalar(32);
4711   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4712   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4713   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4714   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4715 
4716   if (Ty == S32)
4717     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4718   else if (Ty == S64)
4719     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4720   else
4721     return false;
4722 
4723   MI.eraseFromParent();
4724   return true;
4725 }
4726 
4727 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4728                                                 MachineRegisterInfo &MRI,
4729                                                 MachineIRBuilder &B) const {
4730   const LLT S64 = LLT::scalar(64);
4731   const LLT S32 = LLT::scalar(32);
4732 
4733   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4734   if (Ty != S32 && Ty != S64)
4735     return false;
4736 
4737   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4738   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4739   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4740 
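  // Compute |LHS| and |RHS| with the usual (x + (x >> bits-1)) ^ (x >> bits-1)
  // trick, perform an unsigned divide, then restore the signs below: the
  // quotient sign is sign(LHS) ^ sign(RHS) and the remainder sign follows LHS.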
4741   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4742   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4743   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4744 
4745   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4746   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4747 
4748   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4749   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4750 
4751   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4752   switch (MI.getOpcode()) {
4753   default:
4754     llvm_unreachable("Unexpected opcode!");
4755   case AMDGPU::G_SDIV: {
4756     DstDivReg = MI.getOperand(0).getReg();
4757     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4758     break;
4759   }
4760   case AMDGPU::G_SREM: {
4761     DstRemReg = MI.getOperand(0).getReg();
4762     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4763     break;
4764   }
4765   case AMDGPU::G_SDIVREM: {
4766     DstDivReg = MI.getOperand(0).getReg();
4767     DstRemReg = MI.getOperand(1).getReg();
4768     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4769     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4770     break;
4771   }
4772   }
4773 
4774   if (Ty == S32)
4775     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4776   else
4777     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4778 
4779   if (DstDivReg) {
4780     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4781     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4782     B.buildSub(DstDivReg, SignXor, Sign);
4783   }
4784 
4785   if (DstRemReg) {
4786     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4787     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4788     B.buildSub(DstRemReg, SignXor, Sign);
4789   }
4790 
4791   MI.eraseFromParent();
4792   return true;
4793 }
4794 
4795 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4796                                                  MachineRegisterInfo &MRI,
4797                                                  MachineIRBuilder &B) const {
4798   Register Res = MI.getOperand(0).getReg();
4799   Register LHS = MI.getOperand(1).getReg();
4800   Register RHS = MI.getOperand(2).getReg();
4801   uint16_t Flags = MI.getFlags();
4802   LLT ResTy = MRI.getType(Res);
4803 
4804   const MachineFunction &MF = B.getMF();
4805   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4806                             MF.getTarget().Options.UnsafeFPMath;
4807 
4808   if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
4809     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4810       return false;
4811 
4812     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4813     // the CI documentation have a worst case error of 1 ulp.
4814     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4815     // use it as long as we aren't trying to use denormals.
4816     //
4817     // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
4818 
4819     // 1 / x -> RCP(x)
4820     if (CLHS->isExactlyValue(1.0)) {
4821       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4822           .addUse(RHS)
4823           .setMIFlags(Flags);
4824 
4825       MI.eraseFromParent();
4826       return true;
4827     }
4828 
4829     // -1 / x -> RCP( FNEG(x) )
4830     if (CLHS->isExactlyValue(-1.0)) {
4831       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4832       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4833           .addUse(FNeg.getReg(0))
4834           .setMIFlags(Flags);
4835 
4836       MI.eraseFromParent();
4837       return true;
4838     }
4839   }
4840 
4841   // For f16 require afn or arcp.
4842   // For f32 require afn.
4843   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4844                               !MI.getFlag(MachineInstr::FmArcp)))
4845     return false;
4846 
4847   // x / y -> x * (1.0 / y)
4848   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4849                  .addUse(RHS)
4850                  .setMIFlags(Flags);
4851   B.buildFMul(Res, LHS, RCP, Flags);
4852 
4853   MI.eraseFromParent();
4854   return true;
4855 }
4856 
4857 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4858                                                    MachineRegisterInfo &MRI,
4859                                                    MachineIRBuilder &B) const {
4860   Register Res = MI.getOperand(0).getReg();
4861   Register X = MI.getOperand(1).getReg();
4862   Register Y = MI.getOperand(2).getReg();
4863   uint16_t Flags = MI.getFlags();
4864   LLT ResTy = MRI.getType(Res);
4865 
4866   const MachineFunction &MF = B.getMF();
4867   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4868                             MI.getFlag(MachineInstr::FmAfn);
4869 
4870   if (!AllowInaccurateRcp)
4871     return false;
4872 
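  // Approximate x/y: start with r = rcp(y), apply two Newton-Raphson steps
  // r += r * (1 - y * r), then form ret = x * r and one final correction
  // ret += r * (x - y * ret).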
4873   auto NegY = B.buildFNeg(ResTy, Y);
4874   auto One = B.buildFConstant(ResTy, 1.0);
4875 
4876   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4877                .addUse(Y)
4878                .setMIFlags(Flags);
4879 
4880   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4881   R = B.buildFMA(ResTy, Tmp0, R, R);
4882 
4883   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4884   R = B.buildFMA(ResTy, Tmp1, R, R);
4885 
4886   auto Ret = B.buildFMul(ResTy, X, R);
4887   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4888 
4889   B.buildFMA(Res, Tmp2, R, Ret);
4890   MI.eraseFromParent();
4891   return true;
4892 }
4893 
4894 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4895                                          MachineRegisterInfo &MRI,
4896                                          MachineIRBuilder &B) const {
4897   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4898     return true;
4899 
4900   Register Res = MI.getOperand(0).getReg();
4901   Register LHS = MI.getOperand(1).getReg();
4902   Register RHS = MI.getOperand(2).getReg();
4903 
4904   uint16_t Flags = MI.getFlags();
4905 
4906   LLT S16 = LLT::scalar(16);
4907   LLT S32 = LLT::scalar(32);
4908 
4909   // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4910   // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4911   // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4912   // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4913   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4914   // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
4915   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4916   // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4917   // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4918   // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4919   // q16.u = opx(V_CVT_F16_F32, q32.u);
4920   // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4921 
4922   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4923   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4924   auto NegRHSExt = B.buildFNeg(S32, RHSExt);
4925   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4926                  .addUse(RHSExt.getReg(0))
4927                  .setMIFlags(Flags);
4928   auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
4929   MachineInstrBuilder Err;
4930   if (ST.hasMadMacF32Insts()) {
4931     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4932     Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
4933     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4934   } else {
4935     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4936     Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
4937     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4938   }
4939   auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
4940   Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
4941   Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
4942   auto RDst = B.buildFPTrunc(S16, Quot, Flags);
4943   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4944       .addUse(RDst.getReg(0))
4945       .addUse(RHS)
4946       .addUse(LHS)
4947       .setMIFlags(Flags);
4948 
4949   MI.eraseFromParent();
4950   return true;
4951 }
4952 
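// Selects the 2-bit FP32 (SP) denorm field at offset 4 of the MODE register;
// the FP64/FP16 denorm pair sits in the next two bits.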
4953 static constexpr unsigned SPDenormModeBitField =
4954     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4955 
4956 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4957 // to enable it; when false, restore the function's default FP32 denorm mode.
4958 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4959                                const GCNSubtarget &ST,
4960                                SIModeRegisterDefaults Mode) {
4961   // Set SP denorm mode to this value.
4962   unsigned SPDenormMode =
4963     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4964 
4965   if (ST.hasDenormModeInst()) {
4966     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4967     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4968 
4969     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4970     B.buildInstr(AMDGPU::S_DENORM_MODE)
4971       .addImm(NewDenormModeValue);
4972 
4973   } else {
4974     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4975       .addImm(SPDenormMode)
4976       .addImm(SPDenormModeBitField);
4977   }
4978 }
4979 
4980 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4981                                          MachineRegisterInfo &MRI,
4982                                          MachineIRBuilder &B) const {
4983   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4984     return true;
4985 
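  // Full-precision path: scale the operands with div_scale, refine an
  // approximate 1/denominator with a chain of FMAs (temporarily enabling FP32
  // denormals if the current mode flushes them), then combine the pieces with
  // div_fmas and div_fixup.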
4986   Register Res = MI.getOperand(0).getReg();
4987   Register LHS = MI.getOperand(1).getReg();
4988   Register RHS = MI.getOperand(2).getReg();
4989   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4990   SIModeRegisterDefaults Mode = MFI->getMode();
4991 
4992   uint16_t Flags = MI.getFlags();
4993 
4994   LLT S32 = LLT::scalar(32);
4995   LLT S1 = LLT::scalar(1);
4996 
4997   auto One = B.buildFConstant(S32, 1.0f);
4998 
4999   auto DenominatorScaled =
5000       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5001           .addUse(LHS)
5002           .addUse(RHS)
5003           .addImm(0)
5004           .setMIFlags(Flags);
5005   auto NumeratorScaled =
5006       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5007           .addUse(LHS)
5008           .addUse(RHS)
5009           .addImm(1)
5010           .setMIFlags(Flags);
5011 
5012   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5013                        .addUse(DenominatorScaled.getReg(0))
5014                        .setMIFlags(Flags);
5015   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5016 
5017   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5018   const bool HasDynamicDenormals =
5019       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5020       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5021 
5022   Register SavedSPDenormMode;
5023   if (!PreservesDenormals) {
5024     if (HasDynamicDenormals) {
5025       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5026       B.buildInstr(AMDGPU::S_GETREG_B32)
5027           .addDef(SavedSPDenormMode)
5028           .addImm(SPDenormModeBitField);
5029     }
5030     toggleSPDenormMode(true, B, ST, Mode);
5031   }
5032 
5033   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5034   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5035   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5036   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5037   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5038   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5039 
5040   if (!PreservesDenormals) {
5041     if (HasDynamicDenormals) {
5042       assert(SavedSPDenormMode);
5043       B.buildInstr(AMDGPU::S_SETREG_B32)
5044           .addReg(SavedSPDenormMode)
5045           .addImm(SPDenormModeBitField);
5046     } else
5047       toggleSPDenormMode(false, B, ST, Mode);
5048   }
5049 
5050   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5051                   .addUse(Fma4.getReg(0))
5052                   .addUse(Fma1.getReg(0))
5053                   .addUse(Fma3.getReg(0))
5054                   .addUse(NumeratorScaled.getReg(1))
5055                   .setMIFlags(Flags);
5056 
5057   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5058       .addUse(Fmas.getReg(0))
5059       .addUse(RHS)
5060       .addUse(LHS)
5061       .setMIFlags(Flags);
5062 
5063   MI.eraseFromParent();
5064   return true;
5065 }
5066 
5067 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5068                                          MachineRegisterInfo &MRI,
5069                                          MachineIRBuilder &B) const {
5070   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5071     return true;
5072 
5073   Register Res = MI.getOperand(0).getReg();
5074   Register LHS = MI.getOperand(1).getReg();
5075   Register RHS = MI.getOperand(2).getReg();
5076 
5077   uint16_t Flags = MI.getFlags();
5078 
5079   LLT S64 = LLT::scalar(64);
5080   LLT S1 = LLT::scalar(1);
5081 
5082   auto One = B.buildFConstant(S64, 1.0);
5083 
5084   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5085                        .addUse(LHS)
5086                        .addUse(RHS)
5087                        .addImm(0)
5088                        .setMIFlags(Flags);
5089 
5090   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5091 
5092   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5093                  .addUse(DivScale0.getReg(0))
5094                  .setMIFlags(Flags);
5095 
5096   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5097   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5098   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5099 
5100   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5101                        .addUse(LHS)
5102                        .addUse(RHS)
5103                        .addImm(1)
5104                        .setMIFlags(Flags);
5105 
5106   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5107   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5108   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5109 
5110   Register Scale;
5111   if (!ST.hasUsableDivScaleConditionOutput()) {
5112     // Workaround a hardware bug on SI where the condition output from div_scale
5113     // is not usable.
5114 
5115     LLT S32 = LLT::scalar(32);
5116 
5117     auto NumUnmerge = B.buildUnmerge(S32, LHS);
5118     auto DenUnmerge = B.buildUnmerge(S32, RHS);
5119     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5120     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5121 
5122     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5123                               Scale1Unmerge.getReg(1));
5124     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5125                               Scale0Unmerge.getReg(1));
5126     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5127   } else {
5128     Scale = DivScale1.getReg(1);
5129   }
5130 
5131   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5132                   .addUse(Fma4.getReg(0))
5133                   .addUse(Fma3.getReg(0))
5134                   .addUse(Mul.getReg(0))
5135                   .addUse(Scale)
5136                   .setMIFlags(Flags);
5137 
5138   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5139       .addUse(Fmas.getReg(0))
5140       .addUse(RHS)
5141       .addUse(LHS)
5142       .setMIFlags(Flags);
5143 
5144   MI.eraseFromParent();
5145   return true;
5146 }
5147 
5148 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5149                                          MachineRegisterInfo &MRI,
5150                                          MachineIRBuilder &B) const {
5151   Register Res0 = MI.getOperand(0).getReg();
5152   Register Res1 = MI.getOperand(1).getReg();
5153   Register Val = MI.getOperand(2).getReg();
5154   uint16_t Flags = MI.getFlags();
5155 
5156   LLT Ty = MRI.getType(Res0);
5157   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5158 
5159   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5160                   .addUse(Val)
5161                   .setMIFlags(Flags);
5162   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5163                  .addUse(Val)
5164                  .setMIFlags(Flags);
5165 
5166   if (ST.hasFractBug()) {
5167     auto Fabs = B.buildFAbs(Ty, Val);
5168     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5169     auto IsFinite =
5170         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5171     auto Zero = B.buildConstant(InstrExpTy, 0);
5172     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5173     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5174   }
5175 
5176   B.buildCopy(Res0, Mant);
5177   B.buildSExtOrTrunc(Res1, Exp);
5178 
5179   MI.eraseFromParent();
5180   return true;
5181 }
5182 
5183 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5184                                                  MachineRegisterInfo &MRI,
5185                                                  MachineIRBuilder &B) const {
5186   Register Res = MI.getOperand(0).getReg();
5187   Register LHS = MI.getOperand(2).getReg();
5188   Register RHS = MI.getOperand(3).getReg();
5189   uint16_t Flags = MI.getFlags();
5190 
5191   LLT S32 = LLT::scalar(32);
5192   LLT S1 = LLT::scalar(1);
5193 
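  // If |RHS| is larger than 2^96, pre-scale it by 2^-32 before the rcp and
  // multiply the same scale back into the result so the intermediate stays in
  // range; the scale cancels out of the final quotient.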
5194   auto Abs = B.buildFAbs(S32, RHS, Flags);
5195   const APFloat C0Val(1.0f);
5196 
5197   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5198   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5199   auto C2 = B.buildFConstant(S32, 1.0f);
5200 
5201   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5202   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5203 
5204   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5205 
5206   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5207                  .addUse(Mul0.getReg(0))
5208                  .setMIFlags(Flags);
5209 
5210   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5211 
5212   B.buildFMul(Res, Sel, Mul1, Flags);
5213 
5214   MI.eraseFromParent();
5215   return true;
5216 }
5217 
5218 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5219                                            MachineRegisterInfo &MRI,
5220                                            MachineIRBuilder &B) const {
5221   // Bypass the correct expansion that a standard promotion through G_FSQRT
5222   // would get. The f32 op is accurate enough for the f16 case.
5223   unsigned Flags = MI.getFlags();
5224   assert(!ST.has16BitInsts());
5225   const LLT F32 = LLT::scalar(32);
5226   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5227   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5228     .addUse(Ext.getReg(0))
5229     .setMIFlags(Flags);
5230   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5231   MI.eraseFromParent();
5232   return true;
5233 }
5234 
5235 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5236                                            MachineRegisterInfo &MRI,
5237                                            MachineIRBuilder &B) const {
5238   MachineFunction &MF = B.getMF();
5239   Register Dst = MI.getOperand(0).getReg();
5240   Register X = MI.getOperand(1).getReg();
5241   const unsigned Flags = MI.getFlags();
5242   const LLT S1 = LLT::scalar(1);
5243   const LLT F32 = LLT::scalar(32);
5244   const LLT I32 = LLT::scalar(32);
5245 
5246   if (allowApproxFunc(MF, Flags)) {
5247     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5248       .addUse(X)
5249       .setMIFlags(Flags);
5250     MI.eraseFromParent();
5251     return true;
5252   }
5253 
5254   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5255   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5256   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5257   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5258   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5259 
5260   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5261   if (needsDenormHandlingF32(MF, X, Flags)) {
5262     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5263       .addUse(SqrtX.getReg(0))
5264       .setMIFlags(Flags);
5265 
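    // Refine by one ulp in either direction: form s-1ulp and s+1ulp by adding
    // -1/+1 to the integer representation of the float result, compute the
    // residuals x - (s-1ulp)*s and x - (s+1ulp)*s, and step the result down or
    // up when the residual's sign says the neighbor rounds better.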
5266     auto NegOne = B.buildConstant(I32, -1);
5267     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5268 
5269     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5270     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5271 
5272     auto PosOne = B.buildConstant(I32, 1);
5273     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5274 
5275     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5276     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5277 
5278     auto Zero = B.buildFConstant(F32, 0.0f);
5279     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5280 
5281     SqrtS =
5282         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5283 
5284     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5285     SqrtS =
5286         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5287   } else {
5288     auto SqrtR =
5289         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5290     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5291 
5292     auto Half = B.buildFConstant(F32, 0.5f);
5293     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5294     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5295     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5296     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5297     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5298     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5299     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5300     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5301   }
5302 
5303   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5304 
5305   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5306 
5307   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5308 
5309   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5310   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5311 
5312   MI.eraseFromParent();
5313   return true;
5314 }
5315 
5316 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5317                                            MachineRegisterInfo &MRI,
5318                                            MachineIRBuilder &B) const {
5319   // For the double type, the SQRT and RSQ instructions don't have the required
5320   // precision, so we apply Goldschmidt's algorithm to improve the result:
5321   //
5322   //   y0 = rsq(x)
5323   //   g0 = x * y0
5324   //   h0 = 0.5 * y0
5325   //
5326   //   r0 = 0.5 - h0 * g0
5327   //   g1 = g0 * r0 + g0
5328   //   h1 = h0 * r0 + h0
5329   //
5330   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5331   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
5332   //   h2 = h1 * r1 + h1
5333   //
5334   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5335   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
5336   //
5337   //   sqrt(x) = g3
5338 
5339   const LLT S1 = LLT::scalar(1);
5340   const LLT S32 = LLT::scalar(32);
5341   const LLT F64 = LLT::scalar(64);
5342 
5343   Register Dst = MI.getOperand(0).getReg();
5344   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5345 
5346   Register X = MI.getOperand(1).getReg();
5347   unsigned Flags = MI.getFlags();
5348 
5349   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5350 
5351   auto ZeroInt = B.buildConstant(S32, 0);
5352   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5353 
5354   // Scale up input if it is too small.
5355   auto ScaleUpFactor = B.buildConstant(S32, 256);
5356   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5357   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5358 
5359   auto SqrtY =
5360       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5361 
5362   auto Half = B.buildFConstant(F64, 0.5);
5363   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5364   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5365 
5366   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5367   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5368 
5369   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5370   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5371 
5372   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5373   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5374 
5375   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5376 
5377   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5378   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5379 
5380   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5381 
5382   // Scale down the result.
5383   auto ScaleDownFactor = B.buildConstant(S32, -128);
5384   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5385   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5386 
5387   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5388   // with finite only or nsz because rsq(+/-0) = +/-inf
5389 
5390   // TODO: Check for DAZ and expand to subnormals
5391   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5392 
5393   // If x is +INF, +0, or -0, use its original value
5394   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5395 
5396   MI.eraseFromParent();
5397   return true;
5398 }
5399 
5400 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5401                                         MachineRegisterInfo &MRI,
5402                                         MachineIRBuilder &B) const {
5403   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5404   if (Ty == LLT::scalar(32))
5405     return legalizeFSQRTF32(MI, MRI, B);
5406   if (Ty == LLT::scalar(64))
5407     return legalizeFSQRTF64(MI, MRI, B);
5408   if (Ty == LLT::scalar(16))
5409     return legalizeFSQRTF16(MI, MRI, B);
5410   return false;
5411 }
5412 
5413 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5414 // FIXME: Why do we handle this one but not other removed instructions?
5415 //
5416 // Reciprocal square root.  The clamp prevents infinite results, clamping
5417 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5418 // +-max_float.
5419 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5420                                                     MachineRegisterInfo &MRI,
5421                                                     MachineIRBuilder &B) const {
5422   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5423     return true;
5424 
5425   Register Dst = MI.getOperand(0).getReg();
5426   Register Src = MI.getOperand(2).getReg();
5427   auto Flags = MI.getFlags();
5428 
5429   LLT Ty = MRI.getType(Dst);
5430 
5431   const fltSemantics *FltSemantics;
5432   if (Ty == LLT::scalar(32))
5433     FltSemantics = &APFloat::IEEEsingle();
5434   else if (Ty == LLT::scalar(64))
5435     FltSemantics = &APFloat::IEEEdouble();
5436   else
5437     return false;
5438 
5439   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5440                  .addUse(Src)
5441                  .setMIFlags(Flags);
5442 
5443   // We don't need to concern ourselves with the snan handling difference: the
5444   // rsq has already quieted it (or not), so use the variant that selects directly.
5445   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5446   const bool UseIEEE = MFI->getMode().IEEE;
5447 
5448   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5449   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5450                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5451 
5452   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5453 
5454   if (UseIEEE)
5455     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5456   else
5457     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5458   MI.eraseFromParent();
5459   return true;
5460 }
5461 
5462 // TODO: Fix pointer type handling
5463 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5464                                          MachineInstr &MI,
5465                                          Intrinsic::ID IID) const {
5466 
5467   MachineIRBuilder &B = Helper.MIRBuilder;
5468   MachineRegisterInfo &MRI = *B.getMRI();
5469 
5470   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5471                       IID == Intrinsic::amdgcn_permlanex16;
5472   bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5473                        IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5474 
5475   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5476                                       Register Src2, LLT VT) -> Register {
5477     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5478     switch (IID) {
5479     case Intrinsic::amdgcn_readfirstlane:
5480     case Intrinsic::amdgcn_permlane64:
5481       return LaneOp.getReg(0);
5482     case Intrinsic::amdgcn_readlane:
5483     case Intrinsic::amdgcn_set_inactive:
5484     case Intrinsic::amdgcn_set_inactive_chain_arg:
5485       return LaneOp.addUse(Src1).getReg(0);
5486     case Intrinsic::amdgcn_writelane:
5487       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5488     case Intrinsic::amdgcn_permlane16:
5489     case Intrinsic::amdgcn_permlanex16: {
5490       Register Src3 = MI.getOperand(5).getReg();
5491       int64_t Src4 = MI.getOperand(6).getImm();
5492       int64_t Src5 = MI.getOperand(7).getImm();
5493       return LaneOp.addUse(Src1)
5494           .addUse(Src2)
5495           .addUse(Src3)
5496           .addImm(Src4)
5497           .addImm(Src5)
5498           .getReg(0);
5499     }
5500     case Intrinsic::amdgcn_mov_dpp8:
5501       return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5502     case Intrinsic::amdgcn_update_dpp:
5503       return LaneOp.addUse(Src1)
5504           .addImm(MI.getOperand(4).getImm())
5505           .addImm(MI.getOperand(5).getImm())
5506           .addImm(MI.getOperand(6).getImm())
5507           .addImm(MI.getOperand(7).getImm())
5508           .getReg(0);
5509     default:
5510       llvm_unreachable("unhandled lane op");
5511     }
5512   };
5513 
5514   Register DstReg = MI.getOperand(0).getReg();
5515   Register Src0 = MI.getOperand(2).getReg();
5516   Register Src1, Src2;
5517   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5518       IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5519     Src1 = MI.getOperand(3).getReg();
5520     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5521       Src2 = MI.getOperand(4).getReg();
5522     }
5523   }
5524 
5525   LLT Ty = MRI.getType(DstReg);
5526   unsigned Size = Ty.getSizeInBits();
5527 
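  // Values wider than the native lane-op width are split into 32-bit pieces
  // (or 64-bit pieces for update_dpp when the subtarget's DPALU supports DPP
  // and the control value is legal for it), the lane op is applied per piece,
  // and the pieces are remerged.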
5528   unsigned SplitSize = 32;
5529   if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5530       ST.hasDPALU_DPP() &&
5531       AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
5532     SplitSize = 64;
5533 
5534   if (Size == SplitSize) {
5535     // Already legal
5536     return true;
5537   }
5538 
5539   if (Size < 32) {
5540     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5541 
5542     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5543       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5544 
5545     if (IID == Intrinsic::amdgcn_writelane)
5546       Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5547 
5548     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5549     B.buildTrunc(DstReg, LaneOpDst);
5550     MI.eraseFromParent();
5551     return true;
5552   }
5553 
5554   if (Size % SplitSize != 0)
5555     return false;
5556 
5557   LLT PartialResTy = LLT::scalar(SplitSize);
5558   if (Ty.isVector()) {
5559     LLT EltTy = Ty.getElementType();
5560     unsigned EltSize = EltTy.getSizeInBits();
5561     if (EltSize == SplitSize) {
5562       PartialResTy = EltTy;
5563     } else if (EltSize == 16 || EltSize == 32) {
5564       unsigned NElem = SplitSize / EltSize;
5565       PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5566     }
5567     // Handle all other cases via S32/S64 pieces;
5568     // Handle all other cases via S32/S64 pieces.
5569 
5570   SmallVector<Register, 4> PartialRes;
5571   unsigned NumParts = Size / SplitSize;
5572   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5573   MachineInstrBuilder Src1Parts, Src2Parts;
5574 
5575   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5576     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5577 
5578   if (IID == Intrinsic::amdgcn_writelane)
5579     Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5580 
5581   for (unsigned i = 0; i < NumParts; ++i) {
5582     Src0 = Src0Parts.getReg(i);
5583 
5584     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5585       Src1 = Src1Parts.getReg(i);
5586 
5587     if (IID == Intrinsic::amdgcn_writelane)
5588       Src2 = Src2Parts.getReg(i);
5589 
5590     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5591   }
5592 
5593   B.buildMergeLikeInstr(DstReg, PartialRes);
5594   MI.eraseFromParent();
5595   return true;
5596 }
5597 
5598 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5599                                             MachineRegisterInfo &MRI,
5600                                             MachineIRBuilder &B) const {
5601   uint64_t Offset =
5602     ST.getTargetLowering()->getImplicitParameterOffset(
5603       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5604   LLT DstTy = MRI.getType(DstReg);
5605   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5606 
5607   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5608   if (!loadInputValue(KernargPtrReg, B,
5609                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5610     return false;
5611 
5612   // FIXME: This should be nuw
5613   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5614   return true;
5615 }
5616 
5617 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5618 /// bits of the pointer and replace them with the stride argument, then
5619 /// merge_values everything together. In the common case of a raw buffer (the
5620 /// stride component is 0), we can just AND off the upper half.
5621 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5622     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5623   Register Result = MI.getOperand(0).getReg();
5624   Register Pointer = MI.getOperand(2).getReg();
5625   Register Stride = MI.getOperand(3).getReg();
5626   Register NumRecords = MI.getOperand(4).getReg();
5627   Register Flags = MI.getOperand(5).getReg();
5628 
5629   LLT S32 = LLT::scalar(32);
5630 
5631   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5632   auto Unmerge = B.buildUnmerge(S32, Pointer);
5633   Register LowHalf = Unmerge.getReg(0);
5634   Register HighHalf = Unmerge.getReg(1);
5635 
5636   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5637   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5638 
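       // If the stride is known to be zero, the masked high half is already the
       // correct descriptor word; otherwise shift the stride into bits [31:16]
       // and OR it in.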
5639   MachineInstrBuilder NewHighHalf = Masked;
5640   std::optional<ValueAndVReg> StrideConst =
5641       getIConstantVRegValWithLookThrough(Stride, MRI);
5642   if (!StrideConst || !StrideConst->Value.isZero()) {
5643     MachineInstrBuilder ShiftedStride;
5644     if (StrideConst) {
5645       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5646       uint32_t ShiftedStrideVal = StrideVal << 16;
5647       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5648     } else {
5649       auto ExtStride = B.buildAnyExt(S32, Stride);
5650       auto ShiftConst = B.buildConstant(S32, 16);
5651       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5652     }
5653     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5654   }
5655   Register NewHighHalfReg = NewHighHalf.getReg(0);
5656   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5657   MI.eraseFromParent();
5658   return true;
5659 }
5660 
5661 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5662                                                  MachineRegisterInfo &MRI,
5663                                                  MachineIRBuilder &B) const {
5664   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5665   if (!MFI->isEntryFunction()) {
5666     return legalizePreloadedArgIntrin(MI, MRI, B,
5667                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5668   }
5669 
5670   Register DstReg = MI.getOperand(0).getReg();
5671   if (!getImplicitArgPtr(DstReg, MRI, B))
5672     return false;
5673 
5674   MI.eraseFromParent();
5675   return true;
5676 }
5677 
5678 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5679                                          MachineRegisterInfo &MRI,
5680                                          MachineIRBuilder &B) const {
5681   Function &F = B.getMF().getFunction();
5682   std::optional<uint32_t> KnownSize =
5683       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5684   if (KnownSize.has_value())
5685     B.buildConstant(DstReg, *KnownSize);
5686   return false;
5687 }
5688 
5689 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5690                                               MachineRegisterInfo &MRI,
5691                                               MachineIRBuilder &B) const {
5692 
5693   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5694   if (!MFI->isEntryFunction()) {
5695     return legalizePreloadedArgIntrin(MI, MRI, B,
5696                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5697   }
5698 
5699   Register DstReg = MI.getOperand(0).getReg();
5700   if (!getLDSKernelId(DstReg, MRI, B))
5701     return false;
5702 
5703   MI.eraseFromParent();
5704   return true;
5705 }
5706 
5707 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5708                                               MachineRegisterInfo &MRI,
5709                                               MachineIRBuilder &B,
5710                                               unsigned AddrSpace) const {
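       // A flat pointer lies in the queried address space iff the high 32 bits
       // of its address equal that segment's aperture base.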
5711   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5712   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5713   Register Hi32 = Unmerge.getReg(1);
5714 
5715   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5716   MI.eraseFromParent();
5717   return true;
5718 }
5719 
5720 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5721 // offset (the offset that is included in bounds checking and swizzling, to be
5722 // split between the instruction's voffset and immoffset fields) and soffset
5723 // (the offset that is excluded from bounds checking and swizzling, to go in
5724 // the instruction's soffset field).  This function takes the first kind of
5725 // offset and figures out how to split it between voffset and immoffset.
5726 std::pair<Register, unsigned>
5727 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5728                                         Register OrigOffset) const {
5729   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5730   Register BaseReg;
5731   unsigned ImmOffset;
5732   const LLT S32 = LLT::scalar(32);
5733   MachineRegisterInfo &MRI = *B.getMRI();
5734 
5735   std::tie(BaseReg, ImmOffset) =
5736       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5737 
5738   // If BaseReg is a pointer, convert it to int.
5739   if (MRI.getType(BaseReg).isPointer())
5740     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5741 
5742   // If the immediate value is too big for the immoffset field, put only bits
5743   // that would normally fit in the immoffset field. The remaining value that
5744   // is copied/added for the voffset field is a large power of 2, and it
5745   // stands more chance of being CSEd with the copy/add for another similar
5746   // load/store.
5747   // However, do not do that rounding down if the remaining (overflow) value
5748   // would be negative, as it appears to be illegal to have a negative offset
5749   // in the vgpr, even if adding the immediate offset makes it positive.
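       // For example, on a subtarget with a 12-bit immoffset field
       // (MaxImm == 4095), an incoming offset of 8212 splits into
       // Overflow == 8192 (added to the voffset) and ImmOffset == 20.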
5750   unsigned Overflow = ImmOffset & ~MaxImm;
5751   ImmOffset -= Overflow;
5752   if ((int32_t)Overflow < 0) {
5753     Overflow += ImmOffset;
5754     ImmOffset = 0;
5755   }
5756 
5757   if (Overflow != 0) {
5758     if (!BaseReg) {
5759       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5760     } else {
5761       auto OverflowVal = B.buildConstant(S32, Overflow);
5762       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5763     }
5764   }
5765 
5766   if (!BaseReg)
5767     BaseReg = B.buildConstant(S32, 0).getReg(0);
5768 
5769   return std::pair(BaseReg, ImmOffset);
5770 }
5771 
5772 /// Handle register layout difference for f16 images for some subtargets.
5773 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5774                                              MachineRegisterInfo &MRI,
5775                                              Register Reg,
5776                                              bool ImageStore) const {
5777   const LLT S16 = LLT::scalar(16);
5778   const LLT S32 = LLT::scalar(32);
5779   LLT StoreVT = MRI.getType(Reg);
5780   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5781 
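       // With the unpacked register layout each 16-bit element must occupy the
       // low half of its own 32-bit register, so any-extend every element to
       // s32 and rebuild the vector.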
5782   if (ST.hasUnpackedD16VMem()) {
5783     auto Unmerge = B.buildUnmerge(S16, Reg);
5784 
5785     SmallVector<Register, 4> WideRegs;
5786     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5787       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5788 
5789     int NumElts = StoreVT.getNumElements();
5790 
5791     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5792         .getReg(0);
5793   }
5794 
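       // Work around the image store D16 bug by padding the packed data with
       // undef so that it covers one dword per element.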
5795   if (ImageStore && ST.hasImageStoreD16Bug()) {
5796     if (StoreVT.getNumElements() == 2) {
5797       SmallVector<Register, 4> PackedRegs;
5798       Reg = B.buildBitcast(S32, Reg).getReg(0);
5799       PackedRegs.push_back(Reg);
5800       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5801       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5802           .getReg(0);
5803     }
5804 
5805     if (StoreVT.getNumElements() == 3) {
5806       SmallVector<Register, 4> PackedRegs;
5807       auto Unmerge = B.buildUnmerge(S16, Reg);
5808       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5809         PackedRegs.push_back(Unmerge.getReg(I));
5810       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5811       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5812       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5813     }
5814 
5815     if (StoreVT.getNumElements() == 4) {
5816       SmallVector<Register, 4> PackedRegs;
5817       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5818       auto Unmerge = B.buildUnmerge(S32, Reg);
5819       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5820         PackedRegs.push_back(Unmerge.getReg(I));
5821       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5822       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5823           .getReg(0);
5824     }
5825 
5826     llvm_unreachable("invalid data type");
5827   }
5828 
5829   if (StoreVT == LLT::fixed_vector(3, S16)) {
5830     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5831               .getReg(0);
5832   }
5833   return Reg;
5834 }
5835 
5836 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
5837                                                  Register VData, LLT MemTy,
5838                                                  bool IsFormat) const {
5839   MachineRegisterInfo *MRI = B.getMRI();
5840   LLT Ty = MRI->getType(VData);
5841 
5842   const LLT S16 = LLT::scalar(16);
5843 
5844   // Fixup buffer resources themselves needing to be v4i32.
5845   if (hasBufferRsrcWorkaround(Ty))
5846     return castBufferRsrcToV4I32(VData, B);
5847 
5848   if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5849     Ty = getBitcastRegisterType(Ty);
5850     VData = B.buildBitcast(Ty, VData).getReg(0);
5851   }
5852   // Fixup illegal register types for i8 and i16 stores.
5853   if (Ty == LLT::scalar(8) || Ty == S16) {
5854     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5855     return AnyExt;
5856   }
5857 
5858   if (Ty.isVector()) {
5859     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5860       if (IsFormat)
5861         return handleD16VData(B, *MRI, VData);
5862     }
5863   }
5864 
5865   return VData;
5866 }
5867 
5868 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5869                                               LegalizerHelper &Helper,
5870                                               bool IsTyped,
5871                                               bool IsFormat) const {
5872   MachineIRBuilder &B = Helper.MIRBuilder;
5873   MachineRegisterInfo &MRI = *B.getMRI();
5874 
5875   Register VData = MI.getOperand(1).getReg();
5876   LLT Ty = MRI.getType(VData);
5877   LLT EltTy = Ty.getScalarType();
5878   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5879   const LLT S32 = LLT::scalar(32);
5880 
5881   MachineMemOperand *MMO = *MI.memoperands_begin();
5882   const int MemSize = MMO->getSize().getValue();
5883   LLT MemTy = MMO->getMemoryType();
5884 
5885   VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5886 
5887   castBufferRsrcArgToV4I32(MI, B, 2);
5888   Register RSrc = MI.getOperand(2).getReg();
5889 
5890   unsigned ImmOffset;
5891 
5892   // The typed intrinsics add an immediate after the registers.
5893   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5894 
5895   // The struct intrinsic variants add one additional operand over raw.
5896   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5897   Register VIndex;
5898   int OpOffset = 0;
5899   if (HasVIndex) {
5900     VIndex = MI.getOperand(3).getReg();
5901     OpOffset = 1;
5902   } else {
5903     VIndex = B.buildConstant(S32, 0).getReg(0);
5904   }
5905 
5906   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5907   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5908 
5909   unsigned Format = 0;
5910   if (IsTyped) {
5911     Format = MI.getOperand(5 + OpOffset).getImm();
5912     ++OpOffset;
5913   }
5914 
5915   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5916 
5917   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5918 
5919   unsigned Opc;
5920   if (IsTyped) {
5921     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5922                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5923   } else if (IsFormat) {
5924     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5925                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5926   } else {
5927     switch (MemSize) {
5928     case 1:
5929       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5930       break;
5931     case 2:
5932       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5933       break;
5934     default:
5935       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5936       break;
5937     }
5938   }
5939 
5940   auto MIB = B.buildInstr(Opc)
5941     .addUse(VData)              // vdata
5942     .addUse(RSrc)               // rsrc
5943     .addUse(VIndex)             // vindex
5944     .addUse(VOffset)            // voffset
5945     .addUse(SOffset)            // soffset
5946     .addImm(ImmOffset);         // offset(imm)
5947 
5948   if (IsTyped)
5949     MIB.addImm(Format);
5950 
5951   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5952      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5953      .addMemOperand(MMO);
5954 
5955   MI.eraseFromParent();
5956   return true;
5957 }
5958 
5959 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5960                             Register VIndex, Register VOffset, Register SOffset,
5961                             unsigned ImmOffset, unsigned Format,
5962                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5963                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5964   auto MIB = B.buildInstr(Opc)
5965                  .addDef(LoadDstReg) // vdata
5966                  .addUse(RSrc)       // rsrc
5967                  .addUse(VIndex)     // vindex
5968                  .addUse(VOffset)    // voffset
5969                  .addUse(SOffset)    // soffset
5970                  .addImm(ImmOffset); // offset(imm)
5971 
5972   if (IsTyped)
5973     MIB.addImm(Format);
5974 
5975   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5976       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5977       .addMemOperand(MMO);
5978 }
5979 
5980 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5981                                              LegalizerHelper &Helper,
5982                                              bool IsFormat,
5983                                              bool IsTyped) const {
5984   MachineIRBuilder &B = Helper.MIRBuilder;
5985   MachineRegisterInfo &MRI = *B.getMRI();
5986   GISelChangeObserver &Observer = Helper.Observer;
5987 
5988   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5989   MachineMemOperand *MMO = *MI.memoperands_begin();
5990   const LLT MemTy = MMO->getMemoryType();
5991   const LLT S32 = LLT::scalar(32);
5992 
5993   Register Dst = MI.getOperand(0).getReg();
5994 
5995   Register StatusDst;
5996   int OpOffset = 0;
5997   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5998   bool IsTFE = MI.getNumExplicitDefs() == 2;
5999   if (IsTFE) {
6000     StatusDst = MI.getOperand(1).getReg();
6001     ++OpOffset;
6002   }
6003 
6004   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6005   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6006 
6007   // The typed intrinsics add an immediate after the registers.
6008   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6009 
6010   // The struct intrinsic variants add one additional operand over raw.
6011   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6012   Register VIndex;
6013   if (HasVIndex) {
6014     VIndex = MI.getOperand(3 + OpOffset).getReg();
6015     ++OpOffset;
6016   } else {
6017     VIndex = B.buildConstant(S32, 0).getReg(0);
6018   }
6019 
6020   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6021   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6022 
6023   unsigned Format = 0;
6024   if (IsTyped) {
6025     Format = MI.getOperand(5 + OpOffset).getImm();
6026     ++OpOffset;
6027   }
6028 
6029   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6030   unsigned ImmOffset;
6031 
6032   LLT Ty = MRI.getType(Dst);
6033   // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6034   // logic doesn't have to handle that case.
6035   if (hasBufferRsrcWorkaround(Ty)) {
6036     Observer.changingInstr(MI);
6037     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6038     Observer.changedInstr(MI);
6039     Dst = MI.getOperand(0).getReg();
6040     B.setInsertPt(B.getMBB(), MI);
6041   }
6042   if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6043     Ty = getBitcastRegisterType(Ty);
6044     Observer.changingInstr(MI);
6045     Helper.bitcastDst(MI, Ty, 0);
6046     Observer.changedInstr(MI);
6047     Dst = MI.getOperand(0).getReg();
6048     B.setInsertPt(B.getMBB(), MI);
6049   }
6050 
6051   LLT EltTy = Ty.getScalarType();
6052   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6053   const bool Unpacked = ST.hasUnpackedD16VMem();
6054 
6055   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6056 
6057   unsigned Opc;
6058 
6059   // TODO: Support TFE for typed and narrow loads.
6060   if (IsTyped) {
6061     if (IsTFE)
6062       return false;
6063     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6064                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6065   } else if (IsFormat) {
6066     if (IsD16) {
6067       if (IsTFE)
6068         return false;
6069       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6070     } else {
6071       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6072                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6073     }
6074   } else {
6075     switch (MemTy.getSizeInBits()) {
6076     case 8:
6077       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6078                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6079       break;
6080     case 16:
6081       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6082                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6083       break;
6084     default:
6085       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6086                   : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6087       break;
6088     }
6089   }
6090 
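       // TFE loads return an extra status dword after the data, so load into a
       // wider temporary of NumValueDWords + 1 dwords and unmerge the trailing
       // status dword into StatusDst.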
6091   if (IsTFE) {
6092     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6093     unsigned NumLoadDWords = NumValueDWords + 1;
6094     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6095     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6096     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6097                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6098     if (MemTy.getSizeInBits() < 32) {
6099       Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6100       B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6101       B.buildTrunc(Dst, ExtDst);
6102     } else if (NumValueDWords == 1) {
6103       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6104     } else {
6105       SmallVector<Register, 5> LoadElts;
6106       for (unsigned I = 0; I != NumValueDWords; ++I)
6107         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6108       LoadElts.push_back(StatusDst);
6109       B.buildUnmerge(LoadElts, LoadDstReg);
6110       LoadElts.truncate(NumValueDWords);
6111       B.buildMergeLikeInstr(Dst, LoadElts);
6112     }
6113   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6114              (IsD16 && !Ty.isVector())) {
6115     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6116     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6117                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6118     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6119     B.buildTrunc(Dst, LoadDstReg);
6120   } else if (Unpacked && IsD16 && Ty.isVector()) {
6121     LLT UnpackedTy = Ty.changeElementSize(32);
6122     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6123     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6124                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6125     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6126     // FIXME: G_TRUNC should work, but legalization currently fails
6127     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6128     SmallVector<Register, 4> Repack;
6129     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6130       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6131     B.buildMergeLikeInstr(Dst, Repack);
6132   } else {
6133     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6134                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6135   }
6136 
6137   MI.eraseFromParent();
6138   return true;
6139 }
6140 
6141 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6142   switch (IntrID) {
6143   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6144   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6145   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6146   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6147     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6148   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6149   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6150   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6151   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6152     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6153   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6154   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6155   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6156   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6157     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6158   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6159   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6160   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6161   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6162     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6163   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6164   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6165   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6166   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6167     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6168   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6169   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6170   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6171   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6172     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6173   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6174   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6175   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6176   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6177     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6178   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6179   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6180   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6181   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6182     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6183   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6184   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6185   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6186   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6187     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6188   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6189   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6190   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6191   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6192     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6193   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6194   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6195   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6196   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6197     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6198   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6199   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6200   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6201   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6202     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6203   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6204   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6205   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6206   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6207     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6208   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6209   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6210   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6211   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6212     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6213   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6214   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6215   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6216   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6217     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6218   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6219   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6220   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6221   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6222     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6223   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6224   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6225     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6226   default:
6227     llvm_unreachable("unhandled atomic opcode");
6228   }
6229 }
6230 
6231 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6232                                                MachineIRBuilder &B,
6233                                                Intrinsic::ID IID) const {
6234   const bool IsCmpSwap =
6235       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6236       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6237       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6238       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6239 
6240   Register Dst = MI.getOperand(0).getReg();
6241   // Since we don't have 128-bit atomics, we don't need to handle the case of
6242   // p8 arguments to the atomic itself.
6243   Register VData = MI.getOperand(2).getReg();
6244 
6245   Register CmpVal;
6246   int OpOffset = 0;
6247 
6248   if (IsCmpSwap) {
6249     CmpVal = MI.getOperand(3).getReg();
6250     ++OpOffset;
6251   }
6252 
6253   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6254   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6255   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6256 
6257   // The struct intrinsic variants add one additional operand over raw.
6258   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6259   Register VIndex;
6260   if (HasVIndex) {
6261     VIndex = MI.getOperand(4 + OpOffset).getReg();
6262     ++OpOffset;
6263   } else {
6264     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6265   }
6266 
6267   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6268   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6269   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6270 
6271   MachineMemOperand *MMO = *MI.memoperands_begin();
6272 
6273   unsigned ImmOffset;
6274   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6275 
6276   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6277       .addDef(Dst)
6278       .addUse(VData); // vdata
6279 
6280   if (IsCmpSwap)
6281     MIB.addReg(CmpVal);
6282 
6283   MIB.addUse(RSrc)               // rsrc
6284      .addUse(VIndex)             // vindex
6285      .addUse(VOffset)            // voffset
6286      .addUse(SOffset)            // soffset
6287      .addImm(ImmOffset)          // offset(imm)
6288      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
6289      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6290      .addMemOperand(MMO);
6291 
6292   MI.eraseFromParent();
6293   return true;
6294 }
6295 
6296 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6297 /// vector with s16 typed elements.
6298 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6299                                       SmallVectorImpl<Register> &PackedAddrs,
6300                                       unsigned ArgOffset,
6301                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
6302                                       bool IsA16, bool IsG16) {
6303   const LLT S16 = LLT::scalar(16);
6304   const LLT V2S16 = LLT::fixed_vector(2, 16);
6305   auto EndIdx = Intr->VAddrEnd;
6306 
6307   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6308     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6309     if (!SrcOp.isReg())
6310       continue; // _L to _LZ may have eliminated this.
6311 
6312     Register AddrReg = SrcOp.getReg();
6313 
6314     if ((I < Intr->GradientStart) ||
6315         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6316         (I >= Intr->CoordStart && !IsA16)) {
6317       if ((I < Intr->GradientStart) && IsA16 &&
6318           (B.getMRI()->getType(AddrReg) == S16)) {
6319         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6320         // Special handling of bias when A16 is on. Bias is of type half but
6321         // occupies full 32-bit.
6322         PackedAddrs.push_back(
6323             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6324                 .getReg(0));
6325       } else {
6326         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6327                "Bias needs to be converted to 16 bit in A16 mode");
6328         // Handle any gradient or coordinate operands that should not be packed
6329         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6330         PackedAddrs.push_back(AddrReg);
6331       }
6332     } else {
6333       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6334       // derivatives dx/dh and dx/dv are packed with undef.
6335       if (((I + 1) >= EndIdx) ||
6336           ((Intr->NumGradients / 2) % 2 == 1 &&
6337            (I == static_cast<unsigned>(Intr->GradientStart +
6338                                        (Intr->NumGradients / 2) - 1) ||
6339             I == static_cast<unsigned>(Intr->GradientStart +
6340                                        Intr->NumGradients - 1))) ||
6341           // Check for _L to _LZ optimization
6342           !MI.getOperand(ArgOffset + I + 1).isReg()) {
6343         PackedAddrs.push_back(
6344             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6345                 .getReg(0));
6346       } else {
6347         PackedAddrs.push_back(
6348             B.buildBuildVector(
6349                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6350                 .getReg(0));
6351         ++I;
6352       }
6353     }
6354   }
6355 }
6356 
6357 /// Convert from separate vaddr components to a single vector address register,
6358 /// and replace the remaining operands with $noreg.
6359 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6360                                      int DimIdx, int NumVAddrs) {
6361   const LLT S32 = LLT::scalar(32);
6362   (void)S32;
6363   SmallVector<Register, 8> AddrRegs;
6364   for (int I = 0; I != NumVAddrs; ++I) {
6365     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6366     if (SrcOp.isReg()) {
6367       AddrRegs.push_back(SrcOp.getReg());
6368       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6369     }
6370   }
6371 
6372   int NumAddrRegs = AddrRegs.size();
6373   if (NumAddrRegs != 1) {
6374     auto VAddr =
6375         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6376     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6377   }
6378 
6379   for (int I = 1; I != NumVAddrs; ++I) {
6380     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6381     if (SrcOp.isReg())
6382       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6383   }
6384 }
6385 
6386 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6387 ///
6388 /// Depending on the subtarget, load/store with 16-bit element data need to be
6389 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6390 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6391 /// registers.
6392 ///
6393 /// We don't want to directly select image instructions just yet, but also want
6394 /// to expose all register repacking to the legalizer/combiners. We also don't
6395 /// want a selected instruction entering RegBankSelect. In order to avoid
6396 /// defining a multitude of intermediate image instructions, directly hack on
6397 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6398 /// padding the now-unnecessary arguments with $noreg.
6399 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6400     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6401     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6402 
6403   const MachineFunction &MF = *MI.getMF();
6404   const unsigned NumDefs = MI.getNumExplicitDefs();
6405   const unsigned ArgOffset = NumDefs + 1;
6406   bool IsTFE = NumDefs == 2;
6407   // We are only processing the operands of d16 image operations on subtargets
6408   // that use the unpacked register layout, or need to repack the TFE result.
6409 
6410   // TODO: Do we need to guard against already legalized intrinsics?
6411   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6412       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6413 
6414   MachineRegisterInfo *MRI = B.getMRI();
6415   const LLT S32 = LLT::scalar(32);
6416   const LLT S16 = LLT::scalar(16);
6417   const LLT V2S16 = LLT::fixed_vector(2, 16);
6418 
6419   unsigned DMask = 0;
6420   Register VData;
6421   LLT Ty;
6422 
6423   if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6424     VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6425     Ty = MRI->getType(VData);
6426   }
6427 
6428   const bool IsAtomicPacked16Bit =
6429       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6430        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6431 
6432   // Check for 16 bit addresses and pack if true.
6433   LLT GradTy =
6434       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6435   LLT AddrTy =
6436       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6437   const bool IsG16 =
6438       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6439   const bool IsA16 = AddrTy == S16;
6440   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6441 
6442   int DMaskLanes = 0;
6443   if (!BaseOpcode->Atomic) {
6444     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6445     if (BaseOpcode->Gather4) {
6446       DMaskLanes = 4;
6447     } else if (DMask != 0) {
6448       DMaskLanes = llvm::popcount(DMask);
6449     } else if (!IsTFE && !BaseOpcode->Store) {
6450       // If dmask is 0, this is a no-op load. This can be eliminated.
6451       B.buildUndef(MI.getOperand(0));
6452       MI.eraseFromParent();
6453       return true;
6454     }
6455   }
6456 
6457   Observer.changingInstr(MI);
6458   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6459 
6460   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6461                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6462   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6463                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6464   unsigned NewOpcode = LoadOpcode;
6465   if (BaseOpcode->Store)
6466     NewOpcode = StoreOpcode;
6467   else if (BaseOpcode->NoReturn)
6468     NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6469 
6470   // Track that we legalized this
6471   MI.setDesc(B.getTII().get(NewOpcode));
6472 
6473   // Expecting to get an error flag since TFE is on and dmask is 0. Force
6474   // dmask to be at least 1, otherwise the instruction will fail.
6475   if (IsTFE && DMask == 0) {
6476     DMask = 0x1;
6477     DMaskLanes = 1;
6478     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6479   }
6480 
6481   if (BaseOpcode->Atomic) {
6482     Register VData0 = MI.getOperand(2).getReg();
6483     LLT Ty = MRI->getType(VData0);
6484 
6485     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6486     if (Ty.isVector() && !IsAtomicPacked16Bit)
6487       return false;
6488 
6489     if (BaseOpcode->AtomicX2) {
6490       Register VData1 = MI.getOperand(3).getReg();
6491       // The two values are packed in one register.
6492       LLT PackedTy = LLT::fixed_vector(2, Ty);
6493       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6494       MI.getOperand(2).setReg(Concat.getReg(0));
6495       MI.getOperand(3).setReg(AMDGPU::NoRegister);
6496     }
6497   }
6498 
6499   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6500 
6501   // Rewrite the addressing register layout before doing anything else.
6502   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6503     // 16 bit gradients are supported, but are tied to the A16 control
6504     // so both gradients and addresses must be 16 bit
6505     return false;
6506   }
6507 
6508   if (IsA16 && !ST.hasA16()) {
6509     // A16 not supported
6510     return false;
6511   }
6512 
6513   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6514   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6515 
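       // With 16-bit addresses and/or gradients, pack the address components
       // two per dword and rewrite the vaddr operands to use the packed
       // registers; leftover operands are cleared to $noreg below.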
6516   if (IsA16 || IsG16) {
6517     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6518     // instructions expect VGPR_32
6519     SmallVector<Register, 4> PackedRegs;
6520 
6521     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6522 
6523     // See also below in the non-a16 branch
6524     const bool UseNSA = ST.hasNSAEncoding() &&
6525                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6526                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6527     const bool UsePartialNSA =
6528         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6529 
6530     if (UsePartialNSA) {
6531       // Pack registers that would go over NSAMaxSize into last VAddr register
6532       LLT PackedAddrTy =
6533           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6534       auto Concat = B.buildConcatVectors(
6535           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6536       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6537       PackedRegs.resize(NSAMaxSize);
6538     } else if (!UseNSA && PackedRegs.size() > 1) {
6539       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6540       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6541       PackedRegs[0] = Concat.getReg(0);
6542       PackedRegs.resize(1);
6543     }
6544 
6545     const unsigned NumPacked = PackedRegs.size();
6546     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6547       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6548       if (!SrcOp.isReg()) {
6549         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6550         continue;
6551       }
6552 
6553       assert(SrcOp.getReg() != AMDGPU::NoRegister);
6554 
6555       if (I - Intr->VAddrStart < NumPacked)
6556         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6557       else
6558         SrcOp.setReg(AMDGPU::NoRegister);
6559     }
6560   } else {
6561     // If the register allocator cannot place the address registers contiguously
6562     // without introducing moves, then using the non-sequential address encoding
6563     // is always preferable, since it saves VALU instructions and is usually a
6564     // wash in terms of code size or even better.
6565     //
6566     // However, we currently have no way of hinting to the register allocator
6567     // that MIMG addresses should be placed contiguously when it is possible to
6568     // do so, so force non-NSA for the common 2-address case as a heuristic.
6569     //
6570     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6571     // allocation when possible.
6572     //
6573     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6574     // set of the remaining addresses.
6575     const bool UseNSA = ST.hasNSAEncoding() &&
6576                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6577                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6578     const bool UsePartialNSA =
6579         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6580 
6581     if (UsePartialNSA) {
6582       convertImageAddrToPacked(B, MI,
6583                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6584                                Intr->NumVAddrs - NSAMaxSize + 1);
6585     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6586       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6587                                Intr->NumVAddrs);
6588     }
6589   }
6590 
6591   int Flags = 0;
6592   if (IsA16)
6593     Flags |= 1;
6594   if (IsG16)
6595     Flags |= 2;
6596   MI.addOperand(MachineOperand::CreateImm(Flags));
6597 
6598   if (BaseOpcode->NoReturn) { // No TFE for stores?
6599     // TODO: Handle dmask trim
6600     if (!Ty.isVector() || !IsD16)
6601       return true;
6602 
6603     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6604     if (RepackedReg != VData) {
6605       MI.getOperand(1).setReg(RepackedReg);
6606     }
6607 
6608     return true;
6609   }
6610 
6611   Register DstReg = MI.getOperand(0).getReg();
6612   const LLT EltTy = Ty.getScalarType();
6613   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6614 
6615   // Confirm that the return type is large enough for the dmask specified
6616   if (NumElts < DMaskLanes)
6617     return false;
6618 
6619   if (NumElts > 4 || DMaskLanes > 4)
6620     return false;
6621 
6622   // Image atomic instructions use DMask to specify how many bits the
6623   // input/output data has: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6624   // DMaskLanes for image atomics defaults to '0'.
6625   // We must be sure that atomic variants (especially packed) will not be
6626   // truncated from v2s16 or v4s16 to s16 type.
6627   //
6628   // ChangeElementCount will be needed for image load where Ty is always scalar.
6629   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6630   const LLT AdjustedTy =
6631       DMaskLanes == 0
6632           ? Ty
6633           : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6634 
6635   // The raw dword-aligned data component of the load. The only legal cases
6636   // where this matters should be when using the packed D16 format, for
6637   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6638   LLT RoundedTy;
6639 
6640   // S32 vector to cover all data, plus TFE result element.
6641   LLT TFETy;
6642 
6643   // Register type to use for each loaded component. Will be S32 or V2S16.
6644   LLT RegTy;
6645 
6646   if (IsD16 && ST.hasUnpackedD16VMem()) {
6647     RoundedTy =
6648         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6649     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6650     RegTy = S32;
6651   } else {
6652     unsigned EltSize = EltTy.getSizeInBits();
6653     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6654     unsigned RoundedSize = 32 * RoundedElts;
6655     RoundedTy = LLT::scalarOrVector(
6656         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6657     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6658     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6659   }
6660 
6661   // The return type does not need adjustment.
6662   // TODO: Should we change s16 case to s32 or <2 x s16>?
6663   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6664     return true;
6665 
6666   Register Dst1Reg;
6667 
6668   // Insert after the instruction.
6669   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6670 
6671   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6672   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6673   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6674   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6675 
6676   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6677 
6678   MI.getOperand(0).setReg(NewResultReg);
6679 
6680   // In the IR, TFE is supposed to be used with a 2 element struct return
6681   // type. The instruction really returns these two values in one contiguous
6682   // register, with one additional dword beyond the loaded data. Rewrite the
6683   // return type to use a single register result.
6684 
6685   if (IsTFE) {
6686     Dst1Reg = MI.getOperand(1).getReg();
6687     if (MRI->getType(Dst1Reg) != S32)
6688       return false;
6689 
6690     // TODO: Make sure the TFE operand bit is set.
6691     MI.removeOperand(1);
6692 
6693     // Handle the easy case that requires no repack instructions.
6694     if (Ty == S32) {
6695       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6696       return true;
6697     }
6698   }
6699 
6700   // Now figure out how to copy the new result register back into the old
6701   // result.
6702   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6703 
6704   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6705 
6706   if (ResultNumRegs == 1) {
6707     assert(!IsTFE);
6708     ResultRegs[0] = NewResultReg;
6709   } else {
6710     // We have to repack into a new vector of some kind.
6711     for (int I = 0; I != NumDataRegs; ++I)
6712       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6713     B.buildUnmerge(ResultRegs, NewResultReg);
6714 
6715     // Drop the final TFE element to get the data part. The TFE result is
6716     // directly written to the right place already.
6717     if (IsTFE)
6718       ResultRegs.resize(NumDataRegs);
6719   }
6720 
6721   // For an s16 scalar result, we form an s32 result with a truncate regardless
6722   // of packed vs. unpacked.
6723   if (IsD16 && !Ty.isVector()) {
6724     B.buildTrunc(DstReg, ResultRegs[0]);
6725     return true;
6726   }
6727 
6728   // Avoid a build/concat_vector of 1 entry.
6729   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6730     B.buildBitcast(DstReg, ResultRegs[0]);
6731     return true;
6732   }
6733 
6734   assert(Ty.isVector());
6735 
6736   if (IsD16) {
6737     // For packed D16 results with TFE enabled, all the data components are
6738     // S32. Cast back to the expected type.
6739     //
6740     // TODO: We don't really need to load s32 elements. We would only need one
6741     // cast for the TFE result if a multiple of v2s16 was used.
6742     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6743       for (Register &Reg : ResultRegs)
6744         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6745     } else if (ST.hasUnpackedD16VMem()) {
6746       for (Register &Reg : ResultRegs)
6747         Reg = B.buildTrunc(S16, Reg).getReg(0);
6748     }
6749   }
6750 
6751   auto padWithUndef = [&](LLT Ty, int NumElts) {
6752     if (NumElts == 0)
6753       return;
6754     Register Undef = B.buildUndef(Ty).getReg(0);
6755     for (int I = 0; I != NumElts; ++I)
6756       ResultRegs.push_back(Undef);
6757   };
6758 
6759   // Pad out any elements eliminated due to the dmask.
6760   LLT ResTy = MRI->getType(ResultRegs[0]);
6761   if (!ResTy.isVector()) {
6762     padWithUndef(ResTy, NumElts - ResultRegs.size());
6763     B.buildBuildVector(DstReg, ResultRegs);
6764     return true;
6765   }
6766 
6767   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6768   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6769 
6770   // Deal with the one annoying legal case.
6771   const LLT V3S16 = LLT::fixed_vector(3, 16);
6772   if (Ty == V3S16) {
6773     if (IsTFE) {
6774       if (ResultRegs.size() == 1) {
6775         NewResultReg = ResultRegs[0];
6776       } else if (ResultRegs.size() == 2) {
6777         LLT V4S16 = LLT::fixed_vector(4, 16);
6778         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6779       } else {
6780         return false;
6781       }
6782     }
6783 
6784     if (MRI->getType(DstReg).getNumElements() <
6785         MRI->getType(NewResultReg).getNumElements()) {
6786       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6787     } else {
6788       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6789     }
6790     return true;
6791   }
6792 
6793   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6794   B.buildConcatVectors(DstReg, ResultRegs);
6795   return true;
6796 }
6797 
6798 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6799                                               MachineInstr &MI) const {
6800   MachineIRBuilder &B = Helper.MIRBuilder;
6801   GISelChangeObserver &Observer = Helper.Observer;
6802 
6803   Register OrigDst = MI.getOperand(0).getReg();
6804   Register Dst;
6805   LLT Ty = B.getMRI()->getType(OrigDst);
6806   unsigned Size = Ty.getSizeInBits();
6807   MachineFunction &MF = B.getMF();
6808   unsigned Opc = 0;
6809   if (Size < 32 && ST.hasScalarSubwordLoads()) {
6810     assert(Size == 8 || Size == 16);
6811     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6812                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6813     // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6814     // destination register.
6815     Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6816   } else {
6817     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6818     Dst = OrigDst;
6819   }
6820 
6821   Observer.changingInstr(MI);
6822 
6823   // Handle needing to s.buffer.load() a p8 value.
6824   if (hasBufferRsrcWorkaround(Ty)) {
6825     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6826     B.setInsertPt(B.getMBB(), MI);
6827   }
6828   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6829     Ty = getBitcastRegisterType(Ty);
6830     Helper.bitcastDst(MI, Ty, 0);
6831     B.setInsertPt(B.getMBB(), MI);
6832   }
6833 
6834   // FIXME: We don't really need this intermediate instruction. The intrinsic
6835   // should be fixed to have a memory operand. Since it's readnone, we're not
6836   // allowed to add one.
6837   MI.setDesc(B.getTII().get(Opc));
6838   MI.removeOperand(1); // Remove intrinsic ID
6839 
6840   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6841   const unsigned MemSize = (Size + 7) / 8;
6842   const Align MemAlign = B.getDataLayout().getABITypeAlign(
6843       getTypeForLLT(Ty, MF.getFunction().getContext()));
6844   MachineMemOperand *MMO = MF.getMachineMemOperand(
6845       MachinePointerInfo(),
6846       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6847           MachineMemOperand::MOInvariant,
6848       MemSize, MemAlign);
6849   MI.addMemOperand(MF, MMO);
6850   if (Dst != OrigDst) {
6851     MI.getOperand(0).setReg(Dst);
6852     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6853     B.buildTrunc(OrigDst, Dst);
6854   }
6855 
6856   // If we don't have 96-bit result scalar loads, widening to 128-bit should
6857   // always be legal. We may need to restore this to a 96-bit result if it turns
6858   // out this needs to be converted to a vector load during RegBankSelect.
6859   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6860     if (Ty.isVector())
6861       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6862     else
6863       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6864   }
6865 
6866   Observer.changedInstr(MI);
6867   return true;
6868 }
6869 
6870 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6871                                                   MachineInstr &MI) const {
6872   MachineIRBuilder &B = Helper.MIRBuilder;
6873   GISelChangeObserver &Observer = Helper.Observer;
6874   Observer.changingInstr(MI);
6875   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6876   MI.removeOperand(0); // Remove intrinsic ID
6877   castBufferRsrcArgToV4I32(MI, B, 0);
6878   Observer.changedInstr(MI);
6879   return true;
6880 }
6881 
6882 // TODO: Move to selection
6883 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6884                                        MachineRegisterInfo &MRI,
6885                                        MachineIRBuilder &B) const {
6886   if (!ST.isTrapHandlerEnabled() ||
6887       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6888     return legalizeTrapEndpgm(MI, MRI, B);
6889 
6890   return ST.supportsGetDoorbellID() ?
6891          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6892 }
6893 
6894 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6895     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6896   const DebugLoc &DL = MI.getDebugLoc();
6897   MachineBasicBlock &BB = B.getMBB();
6898   MachineFunction *MF = BB.getParent();
6899 
6900   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6901     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6902       .addImm(0);
6903     MI.eraseFromParent();
6904     return true;
6905   }
6906 
6907   // We need a block split to make the real endpgm a terminator. We also don't
6908   // want to break phis in successor blocks, so we can't just delete to the
6909   // end of the block.
6910   BB.splitAt(MI, false /*UpdateLiveIns*/);
6911   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6912   MF->push_back(TrapBB);
6913   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6914     .addImm(0);
6915   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6916     .addMBB(TrapBB);
6917 
6918   BB.addSuccessor(TrapBB);
6919   MI.eraseFromParent();
6920   return true;
6921 }
6922 
6923 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6924     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6925   MachineFunction &MF = B.getMF();
6926   const LLT S64 = LLT::scalar(64);
6927 
6928   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6929   // For code object version 5, queue_ptr is passed through implicit kernarg.
6930   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6931       AMDGPU::AMDHSA_COV5) {
6932     AMDGPUTargetLowering::ImplicitParameter Param =
6933         AMDGPUTargetLowering::QUEUE_PTR;
6934     uint64_t Offset =
6935         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6936 
6937     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6938         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6939 
6940     if (!loadInputValue(KernargPtrReg, B,
6941                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6942       return false;
6943 
6944     // TODO: can we be smarter about machine pointer info?
6945     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6946     MachineMemOperand *MMO = MF.getMachineMemOperand(
6947         PtrInfo,
6948         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6949             MachineMemOperand::MOInvariant,
6950         LLT::scalar(64), commonAlignment(Align(64), Offset));
6951 
6952     // Compute the address of the queue_ptr implicit kernel argument.
6953     Register LoadAddr = MRI.createGenericVirtualRegister(
6954         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6955     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6956                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6957     // Load the queue pointer from that address.
6958     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6959     B.buildCopy(SGPR01, Temp);
6960     B.buildInstr(AMDGPU::S_TRAP)
6961         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6962         .addReg(SGPR01, RegState::Implicit);
6963     MI.eraseFromParent();
6964     return true;
6965   }
6966 
6967   // Pass the queue pointer to the trap handler as an input, then insert the trap.
6968   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6969   Register LiveIn =
6970     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6971   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6972     return false;
6973 
6974   B.buildCopy(SGPR01, LiveIn);
6975   B.buildInstr(AMDGPU::S_TRAP)
6976       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6977       .addReg(SGPR01, RegState::Implicit);
6978 
6979   MI.eraseFromParent();
6980   return true;
6981 }
6982 
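     // Lower a trap on HSA targets with doorbell ID support: a bare S_TRAP is
     // enough, except on subtargets where s_trap is a nop under PRIV=1 and the
     // trap has to be simulated instead.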
6983 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6984                                           MachineRegisterInfo &MRI,
6985                                           MachineIRBuilder &B) const {
6986   // We need to simulate the 's_trap 2' instruction on targets that run in
6987   // PRIV=1 (where it is treated as a nop).
6988   if (ST.hasPrivEnabledTrap2NopBug()) {
6989     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6990                                            MI.getDebugLoc());
6991     MI.eraseFromParent();
6992     return true;
6993   }
6994 
6995   B.buildInstr(AMDGPU::S_TRAP)
6996       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6997   MI.eraseFromParent();
6998   return true;
6999 }
7000 
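     // Lower llvm.debugtrap. Without an enabled AMDHSA trap handler this only
     // emits a warning and drops the instruction; otherwise it emits S_TRAP
     // with the debug trap ID.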
7001 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7002                                             MachineRegisterInfo &MRI,
7003                                             MachineIRBuilder &B) const {
7004   // If this is a non-HSA target or the trap handler is disabled, report a
7005   // warning and drop the debugtrap instead of emitting S_TRAP.
7006   if (!ST.isTrapHandlerEnabled() ||
7007       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7008     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
7009                                      "debugtrap handler not supported",
7010                                      MI.getDebugLoc(), DS_Warning);
7011     LLVMContext &Ctx = B.getMF().getFunction().getContext();
7012     Ctx.diagnose(NoTrap);
7013   } else {
7014     // Insert debug-trap instruction
7015     B.buildInstr(AMDGPU::S_TRAP)
7016         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7017   }
7018 
7019   MI.eraseFromParent();
7020   return true;
7021 }
7022 
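     // Lower llvm.amdgcn.image.bvh.intersect.ray to
     // G_AMDGPU_INTRIN_BVH_INTERSECT_RAY, packing the node pointer, ray extent,
     // origin, direction and inverse direction into the address layout expected
     // by the chosen MIMG encoding (NSA registers or one merged vector).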
7023 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
7024                                                MachineIRBuilder &B) const {
7025   MachineRegisterInfo &MRI = *B.getMRI();
7026   const LLT S16 = LLT::scalar(16);
7027   const LLT S32 = LLT::scalar(32);
7028   const LLT V2S16 = LLT::fixed_vector(2, 16);
7029   const LLT V3S32 = LLT::fixed_vector(3, 32);
7030 
7031   Register DstReg = MI.getOperand(0).getReg();
7032   Register NodePtr = MI.getOperand(2).getReg();
7033   Register RayExtent = MI.getOperand(3).getReg();
7034   Register RayOrigin = MI.getOperand(4).getReg();
7035   Register RayDir = MI.getOperand(5).getReg();
7036   Register RayInvDir = MI.getOperand(6).getReg();
7037   Register TDescr = MI.getOperand(7).getReg();
7038 
7039   if (!ST.hasGFX10_AEncoding()) {
7040     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
7041                                         "intrinsic not supported on subtarget",
7042                                         MI.getDebugLoc());
7043     B.getMF().getFunction().getContext().diagnose(BadIntrin);
7044     return false;
7045   }
7046 
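       // Address dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
       // ray_dir/ray_inv_dir (6, or 3 when packed as a16). On GFX11+ NSA these
       // are grouped into 4 (a16) or 5 VGPR operands.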
7047   const bool IsGFX11 = AMDGPU::isGFX11(ST);
7048   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7049   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7050   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7051   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7052   const unsigned NumVDataDwords = 4;
7053   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7054   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7055   const bool UseNSA =
7056       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7057 
7058   const unsigned BaseOpcodes[2][2] = {
7059       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7060       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7061        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7062   int Opcode;
7063   if (UseNSA) {
7064     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7065                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7066                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
7067                                                : AMDGPU::MIMGEncGfx10NSA,
7068                                    NumVDataDwords, NumVAddrDwords);
7069   } else {
7070     assert(!IsGFX12Plus);
7071     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7072                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7073                                            : AMDGPU::MIMGEncGfx10Default,
7074                                    NumVDataDwords, NumVAddrDwords);
7075   }
7076   assert(Opcode != -1);
7077 
7078   SmallVector<Register, 12> Ops;
7079   if (UseNSA && IsGFX11Plus) {
7080     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7081       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7082       auto Merged = B.buildMergeLikeInstr(
7083           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7084       Ops.push_back(Merged.getReg(0));
7085     };
7086 
7087     Ops.push_back(NodePtr);
7088     Ops.push_back(RayExtent);
7089     packLanes(RayOrigin);
7090 
7091     if (IsA16) {
7092       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7093       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7094       auto MergedDir = B.buildMergeLikeInstr(
7095           V3S32,
7096           {B.buildBitcast(
7097                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7098                                                    UnmergeRayDir.getReg(0)}))
7099                .getReg(0),
7100            B.buildBitcast(
7101                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7102                                                    UnmergeRayDir.getReg(1)}))
7103                .getReg(0),
7104            B.buildBitcast(
7105                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7106                                                    UnmergeRayDir.getReg(2)}))
7107                .getReg(0)});
7108       Ops.push_back(MergedDir.getReg(0));
7109     } else {
7110       packLanes(RayDir);
7111       packLanes(RayInvDir);
7112     }
7113   } else {
7114     if (Is64) {
7115       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7116       Ops.push_back(Unmerge.getReg(0));
7117       Ops.push_back(Unmerge.getReg(1));
7118     } else {
7119       Ops.push_back(NodePtr);
7120     }
7121     Ops.push_back(RayExtent);
7122 
7123     auto packLanes = [&Ops, &S32, &B](Register Src) {
7124       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7125       Ops.push_back(Unmerge.getReg(0));
7126       Ops.push_back(Unmerge.getReg(1));
7127       Ops.push_back(Unmerge.getReg(2));
7128     };
7129 
7130     packLanes(RayOrigin);
7131     if (IsA16) {
7132       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7133       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7134       Register R1 = MRI.createGenericVirtualRegister(S32);
7135       Register R2 = MRI.createGenericVirtualRegister(S32);
7136       Register R3 = MRI.createGenericVirtualRegister(S32);
7137       B.buildMergeLikeInstr(R1,
7138                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7139       B.buildMergeLikeInstr(
7140           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7141       B.buildMergeLikeInstr(
7142           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7143       Ops.push_back(R1);
7144       Ops.push_back(R2);
7145       Ops.push_back(R3);
7146     } else {
7147       packLanes(RayDir);
7148       packLanes(RayInvDir);
7149     }
7150   }
7151 
7152   if (!UseNSA) {
7153     // Build a single vector register containing all of the operands prepared so far.
7154     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7155     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7156     Ops.clear();
7157     Ops.push_back(MergedOps);
7158   }
7159 
7160   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7161     .addDef(DstReg)
7162     .addImm(Opcode);
7163 
7164   for (Register R : Ops) {
7165     MIB.addUse(R);
7166   }
7167 
7168   MIB.addUse(TDescr)
7169      .addImm(IsA16 ? 1 : 0)
7170      .cloneMemRefs(MI);
7171 
7172   MI.eraseFromParent();
7173   return true;
7174 }
7175 
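     // Lower G_STACKSAVE to a G_AMDGPU_WAVE_ADDRESS of the stack pointer
     // register chosen by the target lowering.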
7176 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7177                                             MachineIRBuilder &B) const {
7178   const SITargetLowering *TLI = ST.getTargetLowering();
7179   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7180   Register DstReg = MI.getOperand(0).getReg();
7181   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7182   MI.eraseFromParent();
7183   return true;
7184 }
7185 
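     // Lower llvm.amdgcn.wave.id by extracting the wave ID field from TTMP8.
     // Only available with architected SGPRs.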
7186 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7187                                          MachineIRBuilder &B) const {
7188   // With architected SGPRs, the wave ID within the group is in TTMP8[29:25].
7189   if (!ST.hasArchitectedSGPRs())
7190     return false;
7191   LLT S32 = LLT::scalar(32);
7192   Register DstReg = MI.getOperand(0).getReg();
7193   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7194   auto LSB = B.buildConstant(S32, 25);
7195   auto Width = B.buildConstant(S32, 5);
7196   B.buildUbfx(DstReg, TTMP8, LSB, Width);
7197   MI.eraseFromParent();
7198   return true;
7199 }
7200 
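     // s_getreg/s_setreg bitfields used by llvm.get.fpenv and llvm.set.fpenv:
     // the low 23 bits of MODE and the low 5 bits of TRAPSTS.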
7201 static constexpr unsigned FPEnvModeBitField =
7202     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7203 
7204 static constexpr unsigned FPEnvTrapBitField =
7205     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7206 
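     // Lower llvm.get.fpenv: read MODE and TRAPSTS with s_getreg and merge the
     // two 32-bit values into the s64 result.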
7207 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7208                                            MachineRegisterInfo &MRI,
7209                                            MachineIRBuilder &B) const {
7210   Register Src = MI.getOperand(0).getReg();
7211   if (MRI.getType(Src) != S64)
7212     return false;
7213 
7214   auto ModeReg =
7215       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7216                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7217           .addImm(FPEnvModeBitField);
7218   auto TrapReg =
7219       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7220                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7221           .addImm(FPEnvTrapBitField);
7222   B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7223   MI.eraseFromParent();
7224   return true;
7225 }
7226 
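     // Lower llvm.set.fpenv: split the s64 value and write the two halves back
     // to MODE and TRAPSTS with s_setreg.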
7227 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7228                                            MachineRegisterInfo &MRI,
7229                                            MachineIRBuilder &B) const {
7230   Register Src = MI.getOperand(0).getReg();
7231   if (MRI.getType(Src) != S64)
7232     return false;
7233 
7234   auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7235   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7236                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7237       .addImm(static_cast<int16_t>(FPEnvModeBitField))
7238       .addReg(Unmerge.getReg(0));
7239   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7240                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7241       .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7242       .addReg(Unmerge.getReg(1));
7243   MI.eraseFromParent();
7244   return true;
7245 }
7246 
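     // Custom legalization entry point for target intrinsics; dispatches to the
     // per-intrinsic lowerings above, and to legalizeImageIntrinsic for image
     // dimension intrinsics.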
7247 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7248                                             MachineInstr &MI) const {
7249   MachineIRBuilder &B = Helper.MIRBuilder;
7250   MachineRegisterInfo &MRI = *B.getMRI();
7251 
7252   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7253   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7254   switch (IntrID) {
7255   case Intrinsic::amdgcn_if:
7256   case Intrinsic::amdgcn_else: {
7257     MachineInstr *Br = nullptr;
7258     MachineBasicBlock *UncondBrTarget = nullptr;
7259     bool Negated = false;
7260     if (MachineInstr *BrCond =
7261             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7262       const SIRegisterInfo *TRI
7263         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7264 
7265       Register Def = MI.getOperand(1).getReg();
7266       Register Use = MI.getOperand(3).getReg();
7267 
7268       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7269 
7270       if (Negated)
7271         std::swap(CondBrTarget, UncondBrTarget);
7272 
7273       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7274       if (IntrID == Intrinsic::amdgcn_if) {
7275         B.buildInstr(AMDGPU::SI_IF)
7276           .addDef(Def)
7277           .addUse(Use)
7278           .addMBB(UncondBrTarget);
7279       } else {
7280         B.buildInstr(AMDGPU::SI_ELSE)
7281             .addDef(Def)
7282             .addUse(Use)
7283             .addMBB(UncondBrTarget);
7284       }
7285 
7286       if (Br) {
7287         Br->getOperand(0).setMBB(CondBrTarget);
7288       } else {
7289         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7290         // since we're swapping branch targets it needs to be reinserted.
7291         // FIXME: IRTranslator should probably not do this
7292         B.buildBr(*CondBrTarget);
7293       }
7294 
7295       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7296       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7297       MI.eraseFromParent();
7298       BrCond->eraseFromParent();
7299       return true;
7300     }
7301 
7302     return false;
7303   }
7304   case Intrinsic::amdgcn_loop: {
7305     MachineInstr *Br = nullptr;
7306     MachineBasicBlock *UncondBrTarget = nullptr;
7307     bool Negated = false;
7308     if (MachineInstr *BrCond =
7309             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7310       const SIRegisterInfo *TRI
7311         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7312 
7313       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7314       Register Reg = MI.getOperand(2).getReg();
7315 
7316       if (Negated)
7317         std::swap(CondBrTarget, UncondBrTarget);
7318 
7319       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7320       B.buildInstr(AMDGPU::SI_LOOP)
7321         .addUse(Reg)
7322         .addMBB(UncondBrTarget);
7323 
7324       if (Br)
7325         Br->getOperand(0).setMBB(CondBrTarget);
7326       else
7327         B.buildBr(*CondBrTarget);
7328 
7329       MI.eraseFromParent();
7330       BrCond->eraseFromParent();
7331       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7332       return true;
7333     }
7334 
7335     return false;
7336   }
7337   case Intrinsic::amdgcn_addrspacecast_nonnull:
7338     return legalizeAddrSpaceCast(MI, MRI, B);
7339   case Intrinsic::amdgcn_make_buffer_rsrc:
7340     return legalizePointerAsRsrcIntrin(MI, MRI, B);
7341   case Intrinsic::amdgcn_kernarg_segment_ptr:
7342     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7343       // This only makes sense to call in a kernel, so just lower to null.
7344       B.buildConstant(MI.getOperand(0).getReg(), 0);
7345       MI.eraseFromParent();
7346       return true;
7347     }
7348 
7349     return legalizePreloadedArgIntrin(
7350       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7351   case Intrinsic::amdgcn_implicitarg_ptr:
7352     return legalizeImplicitArgPtr(MI, MRI, B);
7353   case Intrinsic::amdgcn_workitem_id_x:
7354     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7355                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7356   case Intrinsic::amdgcn_workitem_id_y:
7357     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7358                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7359   case Intrinsic::amdgcn_workitem_id_z:
7360     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7361                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7362   case Intrinsic::amdgcn_workgroup_id_x:
7363     return legalizePreloadedArgIntrin(MI, MRI, B,
7364                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7365   case Intrinsic::amdgcn_workgroup_id_y:
7366     return legalizePreloadedArgIntrin(MI, MRI, B,
7367                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7368   case Intrinsic::amdgcn_workgroup_id_z:
7369     return legalizePreloadedArgIntrin(MI, MRI, B,
7370                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7371   case Intrinsic::amdgcn_wave_id:
7372     return legalizeWaveID(MI, B);
7373   case Intrinsic::amdgcn_lds_kernel_id:
7374     return legalizePreloadedArgIntrin(MI, MRI, B,
7375                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7376   case Intrinsic::amdgcn_dispatch_ptr:
7377     return legalizePreloadedArgIntrin(MI, MRI, B,
7378                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
7379   case Intrinsic::amdgcn_queue_ptr:
7380     return legalizePreloadedArgIntrin(MI, MRI, B,
7381                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
7382   case Intrinsic::amdgcn_implicit_buffer_ptr:
7383     return legalizePreloadedArgIntrin(
7384       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7385   case Intrinsic::amdgcn_dispatch_id:
7386     return legalizePreloadedArgIntrin(MI, MRI, B,
7387                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
7388   case Intrinsic::r600_read_ngroups_x:
7389     // TODO: Emit an error for HSA
7390     return legalizeKernargMemParameter(MI, B,
7391                                        SI::KernelInputOffsets::NGROUPS_X);
7392   case Intrinsic::r600_read_ngroups_y:
7393     return legalizeKernargMemParameter(MI, B,
7394                                        SI::KernelInputOffsets::NGROUPS_Y);
7395   case Intrinsic::r600_read_ngroups_z:
7396     return legalizeKernargMemParameter(MI, B,
7397                                        SI::KernelInputOffsets::NGROUPS_Z);
7398   case Intrinsic::r600_read_local_size_x:
7399     // TODO: Could insert G_ASSERT_ZEXT from s16
7400     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7401   case Intrinsic::r600_read_local_size_y:
7402     // TODO: Could insert G_ASSERT_ZEXT from s16
7403     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7404   case Intrinsic::r600_read_local_size_z:
7405     // TODO: Could insert G_ASSERT_ZEXT from s16
7406     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7407   case Intrinsic::r600_read_global_size_x:
7408     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7409   case Intrinsic::r600_read_global_size_y:
7410     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7411   case Intrinsic::r600_read_global_size_z:
7412     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7413   case Intrinsic::amdgcn_fdiv_fast:
7414     return legalizeFDIVFastIntrin(MI, MRI, B);
7415   case Intrinsic::amdgcn_is_shared:
7416     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7417   case Intrinsic::amdgcn_is_private:
7418     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7419   case Intrinsic::amdgcn_wavefrontsize: {
7420     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7421     MI.eraseFromParent();
7422     return true;
7423   }
7424   case Intrinsic::amdgcn_s_buffer_load:
7425     return legalizeSBufferLoad(Helper, MI);
7426   case Intrinsic::amdgcn_raw_buffer_store:
7427   case Intrinsic::amdgcn_raw_ptr_buffer_store:
7428   case Intrinsic::amdgcn_struct_buffer_store:
7429   case Intrinsic::amdgcn_struct_ptr_buffer_store:
7430     return legalizeBufferStore(MI, Helper, false, false);
7431   case Intrinsic::amdgcn_raw_buffer_store_format:
7432   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7433   case Intrinsic::amdgcn_struct_buffer_store_format:
7434   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7435     return legalizeBufferStore(MI, Helper, false, true);
7436   case Intrinsic::amdgcn_raw_tbuffer_store:
7437   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7438   case Intrinsic::amdgcn_struct_tbuffer_store:
7439   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7440     return legalizeBufferStore(MI, Helper, true, true);
7441   case Intrinsic::amdgcn_raw_buffer_load:
7442   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7443   case Intrinsic::amdgcn_raw_atomic_buffer_load:
7444   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7445   case Intrinsic::amdgcn_struct_buffer_load:
7446   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7447   case Intrinsic::amdgcn_struct_atomic_buffer_load:
7448   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7449     return legalizeBufferLoad(MI, Helper, false, false);
7450   case Intrinsic::amdgcn_raw_buffer_load_format:
7451   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7452   case Intrinsic::amdgcn_struct_buffer_load_format:
7453   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7454     return legalizeBufferLoad(MI, Helper, true, false);
7455   case Intrinsic::amdgcn_raw_tbuffer_load:
7456   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7457   case Intrinsic::amdgcn_struct_tbuffer_load:
7458   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7459     return legalizeBufferLoad(MI, Helper, true, true);
7460   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7461   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7462   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7463   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7464   case Intrinsic::amdgcn_raw_buffer_atomic_add:
7465   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7466   case Intrinsic::amdgcn_struct_buffer_atomic_add:
7467   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7468   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7469   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7470   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7471   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7472   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7473   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7474   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7475   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7476   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7477   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7478   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7479   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7480   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7481   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7482   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7483   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7484   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7485   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7486   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7487   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7488   case Intrinsic::amdgcn_raw_buffer_atomic_and:
7489   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7490   case Intrinsic::amdgcn_struct_buffer_atomic_and:
7491   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7492   case Intrinsic::amdgcn_raw_buffer_atomic_or:
7493   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7494   case Intrinsic::amdgcn_struct_buffer_atomic_or:
7495   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7496   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7497   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7498   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7499   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7500   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7501   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7502   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7503   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7504   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7505   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7506   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7507   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7508   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7509   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7510   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7511   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7512   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7513   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7514   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7515   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7516   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7517   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7518   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7519   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7520   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7521   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7522   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7523   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7524     return legalizeBufferAtomic(MI, B, IntrID);
7525   case Intrinsic::amdgcn_rsq_clamp:
7526     return legalizeRsqClampIntrinsic(MI, MRI, B);
7527   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7528     return legalizeBVHIntrinsic(MI, B);
7529   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7530   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7531   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7532   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7533   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7534   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7535   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7536   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7537     Register Index = MI.getOperand(5).getReg();
7538     LLT S32 = LLT::scalar(32);
7539     if (MRI.getType(Index) != S32)
7540       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7541     return true;
7542   }
7543   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7544   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7545   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7546     Register Index = MI.getOperand(7).getReg();
7547     LLT S32 = LLT::scalar(32);
7548     if (MRI.getType(Index) != S32)
7549       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7550     return true;
7551   }
7552   case Intrinsic::amdgcn_fmed3: {
7553     GISelChangeObserver &Observer = Helper.Observer;
7554 
7555     // FIXME: This is a workaround for the inability of tablegen match
7556     // combiners to match intrinsics in patterns.
7557     Observer.changingInstr(MI);
7558     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7559     MI.removeOperand(1);
7560     Observer.changedInstr(MI);
7561     return true;
7562   }
7563   case Intrinsic::amdgcn_readlane:
7564   case Intrinsic::amdgcn_writelane:
7565   case Intrinsic::amdgcn_readfirstlane:
7566   case Intrinsic::amdgcn_permlane16:
7567   case Intrinsic::amdgcn_permlanex16:
7568   case Intrinsic::amdgcn_permlane64:
7569   case Intrinsic::amdgcn_set_inactive:
7570   case Intrinsic::amdgcn_set_inactive_chain_arg:
7571   case Intrinsic::amdgcn_mov_dpp8:
7572   case Intrinsic::amdgcn_update_dpp:
7573     return legalizeLaneOp(Helper, MI, IntrID);
7574   case Intrinsic::amdgcn_s_buffer_prefetch_data:
7575     return legalizeSBufferPrefetch(Helper, MI);
7576   default: {
7577     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7578             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7579       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7580     return true;
7581   }
7582   }
7583 
7584   return true;
7585 }
7586