1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than trying to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements up to the next power of two
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
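// For example, getPow2VectorType turns <3 x s16> into <4 x s16> and
// getPow2ScalarType turns s48 into s64; power-of-two sized inputs come back
// unchanged.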
62 
63 /// \returns true if this is an odd sized vector which should be widened by
64 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
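// For example, <3 x s16> and <5 x s16> satisfy this predicate, while <4 x s16>
// (even element count), <3 x s32> (already a multiple of 32 bits) and <3 x s1>
// (s1 elements) do not.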
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx,
100                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(
112         TypeIdx,
113         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
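// For example, fewerEltsToSize64Vector narrows <4 x s32> (128 bits) to
// <2 x s32> and <8 x s16> (128 bits) to <4 x s16>, i.e. into 64-bit pieces.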
116 
117 // Increase the number of vector elements so the total size reaches the next
118 // multiple of 32 bits.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
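// For example, moreEltsToNext32Bit pads <3 x s8> (24 bits) out to <4 x s8>
// (32 bits) and <5 x s16> (80 bits) out to <6 x s16> (96 bits).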
134 
135 static LLT getBitcastRegisterType(const LLT Ty) {
136   const unsigned Size = Ty.getSizeInBits();
137 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
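  // <6 x s16> -> <3 x s32>
  // <8 x s16> -> <4 x s32>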
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
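// For example, with these limits a 512-bit (dwordx16) load from the global or
// constant address space is kept whole, while a 512-bit global store exceeds
// the 128-bit store limit and is broken down by the rules below.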
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < RegSize)
292     MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
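// For example, an extending G_LOAD of s32 from an 8-bit or 16-bit global
// memory access passes this check, a 96-bit access is only accepted on
// subtargets with dwordx3 load/stores, and an under-aligned access is only
// accepted if allowsMisalignedMemoryAccessesImpl allows it.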
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will work around this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
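// For example, with EnableNewLegality off, <8 x s16> (16-bit elements) and
// <16 x s8> take the bitcast workaround, while s64, <4 x s32> and <2 x s64>
// do not.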
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this is the case where the memory access
381 /// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
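// For example, a 96-bit load with 128-bit alignment may be widened to a
// 128-bit load on subtargets without dwordx3 load/stores, provided the target
// reports the resulting access as fast.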
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB})
534       .legalFor({S32, S16, V2S16})
535       .clampMaxNumElementsStrict(0, S16, 2)
536       .scalarize(0)
537       .minScalar(0, S16)
538       .widenScalarToNextMultipleOf(0, 32)
539       .maxScalar(0, S32);
540 
541     getActionDefinitionsBuilder(G_MUL)
542       .legalFor({S32, S16, V2S16})
543       .clampMaxNumElementsStrict(0, S16, 2)
544       .scalarize(0)
545       .minScalar(0, S16)
546       .widenScalarToNextMultipleOf(0, 32)
547       .custom();
548     assert(ST.hasMad64_32());
549 
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551       .legalFor({S32, S16, V2S16}) // Clamp modifier
552       .minScalarOrElt(0, S16)
553       .clampMaxNumElementsStrict(0, S16, 2)
554       .scalarize(0)
555       .widenScalarToNextPow2(0, 32)
556       .lower();
557   } else if (ST.has16BitInsts()) {
558     getActionDefinitionsBuilder({G_ADD, G_SUB})
559       .legalFor({S32, S16})
560       .minScalar(0, S16)
561       .widenScalarToNextMultipleOf(0, 32)
562       .maxScalar(0, S32)
563       .scalarize(0);
564 
565     getActionDefinitionsBuilder(G_MUL)
566       .legalFor({S32, S16})
567       .scalarize(0)
568       .minScalar(0, S16)
569       .widenScalarToNextMultipleOf(0, 32)
570       .custom();
571     assert(ST.hasMad64_32());
572 
573     // Technically the saturating operations require clamp bit support, but this
574     // was introduced at the same time as 16-bit operations.
575     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576       .legalFor({S32, S16}) // Clamp modifier
577       .minScalar(0, S16)
578       .scalarize(0)
579       .widenScalarToNextPow2(0, 16)
580       .lower();
581 
582     // We're just lowering this, but it helps get a better result to try to
583     // coerce to the desired type first.
584     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585       .minScalar(0, S16)
586       .scalarize(0)
587       .lower();
588   } else {
589     getActionDefinitionsBuilder({G_ADD, G_SUB})
590       .legalFor({S32})
591       .widenScalarToNextMultipleOf(0, 32)
592       .clampScalar(0, S32, S32)
593       .scalarize(0);
594 
595     auto &Mul = getActionDefinitionsBuilder(G_MUL)
596       .legalFor({S32})
597       .scalarize(0)
598       .minScalar(0, S32)
599       .widenScalarToNextMultipleOf(0, 32);
600 
601     if (ST.hasMad64_32())
602       Mul.custom();
603     else
604       Mul.maxScalar(0, S32);
605 
606     if (ST.hasIntClamp()) {
607       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608         .legalFor({S32}) // Clamp modifier.
609         .scalarize(0)
610         .minScalarOrElt(0, S32)
611         .lower();
612     } else {
613       // Clamp bit support was added in VI, along with 16-bit operations.
614       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615         .minScalar(0, S32)
616         .scalarize(0)
617         .lower();
618     }
619 
620     // FIXME: DAG expansion gets better results. The widening uses the smaller
621     // range values and goes for the min/max lowering directly.
622     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623       .minScalar(0, S32)
624       .scalarize(0)
625       .lower();
626   }
627 
628   getActionDefinitionsBuilder(
629       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630       .customFor({S32, S64})
631       .clampScalar(0, S32, S64)
632       .widenScalarToNextPow2(0, 32)
633       .scalarize(0);
634 
635   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636                    .legalFor({S32})
637                    .maxScalar(0, S32);
638 
639   if (ST.hasVOP3PInsts()) {
640     Mulh
641       .clampMaxNumElements(0, S8, 2)
642       .lowerFor({V2S8});
643   }
644 
645   Mulh
646     .scalarize(0)
647     .lower();
648 
649   // Report legal for any types we can handle anywhere. For the cases only legal
650   // on the SALU, RegBankSelect will be able to re-legalize.
651   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653     .clampScalar(0, S32, S64)
654     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656     .widenScalarToNextPow2(0)
657     .scalarize(0);
658 
659   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661     .legalFor({{S32, S1}, {S32, S32}})
662     .minScalar(0, S32)
663     .scalarize(0)
664     .lower();
665 
666   getActionDefinitionsBuilder(G_BITCAST)
667     // Don't worry about the size constraint.
668     .legalIf(all(isRegisterType(0), isRegisterType(1)))
669     .lower();
670 
671 
672   getActionDefinitionsBuilder(G_CONSTANT)
673     .legalFor({S1, S32, S64, S16, GlobalPtr,
674                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675     .legalIf(isPointer(0))
676     .clampScalar(0, S32, S64)
677     .widenScalarToNextPow2(0);
678 
679   getActionDefinitionsBuilder(G_FCONSTANT)
680     .legalFor({S32, S64, S16})
681     .clampScalar(0, S16, S64);
682 
683   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684       .legalIf(isRegisterType(0))
685       // s1 and s16 are special cases because they have legal operations on
686       // them, but don't really occupy registers in the normal way.
687       .legalFor({S1, S16})
688       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689       .clampScalarOrElt(0, S32, MaxScalar)
690       .widenScalarToNextPow2(0, 32)
691       .clampMaxNumElements(0, S32, 16);
692 
693   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694 
695   // If the amount is divergent, we have to do a wave reduction to get the
696   // maximum value, so this is expanded during RegBankSelect.
697   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698     .legalFor({{PrivatePtr, S32}});
699 
700   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701     .customIf(typeIsNot(0, PrivatePtr));
702 
703   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704 
705   auto &FPOpActions = getActionDefinitionsBuilder(
706     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707     .legalFor({S32, S64});
708   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709     .customFor({S32, S64});
710   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711     .customFor({S32, S64});
712 
713   if (ST.has16BitInsts()) {
714     if (ST.hasVOP3PInsts())
715       FPOpActions.legalFor({S16, V2S16});
716     else
717       FPOpActions.legalFor({S16});
718 
719     TrigActions.customFor({S16});
720     FDIVActions.customFor({S16});
721   }
722 
723   auto &MinNumMaxNum = getActionDefinitionsBuilder({
724       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725 
726   if (ST.hasVOP3PInsts()) {
727     MinNumMaxNum.customFor(FPTypesPK16)
728       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729       .clampMaxNumElements(0, S16, 2)
730       .clampScalar(0, S16, S64)
731       .scalarize(0);
732   } else if (ST.has16BitInsts()) {
733     MinNumMaxNum.customFor(FPTypes16)
734       .clampScalar(0, S16, S64)
735       .scalarize(0);
736   } else {
737     MinNumMaxNum.customFor(FPTypesBase)
738       .clampScalar(0, S32, S64)
739       .scalarize(0);
740   }
741 
742   if (ST.hasVOP3PInsts())
743     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744 
745   FPOpActions
746     .scalarize(0)
747     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748 
749   TrigActions
750     .scalarize(0)
751     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752 
753   FDIVActions
754     .scalarize(0)
755     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756 
757   getActionDefinitionsBuilder({G_FNEG, G_FABS})
758     .legalFor(FPTypesPK16)
759     .clampMaxNumElementsStrict(0, S16, 2)
760     .scalarize(0)
761     .clampScalar(0, S16, S64);
762 
763   if (ST.has16BitInsts()) {
764     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765       .legalFor({S32, S64, S16})
766       .scalarize(0)
767       .clampScalar(0, S16, S64);
768   } else {
769     getActionDefinitionsBuilder(G_FSQRT)
770       .legalFor({S32, S64})
771       .scalarize(0)
772       .clampScalar(0, S32, S64);
773 
774     if (ST.hasFractBug()) {
775       getActionDefinitionsBuilder(G_FFLOOR)
776         .customFor({S64})
777         .legalFor({S32, S64})
778         .scalarize(0)
779         .clampScalar(0, S32, S64);
780     } else {
781       getActionDefinitionsBuilder(G_FFLOOR)
782         .legalFor({S32, S64})
783         .scalarize(0)
784         .clampScalar(0, S32, S64);
785     }
786   }
787 
788   getActionDefinitionsBuilder(G_FPTRUNC)
789     .legalFor({{S32, S64}, {S16, S32}})
790     .scalarize(0)
791     .lower();
792 
793   getActionDefinitionsBuilder(G_FPEXT)
794     .legalFor({{S64, S32}, {S32, S16}})
795     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796     .scalarize(0);
797 
798   auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
799   if (ST.has16BitInsts()) {
800     FSubActions
801       // Use actual fsub instruction
802       .legalFor({S32, S16})
803       // Must use fadd + fneg
804       .lowerFor({S64, V2S16});
805   } else {
806     FSubActions
807       // Use actual fsub instruction
808       .legalFor({S32})
809       // Must use fadd + fneg
810       .lowerFor({S64, S16, V2S16});
811   }
812 
813   FSubActions
814     .scalarize(0)
815     .clampScalar(0, S32, S64);
816 
817   // Whether this is legal depends on the floating point mode for the function.
818   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
819   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820     FMad.customFor({S32, S16});
821   else if (ST.hasMadMacF32Insts())
822     FMad.customFor({S32});
823   else if (ST.hasMadF16())
824     FMad.customFor({S16});
825   FMad.scalarize(0)
826       .lower();
827 
828   auto &FRem = getActionDefinitionsBuilder(G_FREM);
829   if (ST.has16BitInsts()) {
830     FRem.customFor({S16, S32, S64});
831   } else {
832     FRem.minScalar(0, S32)
833         .customFor({S32, S64});
834   }
835   FRem.scalarize(0);
836 
837   // TODO: Do we need to clamp maximum bitwidth?
838   getActionDefinitionsBuilder(G_TRUNC)
839     .legalIf(isScalar(0))
840     .legalFor({{V2S16, V2S32}})
841     .clampMaxNumElements(0, S16, 2)
842     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
843     // situations (like an invalid implicit use), we don't want to loop forever
844     // in the legalizer.
845     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
846     .alwaysLegal();
847 
848   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
849     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
850                {S32, S1}, {S64, S1}, {S16, S1}})
851     .scalarize(0)
852     .clampScalar(0, S32, S64)
853     .widenScalarToNextPow2(1, 32);
854 
855   // TODO: Split s1->s64 during regbankselect for VALU.
856   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858                     .lowerIf(typeIs(1, S1))
859                     .customFor({{S32, S64}, {S64, S64}});
860   if (ST.has16BitInsts())
861     IToFP.legalFor({{S16, S16}});
862   IToFP.clampScalar(1, S32, S64)
863        .minScalar(0, S32)
864        .scalarize(0)
865        .widenScalarToNextPow2(1);
866 
867   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
868     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869     .customFor({{S64, S32}, {S64, S64}})
870     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
871   if (ST.has16BitInsts())
872     FPToI.legalFor({{S16, S16}});
873   else
874     FPToI.minScalar(1, S32);
875 
876   FPToI.minScalar(0, S32)
877        .widenScalarToNextPow2(0, 32)
878        .scalarize(0)
879        .lower();
880 
881   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
882       .customFor({S16, S32})
883       .scalarize(0)
884       .lower();
885 
886   // Lower roundeven into G_FRINT
887   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888     .scalarize(0)
889     .lower();
890 
891   if (ST.has16BitInsts()) {
892     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893       .legalFor({S16, S32, S64})
894       .clampScalar(0, S16, S64)
895       .scalarize(0);
896   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
897     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
898       .legalFor({S32, S64})
899       .clampScalar(0, S32, S64)
900       .scalarize(0);
901   } else {
902     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
903       .legalFor({S32})
904       .customFor({S64})
905       .clampScalar(0, S32, S64)
906       .scalarize(0);
907   }
908 
909   getActionDefinitionsBuilder(G_PTR_ADD)
910     .legalIf(all(isPointer(0), sameSize(0, 1)))
911     .scalarize(0)
912     .scalarSameSizeAs(1, 0);
913 
914   getActionDefinitionsBuilder(G_PTRMASK)
915     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916     .scalarSameSizeAs(1, 0)
917     .scalarize(0);
918 
919   auto &CmpBuilder =
920     getActionDefinitionsBuilder(G_ICMP)
921     // The compare output type differs based on the register bank of the output,
922     // so make both s1 and s32 legal.
923     //
924     // Scalar compares producing output in scc will be promoted to s32, as that
925     // is the allocatable register type that will be needed for the copy from
926     // scc. This will be promoted during RegBankSelect, and we assume something
927     // before that won't try to use s32 result types.
928     //
929     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930     // bank.
931     .legalForCartesianProduct(
932       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933     .legalForCartesianProduct(
934       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
935   if (ST.has16BitInsts()) {
936     CmpBuilder.legalFor({{S1, S16}});
937   }
938 
939   CmpBuilder
940     .widenScalarToNextPow2(1)
941     .clampScalar(1, S32, S64)
942     .scalarize(0)
943     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
944 
945   getActionDefinitionsBuilder(G_FCMP)
946     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
947     .widenScalarToNextPow2(1)
948     .clampScalar(1, S32, S64)
949     .scalarize(0);
950 
951   // FIXME: fpow has a selection pattern that should move to custom lowering.
952   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
953   if (ST.has16BitInsts())
954     Exp2Ops.legalFor({S32, S16});
955   else
956     Exp2Ops.legalFor({S32});
957   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
958   Exp2Ops.scalarize(0);
959 
960   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961   if (ST.has16BitInsts())
962     ExpOps.customFor({{S32}, {S16}});
963   else
964     ExpOps.customFor({S32});
965   ExpOps.clampScalar(0, MinScalarFPTy, S32)
966         .scalarize(0);
967 
968   getActionDefinitionsBuilder(G_FPOWI)
969     .clampScalar(0, MinScalarFPTy, S32)
970     .lower();
971 
972   // The 64-bit versions produce 32-bit results, but only on the SALU.
973   getActionDefinitionsBuilder(G_CTPOP)
974     .legalFor({{S32, S32}, {S32, S64}})
975     .clampScalar(0, S32, S32)
976     .widenScalarToNextPow2(1, 32)
977     .clampScalar(1, S32, S64)
978     .scalarize(0)
979     .widenScalarToNextPow2(0, 32);
980 
981 
982   // The hardware instructions return a different result on 0 than the generic
983   // instructions expect. The hardware produces -1, but these produce the
984   // bitwidth.
985   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
986     .scalarize(0)
987     .clampScalar(0, S32, S32)
988     .clampScalar(1, S32, S64)
989     .widenScalarToNextPow2(0, 32)
990     .widenScalarToNextPow2(1, 32)
991     .custom();
992 
993   // The 64-bit versions produce 32-bit results, but only on the SALU.
994   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995     .legalFor({{S32, S32}, {S32, S64}})
996     .clampScalar(0, S32, S32)
997     .clampScalar(1, S32, S64)
998     .scalarize(0)
999     .widenScalarToNextPow2(0, 32)
1000     .widenScalarToNextPow2(1, 32);
1001 
1002   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003   // RegBankSelect.
1004   getActionDefinitionsBuilder(G_BITREVERSE)
1005     .legalFor({S32, S64})
1006     .clampScalar(0, S32, S64)
1007     .scalarize(0)
1008     .widenScalarToNextPow2(0);
1009 
1010   if (ST.has16BitInsts()) {
1011     getActionDefinitionsBuilder(G_BSWAP)
1012       .legalFor({S16, S32, V2S16})
1013       .clampMaxNumElementsStrict(0, S16, 2)
1014       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1015       // narrowScalar limitation.
1016       .widenScalarToNextPow2(0)
1017       .clampScalar(0, S16, S32)
1018       .scalarize(0);
1019 
1020     if (ST.hasVOP3PInsts()) {
1021       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022         .legalFor({S32, S16, V2S16})
1023         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1024         .clampMaxNumElements(0, S16, 2)
1025         .minScalar(0, S16)
1026         .widenScalarToNextPow2(0)
1027         .scalarize(0)
1028         .lower();
1029     } else {
1030       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031         .legalFor({S32, S16})
1032         .widenScalarToNextPow2(0)
1033         .minScalar(0, S16)
1034         .scalarize(0)
1035         .lower();
1036     }
1037   } else {
1038     // TODO: Should have same legality without v_perm_b32
1039     getActionDefinitionsBuilder(G_BSWAP)
1040       .legalFor({S32})
1041       .lowerIf(scalarNarrowerThan(0, 32))
1042       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1043       // narrowScalar limitation.
1044       .widenScalarToNextPow2(0)
1045       .maxScalar(0, S32)
1046       .scalarize(0)
1047       .lower();
1048 
1049     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1050       .legalFor({S32})
1051       .minScalar(0, S32)
1052       .widenScalarToNextPow2(0)
1053       .scalarize(0)
1054       .lower();
1055   }
1056 
1057   getActionDefinitionsBuilder(G_INTTOPTR)
1058     // List the common cases
1059     .legalForCartesianProduct(AddrSpaces64, {S64})
1060     .legalForCartesianProduct(AddrSpaces32, {S32})
1061     .scalarize(0)
1062     // Accept any address space as long as the size matches
1063     .legalIf(sameSize(0, 1))
1064     .widenScalarIf(smallerThan(1, 0),
1065       [](const LegalityQuery &Query) {
1066         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1067       })
1068     .narrowScalarIf(largerThan(1, 0),
1069       [](const LegalityQuery &Query) {
1070         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1071       });
1072 
1073   getActionDefinitionsBuilder(G_PTRTOINT)
1074     // List the common cases
1075     .legalForCartesianProduct(AddrSpaces64, {S64})
1076     .legalForCartesianProduct(AddrSpaces32, {S32})
1077     .scalarize(0)
1078     // Accept any address space as long as the size matches
1079     .legalIf(sameSize(0, 1))
1080     .widenScalarIf(smallerThan(0, 1),
1081       [](const LegalityQuery &Query) {
1082         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1083       })
1084     .narrowScalarIf(
1085       largerThan(0, 1),
1086       [](const LegalityQuery &Query) {
1087         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1088       });
1089 
1090   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1091     .scalarize(0)
1092     .custom();
1093 
1094   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1095                                     bool IsLoad) -> bool {
1096     const LLT DstTy = Query.Types[0];
1097 
1098     // Split vector extloads.
1099     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100 
1101     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1102       return true;
1103 
1104     const LLT PtrTy = Query.Types[1];
1105     unsigned AS = PtrTy.getAddressSpace();
1106     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1107       return true;
1108 
1109     // Catch weird sized loads that don't evenly divide into the access sizes
1110     // TODO: May be able to widen depending on alignment etc.
1111     unsigned NumRegs = (MemSize + 31) / 32;
1112     if (NumRegs == 3) {
1113       if (!ST.hasDwordx3LoadStores())
1114         return true;
1115     } else {
1116       // If the alignment allows, these should have been widened.
1117       if (!isPowerOf2_32(NumRegs))
1118         return true;
1119     }
1120 
1121     return false;
1122   };
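  // For example, needToSplitMemOp fires for a 3-dword (96-bit) access without
  // dwordx3 load/stores, for a 5-dword (160-bit) access (not a power-of-two
  // number of registers), and for anything wider than maxSizeForAddrSpace,
  // while a 128-bit global load is left alone.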
1123 
1124   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1125   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1126   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1127 
1128   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1129   // LDS
1130   // TODO: Unsupported flat for SI.
1131 
1132   for (unsigned Op : {G_LOAD, G_STORE}) {
1133     const bool IsStore = Op == G_STORE;
1134 
1135     auto &Actions = getActionDefinitionsBuilder(Op);
1136     // Explicitly list some common cases.
1137     // TODO: Does this help compile time at all?
1138     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1139                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1140                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1141                                       {S64, GlobalPtr, S64, GlobalAlign32},
1142                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1143                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1144                                       {S32, GlobalPtr, S8, GlobalAlign8},
1145                                       {S32, GlobalPtr, S16, GlobalAlign16},
1146 
1147                                       {S32, LocalPtr, S32, 32},
1148                                       {S64, LocalPtr, S64, 32},
1149                                       {V2S32, LocalPtr, V2S32, 32},
1150                                       {S32, LocalPtr, S8, 8},
1151                                       {S32, LocalPtr, S16, 16},
1152                                       {V2S16, LocalPtr, S32, 32},
1153 
1154                                       {S32, PrivatePtr, S32, 32},
1155                                       {S32, PrivatePtr, S8, 8},
1156                                       {S32, PrivatePtr, S16, 16},
1157                                       {V2S16, PrivatePtr, S32, 32},
1158 
1159                                       {S32, ConstantPtr, S32, GlobalAlign32},
1160                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1161                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1162                                       {S64, ConstantPtr, S64, GlobalAlign32},
1163                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1164     Actions.legalIf(
1165       [=](const LegalityQuery &Query) -> bool {
1166         return isLoadStoreLegal(ST, Query);
1167       });
1168 
1169     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1170     // 64-bits.
1171     //
1172     // TODO: Should generalize bitcast action into coerce, which will also cover
1173     // inserting addrspacecasts.
1174     Actions.customIf(typeIs(1, Constant32Ptr));
1175 
1176     // Turn any illegal element vectors into something easier to deal
1177     // with. These will ultimately produce 32-bit scalar shifts to extract the
1178     // parts anyway.
1179     //
1180     // For odd 16-bit element vectors, prefer to split those into pieces with
1181     // 16-bit vector parts.
1182     Actions.bitcastIf(
1183       [=](const LegalityQuery &Query) -> bool {
1184         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1185                                           Query.MMODescrs[0].MemoryTy);
1186       }, bitcastToRegisterType(0));
1187 
1188     if (!IsStore) {
1189       // Widen suitably aligned loads by loading extra bytes. The standard
1190       // legalization actions can't properly express widening memory operands.
1191       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1192         return shouldWidenLoad(ST, Query, G_LOAD);
1193       });
1194     }
1195 
1196     // FIXME: load/store narrowing should be moved to lower action
1197     Actions
1198         .narrowScalarIf(
1199             [=](const LegalityQuery &Query) -> bool {
1200               return !Query.Types[0].isVector() &&
1201                      needToSplitMemOp(Query, Op == G_LOAD);
1202             },
1203             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1204               const LLT DstTy = Query.Types[0];
1205               const LLT PtrTy = Query.Types[1];
1206 
1207               const unsigned DstSize = DstTy.getSizeInBits();
1208               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1209 
1210               // Split extloads.
1211               if (DstSize > MemSize)
1212                 return std::make_pair(0, LLT::scalar(MemSize));
1213 
1214               unsigned MaxSize = maxSizeForAddrSpace(ST,
1215                                                      PtrTy.getAddressSpace(),
1216                                                      Op == G_LOAD);
1217               if (MemSize > MaxSize)
1218                 return std::make_pair(0, LLT::scalar(MaxSize));
1219 
1220               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1221               return std::make_pair(0, LLT::scalar(Align));
1222             })
1223         .fewerElementsIf(
1224             [=](const LegalityQuery &Query) -> bool {
1225               return Query.Types[0].isVector() &&
1226                      needToSplitMemOp(Query, Op == G_LOAD);
1227             },
1228             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1229               const LLT DstTy = Query.Types[0];
1230               const LLT PtrTy = Query.Types[1];
1231 
1232               LLT EltTy = DstTy.getElementType();
1233               unsigned MaxSize = maxSizeForAddrSpace(ST,
1234                                                      PtrTy.getAddressSpace(),
1235                                                      Op == G_LOAD);
1236 
1237               // FIXME: Handle widened to power of 2 results better. This ends
1238               // up scalarizing.
1239               // FIXME: 3 element stores scalarized on SI
1240 
1241               // Split if it's too large for the address space.
1242               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1243               if (MemSize > MaxSize) {
1244                 unsigned NumElts = DstTy.getNumElements();
1245                 unsigned EltSize = EltTy.getSizeInBits();
1246 
1247                 if (MaxSize % EltSize == 0) {
1248                   return std::make_pair(
1249                       0, LLT::scalarOrVector(
1250                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1251                 }
1252 
1253                 unsigned NumPieces = MemSize / MaxSize;
1254 
1255                 // FIXME: Refine when odd breakdowns handled
1256                 // The scalars will need to be re-legalized.
1257                 if (NumPieces == 1 || NumPieces >= NumElts ||
1258                     NumElts % NumPieces != 0)
1259                   return std::make_pair(0, EltTy);
1260 
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1263               }
1264 
1265               // FIXME: We could probably handle weird extending loads better.
1266               if (DstTy.getSizeInBits() > MemSize)
1267                 return std::make_pair(0, EltTy);
1268 
1269               unsigned EltSize = EltTy.getSizeInBits();
1270               unsigned DstSize = DstTy.getSizeInBits();
1271               if (!isPowerOf2_32(DstSize)) {
1272                 // We're probably decomposing an odd sized store. Try to split
1273                 // to the widest type. TODO: Account for alignment. As-is it
1274                 // should be OK, since the new parts will be further legalized.
1275                 unsigned FloorSize = PowerOf2Floor(DstSize);
1276                 return std::make_pair(
1277                     0, LLT::scalarOrVector(
1278                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1279               }
1280 
1281               // May need relegalization for the scalars.
1282               return std::make_pair(0, EltTy);
1283             })
1284     .minScalar(0, S32)
1285     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1286     .widenScalarToNextPow2(0)
1287     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1288     .lower();
1289   }
1290 
1291   // FIXME: Unaligned accesses not lowered.
1292   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1293                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1294                                                   {S32, GlobalPtr, S16, 2 * 8},
1295                                                   {S32, LocalPtr, S8, 8},
1296                                                   {S32, LocalPtr, S16, 16},
1297                                                   {S32, PrivatePtr, S8, 8},
1298                                                   {S32, PrivatePtr, S16, 16},
1299                                                   {S32, ConstantPtr, S8, 8},
1300                                                   {S32, ConstantPtr, S16, 2 * 8}})
1301                        .legalIf(
1302                          [=](const LegalityQuery &Query) -> bool {
1303                            return isLoadStoreLegal(ST, Query);
1304                          });
1305 
1306   if (ST.hasFlatAddressSpace()) {
1307     ExtLoads.legalForTypesWithMemDesc(
1308         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1309   }
1310 
1311   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1312   // 64-bits.
1313   //
1314   // TODO: Should generalize bitcast action into coerce, which will also cover
1315   // inserting addrspacecasts.
1316   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1317 
1318   ExtLoads.clampScalar(0, S32, S32)
1319           .widenScalarToNextPow2(0)
1320           .lower();
1321 
1322   auto &Atomics = getActionDefinitionsBuilder(
1323     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1324      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1325      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1326      G_ATOMICRMW_UMIN})
1327     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1328                {S64, GlobalPtr}, {S64, LocalPtr},
1329                {S32, RegionPtr}, {S64, RegionPtr}});
1330   if (ST.hasFlatAddressSpace()) {
1331     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1332   }
1333 
1334   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1335   if (ST.hasLDSFPAtomicAdd()) {
1336     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1337     if (ST.hasGFX90AInsts())
1338       Atomic.legalFor({{S64, LocalPtr}});
1339     if (ST.hasGFX940Insts())
1340       Atomic.legalFor({{V2S16, LocalPtr}});
1341   }
1342   if (ST.hasAtomicFaddInsts())
1343     Atomic.legalFor({{S32, GlobalPtr}});
1344 
1345   if (ST.hasGFX90AInsts()) {
1346     // These are legal with some caveats, and should have undergone expansion in
1347     // the IR in most situations
1348     // TODO: Move atomic expansion into legalizer
1349     // TODO: Also supports <2 x f16>
1350     Atomic.legalFor({
1351         {S32, GlobalPtr},
1352         {S64, GlobalPtr},
1353         {S64, FlatPtr}
1354       });
1355   }
1356 
1357   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1358   // demarshalling
1359   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1360     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1361                 {S32, FlatPtr}, {S64, FlatPtr}})
1362     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1363                {S32, RegionPtr}, {S64, RegionPtr}});
1364   // TODO: Pointer types, any 32-bit or 64-bit vector
1365 
1366   // Condition should be s32 for scalar, s1 for vector.
1367   getActionDefinitionsBuilder(G_SELECT)
1368       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1369                                  LocalPtr, FlatPtr, PrivatePtr,
1370                                  LLT::fixed_vector(2, LocalPtr),
1371                                  LLT::fixed_vector(2, PrivatePtr)},
1372                                 {S1, S32})
1373       .clampScalar(0, S16, S64)
1374       .scalarize(1)
1375       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1376       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1377       .clampMaxNumElements(0, S32, 2)
1378       .clampMaxNumElements(0, LocalPtr, 2)
1379       .clampMaxNumElements(0, PrivatePtr, 2)
1380       .scalarize(0)
1381       .widenScalarToNextPow2(0)
1382       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1383 
1384   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1385   // be more flexible with the shift amount type.
1386   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1387     .legalFor({{S32, S32}, {S64, S32}});
1388   if (ST.has16BitInsts()) {
1389     if (ST.hasVOP3PInsts()) {
1390       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1391             .clampMaxNumElements(0, S16, 2);
1392     } else
1393       Shifts.legalFor({{S16, S16}});
1394 
1395     // TODO: Support 16-bit shift amounts for all types
1396     Shifts.widenScalarIf(
1397       [=](const LegalityQuery &Query) {
1398         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1399         // 32-bit amount.
1400         const LLT ValTy = Query.Types[0];
1401         const LLT AmountTy = Query.Types[1];
1402         return ValTy.getSizeInBits() <= 16 &&
1403                AmountTy.getSizeInBits() < 16;
1404       }, changeTo(1, S16));
1405     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1406     Shifts.clampScalar(1, S32, S32);
1407     Shifts.widenScalarToNextPow2(0, 16);
1408     Shifts.clampScalar(0, S16, S64);
1409 
1410     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1411       .minScalar(0, S16)
1412       .scalarize(0)
1413       .lower();
1414   } else {
1415     // Make sure we legalize the shift amount type first, as the general
1416     // expansion for the shifted type will produce much worse code if it hasn't
1417     // been truncated already.
1418     Shifts.clampScalar(1, S32, S32);
1419     Shifts.widenScalarToNextPow2(0, 32);
1420     Shifts.clampScalar(0, S32, S64);
1421 
1422     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1423       .minScalar(0, S32)
1424       .scalarize(0)
1425       .lower();
1426   }
1427   Shifts.scalarize(0);
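  // For example, an s64 value shifted by an s16 amount has the amount widened
  // to s32 first, while an s16 shift keeps (or is given) an s16 amount on
  // subtargets with 16-bit instructions.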
1428 
1429   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1430     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1431     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1432     unsigned IdxTypeIdx = 2;
1433 
1434     getActionDefinitionsBuilder(Op)
1435       .customIf([=](const LegalityQuery &Query) {
1436           const LLT EltTy = Query.Types[EltTypeIdx];
1437           const LLT VecTy = Query.Types[VecTypeIdx];
1438           const LLT IdxTy = Query.Types[IdxTypeIdx];
1439           const unsigned EltSize = EltTy.getSizeInBits();
1440           return (EltSize == 32 || EltSize == 64) &&
1441                   VecTy.getSizeInBits() % 32 == 0 &&
1442                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1443                   IdxTy.getSizeInBits() == 32;
1444         })
1445       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1446                  bitcastToVectorElement32(VecTypeIdx))
1447       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1448       .bitcastIf(
1449         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1450         [=](const LegalityQuery &Query) {
1451           // For > 64-bit element types, try to turn this into a 64-bit
1452           // element vector since we may be able to do better indexing
1453           // if this is scalar. If not, fall back to 32.
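               // E.g. a <2 x s128> (256 bits) becomes <4 x s64>, while a
               // <2 x s96> (192 bits) falls back to <6 x s32> since 96 is not
               // a multiple of 64.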
1454           const LLT EltTy = Query.Types[EltTypeIdx];
1455           const LLT VecTy = Query.Types[VecTypeIdx];
1456           const unsigned DstEltSize = EltTy.getSizeInBits();
1457           const unsigned VecSize = VecTy.getSizeInBits();
1458 
1459           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1460           return std::make_pair(
1461               VecTypeIdx,
1462               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1463         })
1464       .clampScalar(EltTypeIdx, S32, S64)
1465       .clampScalar(VecTypeIdx, S32, S64)
1466       .clampScalar(IdxTypeIdx, S32, S32)
1467       .clampMaxNumElements(VecTypeIdx, S32, 32)
1468       // TODO: Clamp elements for 64-bit vectors?
1469       // It should only be necessary with variable indexes.
1470       // As a last resort, lower to the stack
1471       .lower();
1472   }
1473 
1474   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1475     .unsupportedIf([=](const LegalityQuery &Query) {
1476         const LLT &EltTy = Query.Types[1].getElementType();
1477         return Query.Types[0] != EltTy;
1478       });
1479 
1480   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1481     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1482     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1483 
1484     // FIXME: Doesn't handle extract of illegal sizes.
1485     getActionDefinitionsBuilder(Op)
1486       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1487       .lowerIf([=](const LegalityQuery &Query) {
1488           // Sub-vector (or single element) insert and extract.
1489           // TODO: verify immediate offset here since lower only works with
1490           // whole elements.
1491           const LLT BigTy = Query.Types[BigTyIdx];
1492           return BigTy.isVector();
1493         })
1494       // FIXME: Multiples of 16 should not be legal.
1495       .legalIf([=](const LegalityQuery &Query) {
1496           const LLT BigTy = Query.Types[BigTyIdx];
1497           const LLT LitTy = Query.Types[LitTyIdx];
1498           return (BigTy.getSizeInBits() % 32 == 0) &&
1499                  (LitTy.getSizeInBits() % 16 == 0);
1500         })
1501       .widenScalarIf(
1502         [=](const LegalityQuery &Query) {
1503           const LLT BigTy = Query.Types[BigTyIdx];
1504           return (BigTy.getScalarSizeInBits() < 16);
1505         },
1506         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1507       .widenScalarIf(
1508         [=](const LegalityQuery &Query) {
1509           const LLT LitTy = Query.Types[LitTyIdx];
1510           return (LitTy.getScalarSizeInBits() < 16);
1511         },
1512         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1513       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1514       .widenScalarToNextPow2(BigTyIdx, 32);
1515 
1516   }
1517 
1518   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1519     .legalForCartesianProduct(AllS32Vectors, {S32})
1520     .legalForCartesianProduct(AllS64Vectors, {S64})
1521     .clampNumElements(0, V16S32, V32S32)
1522     .clampNumElements(0, V2S64, V16S64)
1523     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1524 
1525   if (ST.hasScalarPackInsts()) {
1526     BuildVector
1527       // FIXME: Should probably widen s1 vectors straight to s32
1528       .minScalarOrElt(0, S16)
1529       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1530       .minScalar(1, S32);
1531 
1532     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1533       .legalFor({V2S16, S32})
1534       .lower();
1535     BuildVector.minScalarOrElt(0, S32);
1536   } else {
1537     BuildVector.customFor({V2S16, S16});
1538     BuildVector.minScalarOrElt(0, S32);
1539 
1540     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1541       .customFor({V2S16, S32})
1542       .lower();
1543   }
1544 
1545   BuildVector.legalIf(isRegisterType(0));
1546 
1547   // FIXME: Clamp maximum size
1548   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1549     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1550     .clampMaxNumElements(0, S32, 32)
1551     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1552     .clampMaxNumElements(0, S16, 64);
1553 
1554   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1555   // pre-legalize.
1556   if (ST.hasVOP3PInsts()) {
1557     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1558       .customFor({V2S16, V2S16})
1559       .lower();
1560   } else
1561     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1562 
1563   // Merge/Unmerge
1564   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1565     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1566     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1567 
1568     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1569       const LLT Ty = Query.Types[TypeIdx];
1570       if (Ty.isVector()) {
1571         const LLT &EltTy = Ty.getElementType();
1572         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1573           return true;
1574         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1575           return true;
1576       }
1577       return false;
1578     };
1579 
1580     auto &Builder = getActionDefinitionsBuilder(Op)
1581       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1582       .lowerFor({{S16, V2S16}})
1583       .lowerIf([=](const LegalityQuery &Query) {
1584           const LLT BigTy = Query.Types[BigTyIdx];
1585           return BigTy.getSizeInBits() == 32;
1586         })
1587       // Try to widen to s16 first for small types.
1588       // TODO: Only do this on targets with legal s16 shifts
1589       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1590       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1591       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1592       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1593                            elementTypeIs(1, S16)),
1594                        changeTo(1, V2S16))
1595       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1596       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1597       // valid.
1598       .clampScalar(LitTyIdx, S32, S512)
1599       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1600       // Break up vectors with weird elements into scalars
1601       .fewerElementsIf(
1602         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1603         scalarize(0))
1604       .fewerElementsIf(
1605         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1606         scalarize(1))
1607       .clampScalar(BigTyIdx, S32, MaxScalar);
1608 
1609     if (Op == G_MERGE_VALUES) {
1610       Builder.widenScalarIf(
1611         // TODO: Use 16-bit shifts if legal for 8-bit values?
1612         [=](const LegalityQuery &Query) {
1613           const LLT Ty = Query.Types[LitTyIdx];
1614           return Ty.getSizeInBits() < 32;
1615         },
1616         changeTo(LitTyIdx, S32));
1617     }
1618 
1619     Builder.widenScalarIf(
1620       [=](const LegalityQuery &Query) {
1621         const LLT Ty = Query.Types[BigTyIdx];
1622         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1623           Ty.getSizeInBits() % 16 != 0;
1624       },
1625       [=](const LegalityQuery &Query) {
1626         // Pick the next power of 2, or a multiple of 64 over 128,
1627         // whichever is smaller.
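             // E.g. s100 widens to s128, while s165 widens to s192 (256 would
             // be the next power of 2, but 192 is the smaller multiple of 64).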
1628         const LLT &Ty = Query.Types[BigTyIdx];
1629         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1630         if (NewSizeInBits >= 256) {
1631           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1632           if (RoundedTo < NewSizeInBits)
1633             NewSizeInBits = RoundedTo;
1634         }
1635         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1636       })
1637       // Any vectors left are the wrong size. Scalarize them.
1638       .scalarize(0)
1639       .scalarize(1);
1640   }
1641 
1642   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1643   // RegBankSelect.
1644   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1645     .legalFor({{S32}, {S64}});
1646 
1647   if (ST.hasVOP3PInsts()) {
1648     SextInReg.lowerFor({{V2S16}})
1649       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1650       // get more vector shift opportunities, since we'll get those when
1651       // expanded.
1652       .clampMaxNumElementsStrict(0, S16, 2);
1653   } else if (ST.has16BitInsts()) {
1654     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1655   } else {
1656     // Prefer to promote to s32 before lowering if we don't have 16-bit
1657     // shifts. This avoids a lot of intermediate truncate and extend operations.
1658     SextInReg.lowerFor({{S32}, {S64}});
1659   }
1660 
1661   SextInReg
1662     .scalarize(0)
1663     .clampScalar(0, S32, S64)
1664     .lower();
1665 
1666   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1667     .scalarize(0)
1668     .lower();
1669 
1670   // TODO: Only try to form v2s16 with legal packed instructions.
1671   getActionDefinitionsBuilder(G_FSHR)
1672     .legalFor({{S32, S32}})
1673     .lowerFor({{V2S16, V2S16}})
1674     .clampMaxNumElementsStrict(0, S16, 2)
1675     .scalarize(0)
1676     .lower();
1677 
1678   if (ST.hasVOP3PInsts()) {
1679     getActionDefinitionsBuilder(G_FSHL)
1680       .lowerFor({{V2S16, V2S16}})
1681       .clampMaxNumElementsStrict(0, S16, 2)
1682       .scalarize(0)
1683       .lower();
1684   } else {
1685     getActionDefinitionsBuilder(G_FSHL)
1686       .scalarize(0)
1687       .lower();
1688   }
1689 
1690   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1691     .legalFor({S64});
1692 
1693   getActionDefinitionsBuilder(G_FENCE)
1694     .alwaysLegal();
1695 
1696   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1697       .scalarize(0)
1698       .minScalar(0, S32)
1699       .lower();
1700 
1701   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1702       .legalFor({{S32, S32}, {S64, S32}})
1703       .clampScalar(1, S32, S32)
1704       .clampScalar(0, S32, S64)
1705       .widenScalarToNextPow2(0)
1706       .scalarize(0);
1707 
1708   getActionDefinitionsBuilder({
1709       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1710       G_FCOPYSIGN,
1711 
1712       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1713       G_ATOMICRMW_NAND,
1714       G_ATOMICRMW_FSUB,
1715       G_READ_REGISTER,
1716       G_WRITE_REGISTER,
1717 
1718       G_SADDO, G_SSUBO,
1719 
1720        // TODO: Implement
1721       G_FMINIMUM, G_FMAXIMUM}).lower();
1722 
1723   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1724       .lower();
1725 
1726   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1727         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1728         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1729     .unsupported();
1730 
1731   getLegacyLegalizerInfo().computeTables();
1732   verify(*ST.getInstrInfo());
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1736                                          MachineInstr &MI) const {
1737   MachineIRBuilder &B = Helper.MIRBuilder;
1738   MachineRegisterInfo &MRI = *B.getMRI();
1739 
1740   switch (MI.getOpcode()) {
1741   case TargetOpcode::G_ADDRSPACE_CAST:
1742     return legalizeAddrSpaceCast(MI, MRI, B);
1743   case TargetOpcode::G_FRINT:
1744     return legalizeFrint(MI, MRI, B);
1745   case TargetOpcode::G_FCEIL:
1746     return legalizeFceil(MI, MRI, B);
1747   case TargetOpcode::G_FREM:
1748     return legalizeFrem(MI, MRI, B);
1749   case TargetOpcode::G_INTRINSIC_TRUNC:
1750     return legalizeIntrinsicTrunc(MI, MRI, B);
1751   case TargetOpcode::G_SITOFP:
1752     return legalizeITOFP(MI, MRI, B, true);
1753   case TargetOpcode::G_UITOFP:
1754     return legalizeITOFP(MI, MRI, B, false);
1755   case TargetOpcode::G_FPTOSI:
1756     return legalizeFPTOI(MI, MRI, B, true);
1757   case TargetOpcode::G_FPTOUI:
1758     return legalizeFPTOI(MI, MRI, B, false);
1759   case TargetOpcode::G_FMINNUM:
1760   case TargetOpcode::G_FMAXNUM:
1761   case TargetOpcode::G_FMINNUM_IEEE:
1762   case TargetOpcode::G_FMAXNUM_IEEE:
1763     return legalizeMinNumMaxNum(Helper, MI);
1764   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1765     return legalizeExtractVectorElt(MI, MRI, B);
1766   case TargetOpcode::G_INSERT_VECTOR_ELT:
1767     return legalizeInsertVectorElt(MI, MRI, B);
1768   case TargetOpcode::G_SHUFFLE_VECTOR:
1769     return legalizeShuffleVector(MI, MRI, B);
1770   case TargetOpcode::G_FSIN:
1771   case TargetOpcode::G_FCOS:
1772     return legalizeSinCos(MI, MRI, B);
1773   case TargetOpcode::G_GLOBAL_VALUE:
1774     return legalizeGlobalValue(MI, MRI, B);
1775   case TargetOpcode::G_LOAD:
1776   case TargetOpcode::G_SEXTLOAD:
1777   case TargetOpcode::G_ZEXTLOAD:
1778     return legalizeLoad(Helper, MI);
1779   case TargetOpcode::G_FMAD:
1780     return legalizeFMad(MI, MRI, B);
1781   case TargetOpcode::G_FDIV:
1782     return legalizeFDIV(MI, MRI, B);
1783   case TargetOpcode::G_UDIV:
1784   case TargetOpcode::G_UREM:
1785   case TargetOpcode::G_UDIVREM:
1786     return legalizeUnsignedDIV_REM(MI, MRI, B);
1787   case TargetOpcode::G_SDIV:
1788   case TargetOpcode::G_SREM:
1789   case TargetOpcode::G_SDIVREM:
1790     return legalizeSignedDIV_REM(MI, MRI, B);
1791   case TargetOpcode::G_ATOMIC_CMPXCHG:
1792     return legalizeAtomicCmpXChg(MI, MRI, B);
1793   case TargetOpcode::G_FLOG:
1794     return legalizeFlog(MI, B, numbers::ln2f);
1795   case TargetOpcode::G_FLOG10:
1796     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1797   case TargetOpcode::G_FEXP:
1798     return legalizeFExp(MI, B);
1799   case TargetOpcode::G_FPOW:
1800     return legalizeFPow(MI, B);
1801   case TargetOpcode::G_FFLOOR:
1802     return legalizeFFloor(MI, MRI, B);
1803   case TargetOpcode::G_BUILD_VECTOR:
1804     return legalizeBuildVector(MI, MRI, B);
1805   case TargetOpcode::G_MUL:
1806     return legalizeMul(Helper, MI);
1807   case TargetOpcode::G_CTLZ:
1808   case TargetOpcode::G_CTTZ:
1809     return legalizeCTLZ_CTTZ(MI, MRI, B);
1810   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1811     return legalizeFPTruncRound(MI, B);
1812   default:
1813     return false;
1814   }
1815 
1816   llvm_unreachable("expected switch to return");
1817 }
1818 
1819 Register AMDGPULegalizerInfo::getSegmentAperture(
1820   unsigned AS,
1821   MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   MachineFunction &MF = B.getMF();
1824   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1825   const LLT S32 = LLT::scalar(32);
1826 
1827   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1828 
1829   if (ST.hasApertureRegs()) {
1830     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1831     // getreg.
1832     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1833         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1834         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1835     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1836         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1837         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1838     unsigned Encoding =
1839         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1840         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1841         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
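         // The getreg result holds the aperture base in the low bits of the
         // field; shifting left by the field width (WidthM1 + 1) rebuilds the
         // 32-bit aperture value used as the high half of a flat address.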
1842 
1843     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1844 
1845     B.buildInstr(AMDGPU::S_GETREG_B32)
1846       .addDef(GetReg)
1847       .addImm(Encoding);
1848     MRI.setType(GetReg, S32);
1849 
1850     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1851     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1852   }
1853 
1854   // TODO: can we be smarter about machine pointer info?
1855   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1856   Register LoadAddr = MRI.createGenericVirtualRegister(
1857     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1858   // For code object version 5, private_base and shared_base are passed through
1859   // implicit kernargs.
1860   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1861     AMDGPUTargetLowering::ImplicitParameter Param =
1862         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1863                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1864     uint64_t Offset =
1865         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1866 
1867     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1868         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1869 
1870     if (!loadInputValue(KernargPtrReg, B,
1871                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1872       return Register();
1873 
1874     MachineMemOperand *MMO = MF.getMachineMemOperand(
1875         PtrInfo,
1876         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1877             MachineMemOperand::MOInvariant,
1878         LLT::scalar(32), commonAlignment(Align(64), Offset));
1879 
1880     // Pointer address
1881     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1882                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1883     // Load address
1884     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1885   }
1886 
1887   Register QueuePtr = MRI.createGenericVirtualRegister(
1888     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1889 
1890   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1891     return Register();
1892 
1893   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1894   // private_segment_aperture_base_hi.
1895   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1896 
1897   MachineMemOperand *MMO = MF.getMachineMemOperand(
1898       PtrInfo,
1899       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1900           MachineMemOperand::MOInvariant,
1901       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1902 
1903   B.buildPtrAdd(LoadAddr, QueuePtr,
1904                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1905   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1906 }
1907 
1908 /// Return true if the value is a known valid address, such that a null check is
1909 /// not necessary.
1910 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1911                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1912   MachineInstr *Def = MRI.getVRegDef(Val);
1913   switch (Def->getOpcode()) {
1914   case AMDGPU::G_FRAME_INDEX:
1915   case AMDGPU::G_GLOBAL_VALUE:
1916   case AMDGPU::G_BLOCK_ADDR:
1917     return true;
1918   case AMDGPU::G_CONSTANT: {
1919     const ConstantInt *CI = Def->getOperand(1).getCImm();
1920     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1921   }
1922   default:
1923     return false;
1924   }
1925 
1926   return false;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1930   MachineInstr &MI, MachineRegisterInfo &MRI,
1931   MachineIRBuilder &B) const {
1932   MachineFunction &MF = B.getMF();
1933 
1934   const LLT S32 = LLT::scalar(32);
1935   Register Dst = MI.getOperand(0).getReg();
1936   Register Src = MI.getOperand(1).getReg();
1937 
1938   LLT DstTy = MRI.getType(Dst);
1939   LLT SrcTy = MRI.getType(Src);
1940   unsigned DestAS = DstTy.getAddressSpace();
1941   unsigned SrcAS = SrcTy.getAddressSpace();
1942 
1943   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1944   // vector element.
1945   assert(!DstTy.isVector());
1946 
1947   const AMDGPUTargetMachine &TM
1948     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1949 
1950   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1951     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1952     return true;
1953   }
1954 
1955   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1956       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1957        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1958     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1959       // Extract low 32-bits of the pointer.
1960       B.buildExtract(Dst, Src, 0);
1961       MI.eraseFromParent();
1962       return true;
1963     }
1964 
1965     unsigned NullVal = TM.getNullPointerValue(DestAS);
1966 
1967     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1968     auto FlatNull = B.buildConstant(SrcTy, 0);
1969 
1970     // Extract low 32-bits of the pointer.
1971     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1972 
1973     auto CmpRes =
1974         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1975     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1976 
1977     MI.eraseFromParent();
1978     return true;
1979   }
1980 
1981   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1982       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1983        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1984     if (!ST.hasFlatAddressSpace())
1985       return false;
1986 
1987     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1988     if (!ApertureReg.isValid())
1989       return false;
1990 
1991     // Coerce the type of the low half of the result so we can use merge_values.
1992     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1993 
1994     // TODO: Should we allow mismatched types but matching sizes in merges to
1995     // avoid the ptrtoint?
1996     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1997 
1998     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1999       B.buildCopy(Dst, BuildPtr);
2000       MI.eraseFromParent();
2001       return true;
2002     }
2003 
2004     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2005     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2006 
2007     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2008                               SegmentNull.getReg(0));
2009 
2010     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2011 
2012     MI.eraseFromParent();
2013     return true;
2014   }
2015 
2016   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2017       SrcTy.getSizeInBits() == 64) {
2018     // Truncate.
2019     B.buildExtract(Dst, Src, 0);
2020     MI.eraseFromParent();
2021     return true;
2022   }
2023 
2024   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2025       DstTy.getSizeInBits() == 64) {
2026     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2027     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2028 
2029     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
2030     // another pointer. Merge operands are required to be the same type, but
2031     // creating an extra ptrtoint would be kind of pointless.
2032     auto HighAddr = B.buildConstant(
2033         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2034     B.buildMerge(Dst, {Src, HighAddr});
2035     MI.eraseFromParent();
2036     return true;
2037   }
2038 
2039   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2040       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2041 
2042   LLVMContext &Ctx = MF.getFunction().getContext();
2043   Ctx.diagnose(InvalidAddrSpaceCast);
2044   B.buildUndef(Dst);
2045   MI.eraseFromParent();
2046   return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeFrint(
2050   MachineInstr &MI, MachineRegisterInfo &MRI,
2051   MachineIRBuilder &B) const {
2052   Register Src = MI.getOperand(1).getReg();
2053   LLT Ty = MRI.getType(Src);
2054   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2055 
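       // Adding and subtracting 2^52 (with the sign of the source) forces the
       // fractional bits to round away; sources with magnitude above
       // 0x1.fffffffffffffp+51 are already integral and are passed through.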
2056   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2057   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2058 
2059   auto C1 = B.buildFConstant(Ty, C1Val);
2060   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2061 
2062   // TODO: Should this propagate fast-math-flags?
2063   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2064   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2065 
2066   auto C2 = B.buildFConstant(Ty, C2Val);
2067   auto Fabs = B.buildFAbs(Ty, Src);
2068 
2069   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2070   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2071   MI.eraseFromParent();
2072   return true;
2073 }
2074 
2075 bool AMDGPULegalizerInfo::legalizeFceil(
2076   MachineInstr &MI, MachineRegisterInfo &MRI,
2077   MachineIRBuilder &B) const {
2078 
2079   const LLT S1 = LLT::scalar(1);
2080   const LLT S64 = LLT::scalar(64);
2081 
2082   Register Src = MI.getOperand(1).getReg();
2083   assert(MRI.getType(Src) == S64);
2084 
2085   // result = trunc(src)
2086   // if (src > 0.0 && src != result)
2087   //   result += 1.0
2088 
2089   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2090 
2091   const auto Zero = B.buildFConstant(S64, 0.0);
2092   const auto One = B.buildFConstant(S64, 1.0);
2093   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2094   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2095   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2096   auto Add = B.buildSelect(S64, And, One, Zero);
2097 
2098   // TODO: Should this propagate fast-math-flags?
2099   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeFrem(
2105   MachineInstr &MI, MachineRegisterInfo &MRI,
2106   MachineIRBuilder &B) const {
2107     Register DstReg = MI.getOperand(0).getReg();
2108     Register Src0Reg = MI.getOperand(1).getReg();
2109     Register Src1Reg = MI.getOperand(2).getReg();
2110     auto Flags = MI.getFlags();
2111     LLT Ty = MRI.getType(DstReg);
2112 
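         // frem(x, y) is expanded as x - trunc(x / y) * y, built here as
         // fma(-trunc(x / y), y, x).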
2113     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2114     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2115     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2116     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2117     MI.eraseFromParent();
2118     return true;
2119 }
2120 
2121 static MachineInstrBuilder extractF64Exponent(Register Hi,
2122                                               MachineIRBuilder &B) {
2123   const unsigned FractBits = 52;
2124   const unsigned ExpBits = 11;
2125   LLT S32 = LLT::scalar(32);
2126 
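       // Pull the 11 exponent bits out of the high dword with ubfe (they start
       // at bit 52 - 32 = 20) and subtract the IEEE-754 bias of 1023.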
2127   auto Const0 = B.buildConstant(S32, FractBits - 32);
2128   auto Const1 = B.buildConstant(S32, ExpBits);
2129 
2130   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2131     .addUse(Hi)
2132     .addUse(Const0.getReg(0))
2133     .addUse(Const1.getReg(0));
2134 
2135   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2139   MachineInstr &MI, MachineRegisterInfo &MRI,
2140   MachineIRBuilder &B) const {
2141   const LLT S1 = LLT::scalar(1);
2142   const LLT S32 = LLT::scalar(32);
2143   const LLT S64 = LLT::scalar(64);
2144 
2145   Register Src = MI.getOperand(1).getReg();
2146   assert(MRI.getType(Src) == S64);
2147 
2148   // TODO: Should this use extract since the low half is unused?
2149   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2150   Register Hi = Unmerge.getReg(1);
2151 
2152   // Extract the upper half, since this is where we will find the sign and
2153   // exponent.
2154   auto Exp = extractF64Exponent(Hi, B);
2155 
2156   const unsigned FractBits = 52;
2157 
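       // The plan: mask off the fraction bits that sit below the exponent.
       // Exponents < 0 produce a signed zero, and exponents > 51 mean the
       // value is already an integer, so the source is returned unchanged.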
2158   // Extract the sign bit.
2159   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2160   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2161 
2162   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2163 
2164   const auto Zero32 = B.buildConstant(S32, 0);
2165 
2166   // Extend back to 64-bits.
2167   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2168 
2169   auto Shr = B.buildAShr(S64, FractMask, Exp);
2170   auto Not = B.buildNot(S64, Shr);
2171   auto Tmp0 = B.buildAnd(S64, Src, Not);
2172   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2173 
2174   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2175   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2176 
2177   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2178   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2179   MI.eraseFromParent();
2180   return true;
2181 }
2182 
2183 bool AMDGPULegalizerInfo::legalizeITOFP(
2184   MachineInstr &MI, MachineRegisterInfo &MRI,
2185   MachineIRBuilder &B, bool Signed) const {
2186 
2187   Register Dst = MI.getOperand(0).getReg();
2188   Register Src = MI.getOperand(1).getReg();
2189 
2190   const LLT S64 = LLT::scalar(64);
2191   const LLT S32 = LLT::scalar(32);
2192 
2193   assert(MRI.getType(Src) == S64);
2194 
2195   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2196   auto ThirtyTwo = B.buildConstant(S32, 32);
2197 
2198   if (MRI.getType(Dst) == S64) {
2199     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2200                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2201 
2202     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2203     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2204                      .addUse(CvtHi.getReg(0))
2205                      .addUse(ThirtyTwo.getReg(0));
2206 
2207     // TODO: Should this propagate fast-math-flags?
2208     B.buildFAdd(Dst, LdExp, CvtLo);
2209     MI.eraseFromParent();
2210     return true;
2211   }
2212 
2213   assert(MRI.getType(Dst) == S32);
2214 
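       // For a 32-bit result, shift the 64-bit source left so the significant
       // bits land in the high word, OR a sticky bit from the discarded low
       // word to keep rounding correct, convert the high word, and rescale
       // with ldexp by (32 - shift).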
2215   auto One = B.buildConstant(S32, 1);
2216 
2217   MachineInstrBuilder ShAmt;
2218   if (Signed) {
2219     auto ThirtyOne = B.buildConstant(S32, 31);
2220     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2221     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2222     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2223     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2224                                /*HasSideEffects=*/false)
2225                   .addUse(Unmerge.getReg(1));
2226     auto LS2 = B.buildSub(S32, LS, One);
2227     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2228   } else
2229     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2230   auto Norm = B.buildShl(S64, Src, ShAmt);
2231   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2232   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2233   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2234   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2235   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2236   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2237                    /*HasSideEffects=*/false)
2238       .addUse(FVal.getReg(0))
2239       .addUse(Scale.getReg(0));
2240   MI.eraseFromParent();
2241   return true;
2242 }
2243 
2244 // TODO: Copied from DAG implementation. Verify logic and document how this
2245 // actually works.
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2247                                         MachineRegisterInfo &MRI,
2248                                         MachineIRBuilder &B,
2249                                         bool Signed) const {
2250 
2251   Register Dst = MI.getOperand(0).getReg();
2252   Register Src = MI.getOperand(1).getReg();
2253 
2254   const LLT S64 = LLT::scalar(64);
2255   const LLT S32 = LLT::scalar(32);
2256 
2257   const LLT SrcLT = MRI.getType(Src);
2258   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2259 
2260   unsigned Flags = MI.getFlags();
2261 
2262   // The basic idea of converting a floating point number into a pair of 32-bit
2263   // integers is illustrated as follows:
2264   //
2265   //     tf := trunc(val);
2266   //    hif := floor(tf * 2^-32);
2267   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2268   //     hi := fptoi(hif);
2269   //     lo := fptoi(lof);
2270   //
2271   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2272   MachineInstrBuilder Sign;
2273   if (Signed && SrcLT == S32) {
2274     // However, a 32-bit floating point number has only 23 bits mantissa and
2275     // it's not enough to hold all the significant bits of `lof` if val is
2276     // negative. To avoid the loss of precision, We need to take the absolute
2277     // negative. To avoid the loss of precision, we need to take the absolute
2278     // signedness.
2279     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2280     Trunc = B.buildFAbs(S32, Trunc, Flags);
2281   }
2282   MachineInstrBuilder K0, K1;
2283   if (SrcLT == S64) {
2284     K0 = B.buildFConstant(S64,
2285                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2286     K1 = B.buildFConstant(S64,
2287                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2288   } else {
2289     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2290     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2291   }
2292 
2293   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2294   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2295   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2296 
2297   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2298                                      : B.buildFPTOUI(S32, FloorMul);
2299   auto Lo = B.buildFPTOUI(S32, Fma);
2300 
2301   if (Signed && SrcLT == S32) {
2302     // Flip the result based on the signedness, which is either all 0s or 1s.
2303     Sign = B.buildMerge(S64, {Sign, Sign});
2304     // r := xor({lo, hi}, sign) - sign;
2305     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2306   } else
2307     B.buildMerge(Dst, {Lo, Hi});
2308   MI.eraseFromParent();
2309 
2310   return true;
2311 }
2312 
2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2314                                                MachineInstr &MI) const {
2315   MachineFunction &MF = Helper.MIRBuilder.getMF();
2316   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2317 
2318   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2319                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2320 
2321   // With ieee_mode disabled, the instructions have the correct behavior
2322   // already for G_FMINNUM/G_FMAXNUM
2323   if (!MFI->getMode().IEEE)
2324     return !IsIEEEOp;
2325 
2326   if (IsIEEEOp)
2327     return true;
2328 
2329   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2330 }
2331 
2332 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2333   MachineInstr &MI, MachineRegisterInfo &MRI,
2334   MachineIRBuilder &B) const {
2335   // TODO: Should move some of this into LegalizerHelper.
2336 
2337   // TODO: Promote dynamic indexing of s16 to s32
2338 
2339   // FIXME: Artifact combiner probably should have replaced the truncated
2340   // constant before this, so we shouldn't need
2341   // getIConstantVRegValWithLookThrough.
2342   Optional<ValueAndVReg> MaybeIdxVal =
2343       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2345     return true;
2346   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2347 
2348   Register Dst = MI.getOperand(0).getReg();
2349   Register Vec = MI.getOperand(1).getReg();
2350 
2351   LLT VecTy = MRI.getType(Vec);
2352   LLT EltTy = VecTy.getElementType();
2353   assert(EltTy == MRI.getType(Dst));
2354 
2355   if (IdxVal < VecTy.getNumElements()) {
2356     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2357     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2358   } else {
2359     B.buildUndef(Dst);
2360   }
2361 
2362   MI.eraseFromParent();
2363   return true;
2364 }
2365 
2366 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2367   MachineInstr &MI, MachineRegisterInfo &MRI,
2368   MachineIRBuilder &B) const {
2369   // TODO: Should move some of this into LegalizerHelper.
2370 
2371   // TODO: Promote dynamic indexing of s16 to s32
2372 
2373   // FIXME: Artifact combiner probably should have replaced the truncated
2374   // constant before this, so we shouldn't need
2375   // getIConstantVRegValWithLookThrough.
2376   Optional<ValueAndVReg> MaybeIdxVal =
2377       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2379     return true;
2380 
2381   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2382   Register Dst = MI.getOperand(0).getReg();
2383   Register Vec = MI.getOperand(1).getReg();
2384   Register Ins = MI.getOperand(2).getReg();
2385 
2386   LLT VecTy = MRI.getType(Vec);
2387   LLT EltTy = VecTy.getElementType();
2388   assert(EltTy == MRI.getType(Ins));
2389   (void)Ins;
2390 
2391   unsigned NumElts = VecTy.getNumElements();
2392   if (IdxVal < NumElts) {
2393     SmallVector<Register, 8> SrcRegs;
2394     for (unsigned i = 0; i < NumElts; ++i)
2395       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2396     B.buildUnmerge(SrcRegs, Vec);
2397 
2398     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2399     B.buildMerge(Dst, SrcRegs);
2400   } else {
2401     B.buildUndef(Dst);
2402   }
2403 
2404   MI.eraseFromParent();
2405   return true;
2406 }
2407 
2408 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2409   MachineInstr &MI, MachineRegisterInfo &MRI,
2410   MachineIRBuilder &B) const {
2411   const LLT V2S16 = LLT::fixed_vector(2, 16);
2412 
2413   Register Dst = MI.getOperand(0).getReg();
2414   Register Src0 = MI.getOperand(1).getReg();
2415   LLT DstTy = MRI.getType(Dst);
2416   LLT SrcTy = MRI.getType(Src0);
2417 
2418   if (SrcTy == V2S16 && DstTy == V2S16 &&
2419       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2420     return true;
2421 
2422   MachineIRBuilder HelperBuilder(MI);
2423   GISelObserverWrapper DummyObserver;
2424   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2425   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2426 }
2427 
2428 bool AMDGPULegalizerInfo::legalizeSinCos(
2429   MachineInstr &MI, MachineRegisterInfo &MRI,
2430   MachineIRBuilder &B) const {
2431 
2432   Register DstReg = MI.getOperand(0).getReg();
2433   Register SrcReg = MI.getOperand(1).getReg();
2434   LLT Ty = MRI.getType(DstReg);
2435   unsigned Flags = MI.getFlags();
2436 
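       // The amdgcn sin/cos intrinsics take the angle pre-scaled by 1/(2*pi);
       // subtargets with a reduced valid input range additionally need an
       // explicit fract to bring the scaled operand into range.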
2437   Register TrigVal;
2438   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2439   if (ST.hasTrigReducedRange()) {
2440     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2441     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2442       .addUse(MulVal.getReg(0))
2443       .setMIFlags(Flags).getReg(0);
2444   } else
2445     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2446 
2447   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2448     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2449   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2450     .addUse(TrigVal)
2451     .setMIFlags(Flags);
2452   MI.eraseFromParent();
2453   return true;
2454 }
2455 
2456 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2457                                                   MachineIRBuilder &B,
2458                                                   const GlobalValue *GV,
2459                                                   int64_t Offset,
2460                                                   unsigned GAFlags) const {
2461   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2462   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2463   // to the following code sequence:
2464   //
2465   // For constant address space:
2466   //   s_getpc_b64 s[0:1]
2467   //   s_add_u32 s0, s0, $symbol
2468   //   s_addc_u32 s1, s1, 0
2469   //
2470   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2471   //   a fixup or relocation is emitted to replace $symbol with a literal
2472   //   constant, which is a pc-relative offset from the encoding of the $symbol
2473   //   operand to the global variable.
2474   //
2475   // For global address space:
2476   //   s_getpc_b64 s[0:1]
2477   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2478   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2479   //
2480   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2481   //   fixups or relocations are emitted to replace $symbol@*@lo and
2482   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2483   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2484   //   operand to the global variable.
2485   //
2486   // What we want here is an offset from the value returned by s_getpc
2487   // (which is the address of the s_add_u32 instruction) to the global
2488   // variable, but since the encoding of $symbol starts 4 bytes after the start
2489   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2490   // small. This requires us to add 4 to the global variable offset in order to
2491   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2492   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2493   // instruction.
2494 
2495   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2496 
2497   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2498     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2499 
2500   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2501     .addDef(PCReg);
2502 
2503   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2504   if (GAFlags == SIInstrInfo::MO_NONE)
2505     MIB.addImm(0);
2506   else
2507     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2508 
2509   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2510 
2511   if (PtrTy.getSizeInBits() == 32)
2512     B.buildExtract(DstReg, PCReg, 0);
2513   return true;
2514 }
2515 
2516 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2517   MachineInstr &MI, MachineRegisterInfo &MRI,
2518   MachineIRBuilder &B) const {
2519   Register DstReg = MI.getOperand(0).getReg();
2520   LLT Ty = MRI.getType(DstReg);
2521   unsigned AS = Ty.getAddressSpace();
2522 
2523   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2524   MachineFunction &MF = B.getMF();
2525   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2526 
2527   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2528     if (!MFI->isModuleEntryFunction() &&
2529         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2530       const Function &Fn = MF.getFunction();
2531       DiagnosticInfoUnsupported BadLDSDecl(
2532         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2533         DS_Warning);
2534       Fn.getContext().diagnose(BadLDSDecl);
2535 
2536       // We currently don't have a way to correctly allocate LDS objects that
2537       // aren't directly associated with a kernel. We do force inlining of
2538       // functions that use local objects. However, if these dead functions are
2539       // not eliminated, we don't want a compile time error. Just emit a warning
2540       // and a trap, since there should be no callable path here.
2541       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2542       B.buildUndef(DstReg);
2543       MI.eraseFromParent();
2544       return true;
2545     }
2546 
2547     // TODO: We could emit code to handle the initialization somewhere.
2548     // We ignore the initializer for now and legalize it to allow selection.
2549     // The initializer will be rejected during assembly emission anyway.
2550     const SITargetLowering *TLI = ST.getTargetLowering();
2551     if (!TLI->shouldUseLDSConstAddress(GV)) {
2552       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2553       return true; // Leave in place;
2554     }
2555 
2556     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2557       Type *Ty = GV->getValueType();
2558       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2559       // zero-sized type in other languages to declare the dynamic shared
2560       // memory whose size is not known at compile time. They will be
2561       // allocated by the runtime and placed directly after the statically
2562       // allocated ones. They all share the same offset.
2563       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2564         // Adjust alignment for that dynamic shared memory array.
2565         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2566         LLT S32 = LLT::scalar(32);
2567         auto Sz =
2568             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2569         B.buildIntToPtr(DstReg, Sz);
2570         MI.eraseFromParent();
2571         return true;
2572       }
2573     }
2574 
2575     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2576                                                    *cast<GlobalVariable>(GV)));
2577     MI.eraseFromParent();
2578     return true;
2579   }
2580 
2581   const SITargetLowering *TLI = ST.getTargetLowering();
2582 
2583   if (TLI->shouldEmitFixup(GV)) {
2584     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2585     MI.eraseFromParent();
2586     return true;
2587   }
2588 
2589   if (TLI->shouldEmitPCReloc(GV)) {
2590     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2591     MI.eraseFromParent();
2592     return true;
2593   }
2594 
2595   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2596   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2597 
2598   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2599   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2600       MachinePointerInfo::getGOT(MF),
2601       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2602           MachineMemOperand::MOInvariant,
2603       LoadTy, Align(8));
2604 
2605   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2606 
2607   if (Ty.getSizeInBits() == 32) {
2608     // Truncate if this is a 32-bit constant address.
2609     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2610     B.buildExtract(DstReg, Load, 0);
2611   } else
2612     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2613 
2614   MI.eraseFromParent();
2615   return true;
2616 }
2617 
2618 static LLT widenToNextPowerOf2(LLT Ty) {
2619   if (Ty.isVector())
2620     return Ty.changeElementCount(
2621         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2622   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2626                                        MachineInstr &MI) const {
2627   MachineIRBuilder &B = Helper.MIRBuilder;
2628   MachineRegisterInfo &MRI = *B.getMRI();
2629   GISelChangeObserver &Observer = Helper.Observer;
2630 
2631   Register PtrReg = MI.getOperand(1).getReg();
2632   LLT PtrTy = MRI.getType(PtrReg);
2633   unsigned AddrSpace = PtrTy.getAddressSpace();
2634 
2635   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2636     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2637     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2638     Observer.changingInstr(MI);
2639     MI.getOperand(1).setReg(Cast.getReg(0));
2640     Observer.changedInstr(MI);
2641     return true;
2642   }
2643 
2644   if (MI.getOpcode() != AMDGPU::G_LOAD)
2645     return false;
2646 
2647   Register ValReg = MI.getOperand(0).getReg();
2648   LLT ValTy = MRI.getType(ValReg);
2649 
2650   MachineMemOperand *MMO = *MI.memoperands_begin();
2651   const unsigned ValSize = ValTy.getSizeInBits();
2652   const LLT MemTy = MMO->getMemoryType();
2653   const Align MemAlign = MMO->getAlign();
2654   const unsigned MemSize = MemTy.getSizeInBits();
2655   const uint64_t AlignInBits = 8 * MemAlign.value();
2656 
2657   // Widen non-power-of-2 loads to the alignment if needed
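       // E.g. an s96 load may be widened to an s128 load with the extra bits
       // truncated away, and a <3 x s32> load to <4 x s32> followed by a
       // G_EXTRACT of the original three elements.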
2658   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2659     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2660 
2661     // This was already the correct extending load result type, so just adjust
2662     // the memory type.
2663     if (WideMemSize == ValSize) {
2664       MachineFunction &MF = B.getMF();
2665 
2666       MachineMemOperand *WideMMO =
2667           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2668       Observer.changingInstr(MI);
2669       MI.setMemRefs(MF, {WideMMO});
2670       Observer.changedInstr(MI);
2671       return true;
2672     }
2673 
2674     // Don't bother handling an edge case that should probably never be produced.
2675     if (ValSize > WideMemSize)
2676       return false;
2677 
2678     LLT WideTy = widenToNextPowerOf2(ValTy);
2679 
2680     Register WideLoad;
2681     if (!WideTy.isVector()) {
2682       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2683       B.buildTrunc(ValReg, WideLoad).getReg(0);
2684     } else {
2685       // Extract the subvector.
2686 
2687       if (isRegisterType(ValTy)) {
2688         // If this a case where G_EXTRACT is legal, use it.
2689         // (e.g. <3 x s32> -> <4 x s32>)
2690         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2691         B.buildExtract(ValReg, WideLoad, 0);
2692       } else {
2693         // For cases where the widened type isn't a nice register value, unmerge
2694         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2695         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2696         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2697       }
2698     }
2699 
2700     MI.eraseFromParent();
2701     return true;
2702   }
2703 
2704   return false;
2705 }
2706 
2707 bool AMDGPULegalizerInfo::legalizeFMad(
2708   MachineInstr &MI, MachineRegisterInfo &MRI,
2709   MachineIRBuilder &B) const {
2710   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2711   assert(Ty.isScalar());
2712 
2713   MachineFunction &MF = B.getMF();
2714   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2715 
2716   // TODO: Always legal with future ftz flag.
2717   // FIXME: Do we need just output?
2718   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2719     return true;
2720   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2721     return true;
2722 
2723   MachineIRBuilder HelperBuilder(MI);
2724   GISelObserverWrapper DummyObserver;
2725   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2726   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2727 }
2728 
2729 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2730   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2731   Register DstReg = MI.getOperand(0).getReg();
2732   Register PtrReg = MI.getOperand(1).getReg();
2733   Register CmpVal = MI.getOperand(2).getReg();
2734   Register NewVal = MI.getOperand(3).getReg();
2735 
2736   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2737          "this should not have been custom lowered");
2738 
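       // The target's cmpxchg pseudo takes the new value and the compare value
       // packed together as a two-element vector operand.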
2739   LLT ValTy = MRI.getType(CmpVal);
2740   LLT VecTy = LLT::fixed_vector(2, ValTy);
2741 
2742   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2743 
2744   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2745     .addDef(DstReg)
2746     .addUse(PtrReg)
2747     .addUse(PackedVal)
2748     .setMemRefs(MI.memoperands());
2749 
2750   MI.eraseFromParent();
2751   return true;
2752 }
2753 
2754 bool AMDGPULegalizerInfo::legalizeFlog(
2755   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2756   Register Dst = MI.getOperand(0).getReg();
2757   Register Src = MI.getOperand(1).getReg();
2758   LLT Ty = B.getMRI()->getType(Dst);
2759   unsigned Flags = MI.getFlags();
2760 
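       // log_b(x) == log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG
       // and ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted.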
2761   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2762   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2763 
2764   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2765   MI.eraseFromParent();
2766   return true;
2767 }
2768 
2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2770                                        MachineIRBuilder &B) const {
2771   Register Dst = MI.getOperand(0).getReg();
2772   Register Src = MI.getOperand(1).getReg();
2773   unsigned Flags = MI.getFlags();
2774   LLT Ty = B.getMRI()->getType(Dst);
2775 
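       // exp(x) is lowered to exp2(x * log2(e)).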
2776   auto K = B.buildFConstant(Ty, numbers::log2e);
2777   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2778   B.buildFExp2(Dst, Mul, Flags);
2779   MI.eraseFromParent();
2780   return true;
2781 }
2782 
2783 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2784                                        MachineIRBuilder &B) const {
2785   Register Dst = MI.getOperand(0).getReg();
2786   Register Src0 = MI.getOperand(1).getReg();
2787   Register Src1 = MI.getOperand(2).getReg();
2788   unsigned Flags = MI.getFlags();
2789   LLT Ty = B.getMRI()->getType(Dst);
2790   const LLT S16 = LLT::scalar(16);
2791   const LLT S32 = LLT::scalar(32);
2792 
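       // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses
       // fmul_legacy, presumably so a zero operand gives zero even when the
       // other operand is an infinite log2.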
2793   if (Ty == S32) {
2794     auto Log = B.buildFLog2(S32, Src0, Flags);
2795     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2796       .addUse(Log.getReg(0))
2797       .addUse(Src1)
2798       .setMIFlags(Flags);
2799     B.buildFExp2(Dst, Mul, Flags);
2800   } else if (Ty == S16) {
2801     // There's no f16 fmul_legacy, so we need to convert for it.
2802     auto Log = B.buildFLog2(S16, Src0, Flags);
2803     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2804     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2805     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2806       .addUse(Ext0.getReg(0))
2807       .addUse(Ext1.getReg(0))
2808       .setMIFlags(Flags);
2809 
2810     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2811   } else
2812     return false;
2813 
2814   MI.eraseFromParent();
2815   return true;
2816 }
2817 
2818 // Find a source register, ignoring any possible source modifiers.
2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2820   Register ModSrc = OrigSrc;
2821   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2822     ModSrc = SrcFNeg->getOperand(1).getReg();
2823     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2824       ModSrc = SrcFAbs->getOperand(1).getReg();
2825   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2826     ModSrc = SrcFAbs->getOperand(1).getReg();
2827   return ModSrc;
2828 }
2829 
2830 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2831                                          MachineRegisterInfo &MRI,
2832                                          MachineIRBuilder &B) const {
2833 
2834   const LLT S1 = LLT::scalar(1);
2835   const LLT S64 = LLT::scalar(64);
2836   Register Dst = MI.getOperand(0).getReg();
2837   Register OrigSrc = MI.getOperand(1).getReg();
2838   unsigned Flags = MI.getFlags();
2839   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2840          "this should not have been custom lowered");
2841 
2842   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2843   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2844   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2845   // V_FRACT bug is:
2846   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2847   //
2848   // Convert floor(x) to (x - fract(x))
2849 
2850   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2851     .addUse(OrigSrc)
2852     .setMIFlags(Flags);
2853 
2854   // Give source modifier matching some assistance before obscuring a foldable
2855   // pattern.
2856 
2857   // TODO: Can we avoid the neg on the fract? The input sign to fract
2858   // shouldn't matter.
2859   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2860 
2861   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2862 
2863   Register Min = MRI.createGenericVirtualRegister(S64);
2864 
2865   // We don't need to concern ourselves with the snan handling difference, so
2866   // use the one which will directly select.
2867   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2868   if (MFI->getMode().IEEE)
2869     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2870   else
2871     B.buildFMinNum(Min, Fract, Const, Flags);
2872 
2873   Register CorrectedFract = Min;
2874   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2875     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2876     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2877   }
2878 
2879   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2880   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2881 
2882   MI.eraseFromParent();
2883   return true;
2884 }
2885 
2886 // Turn an illegal packed v2s16 build vector into bit operations.
2887 // TODO: This should probably be a bitcast action in LegalizerHelper.
2888 bool AMDGPULegalizerInfo::legalizeBuildVector(
2889   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2890   Register Dst = MI.getOperand(0).getReg();
2891   const LLT S32 = LLT::scalar(32);
2892   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2893 
2894   Register Src0 = MI.getOperand(1).getReg();
2895   Register Src1 = MI.getOperand(2).getReg();
2896   assert(MRI.getType(Src0) == LLT::scalar(16));
2897 
2898   auto Merge = B.buildMerge(S32, {Src0, Src1});
2899   B.buildBitcast(Dst, Merge);
2900 
2901   MI.eraseFromParent();
2902   return true;
2903 }
2904 
2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2906 //
2907 // Source and accumulation registers must all be 32-bits.
2908 //
2909 // TODO: When the multiply is uniform, we should produce a code sequence
2910 // that is better suited to instruction selection on the SALU. Instead of
2911 // the outer loop going over parts of the result, the outer loop should go
2912 // over parts of one of the factors. This should result in instruction
2913 // selection that makes full use of S_ADDC_U32 instructions.
2914 void AMDGPULegalizerInfo::buildMultiply(
2915     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2916     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2917     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2918   // Use (possibly empty) vectors of S1 registers to represent the set of
2919   // carries from one pair of positions to the next.
2920   using Carry = SmallVector<Register, 2>;
2921 
2922   MachineIRBuilder &B = Helper.MIRBuilder;
2923 
2924   const LLT S1 = LLT::scalar(1);
2925   const LLT S32 = LLT::scalar(32);
2926   const LLT S64 = LLT::scalar(64);
2927 
2928   Register Zero32;
2929   Register Zero64;
2930 
2931   auto getZero32 = [&]() -> Register {
2932     if (!Zero32)
2933       Zero32 = B.buildConstant(S32, 0).getReg(0);
2934     return Zero32;
2935   };
2936   auto getZero64 = [&]() -> Register {
2937     if (!Zero64)
2938       Zero64 = B.buildConstant(S64, 0).getReg(0);
2939     return Zero64;
2940   };
2941 
2942   // Merge the given carries into the 32-bit LocalAccum, which is modified
2943   // in-place.
2944   //
2945   // Returns the carry-out, which is a single S1 register or null.
2946   auto mergeCarry =
2947       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2948         if (CarryIn.empty())
2949           return Register();
2950 
2951         bool HaveCarryOut = true;
2952         Register CarryAccum;
2953         if (CarryIn.size() == 1) {
2954           if (!LocalAccum) {
2955             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2956             return Register();
2957           }
2958 
2959           CarryAccum = getZero32();
2960         } else {
2961           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2962           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2963             CarryAccum =
2964                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2965                     .getReg(0);
2966           }
2967 
2968           if (!LocalAccum) {
2969             LocalAccum = getZero32();
2970             HaveCarryOut = false;
2971           }
2972         }
2973 
2974         auto Add =
2975             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2976         LocalAccum = Add.getReg(0);
2977         return HaveCarryOut ? Add.getReg(1) : Register();
2978       };
2979 
2980   // Build a multiply-add chain to compute
2981   //
2982   //   LocalAccum + (partial products at DstIndex)
2983   //       + (opportunistic subset of CarryIn)
2984   //
2985   // LocalAccum is an array of one or two 32-bit registers that are updated
2986   // in-place. The incoming registers may be null.
2987   //
2988   // In some edge cases, carry-ins can be consumed "for free". In that case,
2989   // the consumed carry bits are removed from CarryIn in-place.
2990   auto buildMadChain =
2991       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2992           -> Carry {
2993         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2994                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2995 
2996         Carry CarryOut;
2997         unsigned j0 = 0;
2998 
2999         // Use plain 32-bit multiplication for the most significant part of the
3000         // result by default.
3001         if (LocalAccum.size() == 1 &&
3002             (!UsePartialMad64_32 || !CarryIn.empty())) {
3003           do {
3004             unsigned j1 = DstIndex - j0;
3005             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3006             if (!LocalAccum[0]) {
3007               LocalAccum[0] = Mul.getReg(0);
3008             } else {
3009               if (CarryIn.empty()) {
3010                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3011               } else {
3012                 LocalAccum[0] =
3013                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3014                         .getReg(0);
3015                 CarryIn.pop_back();
3016               }
3017             }
3018             ++j0;
3019           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3020         }
3021 
3022         // Build full 64-bit multiplies.
3023         if (j0 <= DstIndex) {
3024           bool HaveSmallAccum = false;
3025           Register Tmp;
3026 
3027           if (LocalAccum[0]) {
3028             if (LocalAccum.size() == 1) {
3029               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3030               HaveSmallAccum = true;
3031             } else if (LocalAccum[1]) {
3032               Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3033               HaveSmallAccum = false;
3034             } else {
3035               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3036               HaveSmallAccum = true;
3037             }
3038           } else {
3039             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3040             Tmp = getZero64();
3041             HaveSmallAccum = true;
3042           }
3043 
3044           do {
3045             unsigned j1 = DstIndex - j0;
3046             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3047                                     {Src0[j0], Src1[j1], Tmp});
3048             Tmp = Mad.getReg(0);
3049             if (!HaveSmallAccum)
3050               CarryOut.push_back(Mad.getReg(1));
3051             HaveSmallAccum = false;
3052             ++j0;
3053           } while (j0 <= DstIndex);
3054 
3055           auto Unmerge = B.buildUnmerge(S32, Tmp);
3056           LocalAccum[0] = Unmerge.getReg(0);
3057           if (LocalAccum.size() > 1)
3058             LocalAccum[1] = Unmerge.getReg(1);
3059         }
3060 
3061         return CarryOut;
3062       };
3063 
3064   // Outer multiply loop, iterating over destination parts from least
3065   // significant to most significant parts.
3066   //
3067   // The columns of the following diagram correspond to the destination parts
3068   // affected by one iteration of the outer loop (ignoring boundary
3069   // conditions).
3070   //
3071   //   Dest index relative to 2 * i:      1 0 -1
3072   //                                      ------
3073   //   Carries from previous iteration:     e o
3074   //   Even-aligned partial product sum:  E E .
3075   //   Odd-aligned partial product sum:     O O
3076   //
3077   // 'o' is OddCarry, 'e' is EvenCarry.
3078   // EE and OO are computed from partial products via buildMadChain and use
3079   // accumulation where possible and appropriate.
3080   //
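  // A partial product Src0[j0] * Src1[j1] contributes 64 bits to
  // Accum[j0 + j1] and Accum[j0 + j1 + 1]; it is "even-aligned" when
  // j0 + j1 is even and "odd-aligned" when it is odd.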
3081   Register SeparateOddCarry;
3082   Carry EvenCarry;
3083   Carry OddCarry;
3084 
3085   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3086     Carry OddCarryIn = std::move(OddCarry);
3087     Carry EvenCarryIn = std::move(EvenCarry);
3088     OddCarry.clear();
3089     EvenCarry.clear();
3090 
3091     // Partial products at offset 2 * i.
3092     if (2 * i < Accum.size()) {
3093       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3094       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3095     }
3096 
3097     // Partial products at offset 2 * i - 1.
3098     if (i > 0) {
3099       if (!SeparateOddAlignedProducts) {
3100         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3101         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3102       } else {
3103         bool IsHighest = 2 * i >= Accum.size();
3104         Register SeparateOddOut[2];
3105         auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3106                               .take_front(IsHighest ? 1 : 2);
3107         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3108 
3109         MachineInstr *Lo;
3110 
3111         if (i == 1) {
3112           if (!IsHighest)
3113             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3114           else
3115             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3116         } else {
3117           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3118                             SeparateOddCarry);
3119         }
3120         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3121 
3122         if (!IsHighest) {
3123           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3124                                 Lo->getOperand(1).getReg());
3125           Accum[2 * i] = Hi.getReg(0);
3126           SeparateOddCarry = Hi.getReg(1);
3127         }
3128       }
3129     }
3130 
3131     // Add in the carries from the previous iteration
3132     if (i > 0) {
3133       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3134         EvenCarryIn.push_back(CarryOut);
3135 
3136       if (2 * i < Accum.size()) {
3137         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3138           OddCarry.push_back(CarryOut);
3139       }
3140     }
3141   }
3142 }
3143 
3144 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3145 //
3146 // TODO: If the multiply is followed by an addition, we should attempt to
3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3149                                       MachineInstr &MI) const {
3150   assert(ST.hasMad64_32());
3151   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3152 
3153   MachineIRBuilder &B = Helper.MIRBuilder;
3154   MachineRegisterInfo &MRI = *B.getMRI();
3155 
3156   Register DstReg = MI.getOperand(0).getReg();
3157   Register Src0 = MI.getOperand(1).getReg();
3158   Register Src1 = MI.getOperand(2).getReg();
3159 
3160   LLT Ty = MRI.getType(DstReg);
3161   assert(Ty.isScalar());
3162 
3163   unsigned Size = Ty.getSizeInBits();
3164   unsigned NumParts = Size / 32;
3165   assert((Size % 32) == 0);
3166   assert(NumParts >= 2);
3167 
3168   // Whether to use MAD_64_32 for partial products whose high half is
3169   // discarded. This avoids some ADD instructions but risks false dependency
3170   // stalls on some subtargets in some cases.
3171   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3172 
3173   // Whether to compute odd-aligned partial products separately. This is
3174   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3175   // in an even-aligned VGPR.
3176   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3177 
3178   LLT S32 = LLT::scalar(32);
3179   SmallVector<Register, 2> Src0Parts, Src1Parts;
3180   for (unsigned i = 0; i < NumParts; ++i) {
3181     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3182     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3183   }
3184   B.buildUnmerge(Src0Parts, Src0);
3185   B.buildUnmerge(Src1Parts, Src1);
3186 
3187   SmallVector<Register, 2> AccumRegs(NumParts);
3188   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3189                 SeparateOddAlignedProducts);
3190 
3191   B.buildMerge(DstReg, AccumRegs);
3192   MI.eraseFromParent();
3193   return true;
3194 
3195 }
3196 
3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3199 // case with a single min instruction instead of a compare+select.
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3201                                             MachineRegisterInfo &MRI,
3202                                             MachineIRBuilder &B) const {
3203   Register Dst = MI.getOperand(0).getReg();
3204   Register Src = MI.getOperand(1).getReg();
3205   LLT DstTy = MRI.getType(Dst);
3206   LLT SrcTy = MRI.getType(Src);
3207 
3208   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3209                         ? AMDGPU::G_AMDGPU_FFBH_U32
3210                         : AMDGPU::G_AMDGPU_FFBL_B32;
3211   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
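  // FFBH/FFBL return -1 for a zero input, so clamping with the source bit
  // width gives the expected ctlz/cttz result of SrcTy.getSizeInBits() for 0.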
3212   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3213 
3214   MI.eraseFromParent();
3215   return true;
3216 }
3217 
3218 // Check that this is a G_XOR x, -1
3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3220   if (MI.getOpcode() != TargetOpcode::G_XOR)
3221     return false;
3222   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3223   return ConstVal && *ConstVal == -1;
3224 }
3225 
3226 // Return the use branch instruction, or null if the usage is invalid.
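// The expected pattern is: %c = G_INTRINSIC ..., optionally %n = G_XOR %c, -1,
// then a G_BRCOND on %c (or %n) in the same block, optionally followed by a
// G_BR.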
3227 static MachineInstr *
3228 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3229                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3230   Register CondDef = MI.getOperand(0).getReg();
3231   if (!MRI.hasOneNonDBGUse(CondDef))
3232     return nullptr;
3233 
3234   MachineBasicBlock *Parent = MI.getParent();
3235   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3236 
3237   if (isNot(MRI, *UseMI)) {
3238     Register NegatedCond = UseMI->getOperand(0).getReg();
3239     if (!MRI.hasOneNonDBGUse(NegatedCond))
3240       return nullptr;
3241 
3242     // We're deleting the def of this value, so we need to remove it.
3243     eraseInstr(*UseMI, MRI);
3244 
3245     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3246     Negated = true;
3247   }
3248 
3249   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3250     return nullptr;
3251 
3252   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3253   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3254   if (Next == Parent->end()) {
3255     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3256     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3257       return nullptr;
3258     UncondBrTarget = &*NextMBB;
3259   } else {
3260     if (Next->getOpcode() != AMDGPU::G_BR)
3261       return nullptr;
3262     Br = &*Next;
3263     UncondBrTarget = Br->getOperand(0).getMBB();
3264   }
3265 
3266   return UseMI;
3267 }
3268 
3269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3270                                          const ArgDescriptor *Arg,
3271                                          const TargetRegisterClass *ArgRC,
3272                                          LLT ArgTy) const {
3273   MCRegister SrcReg = Arg->getRegister();
3274   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3275   assert(DstReg.isVirtual() && "Virtual register expected");
3276 
3277   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3278                                              *ArgRC, B.getDebugLoc(), ArgTy);
3279   if (Arg->isMasked()) {
3280     // TODO: Should we try to emit this once in the entry block?
3281     const LLT S32 = LLT::scalar(32);
3282     const unsigned Mask = Arg->getMask();
3283     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
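    // Extract the field as (LiveIn >> Shift) & (Mask >> Shift).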
3284 
3285     Register AndMaskSrc = LiveIn;
3286 
3287     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3288     // 0.
3289     if (Shift != 0) {
3290       auto ShiftAmt = B.buildConstant(S32, Shift);
3291       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3292     }
3293 
3294     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3295   } else {
3296     B.buildCopy(DstReg, LiveIn);
3297   }
3298 
3299   return true;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::loadInputValue(
3303     Register DstReg, MachineIRBuilder &B,
3304     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3305   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3306   const ArgDescriptor *Arg;
3307   const TargetRegisterClass *ArgRC;
3308   LLT ArgTy;
3309   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3310 
3311   if (!Arg) {
3312     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3313       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3314       // case the pointer argument may be missing and we use null.
3315       B.buildConstant(DstReg, 0);
3316       return true;
3317     }
3318 
3319     // It's undefined behavior if a function marked with the amdgpu-no-*
3320     // attributes uses the corresponding intrinsic.
3321     B.buildUndef(DstReg);
3322     return true;
3323   }
3324 
3325   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3326     return false; // TODO: Handle these
3327   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3328 }
3329 
3330 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3331     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3332     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3333   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3334     return false;
3335 
3336   MI.eraseFromParent();
3337   return true;
3338 }
3339 
3340 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3341                                 int64_t C) {
3342   B.buildConstant(MI.getOperand(0).getReg(), C);
3343   MI.eraseFromParent();
3344   return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3348     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3349     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3350   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3351   if (MaxID == 0)
3352     return replaceWithConstant(B, MI, 0);
3353 
3354   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3355   const ArgDescriptor *Arg;
3356   const TargetRegisterClass *ArgRC;
3357   LLT ArgTy;
3358   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3359 
3360   Register DstReg = MI.getOperand(0).getReg();
3361   if (!Arg) {
3362     // It's undefined behavior if a function marked with the amdgpu-no-*
3363     // attributes uses the corresponding intrinsic.
3364     B.buildUndef(DstReg);
3365     MI.eraseFromParent();
3366     return true;
3367   }
3368 
3369   if (Arg->isMasked()) {
3370     // Don't bother inserting AssertZext for packed IDs since we're emitting the
3371     // masking operations anyway.
3372     //
3373     // TODO: We could assert the top bit is 0 for the source copy.
3374     if (!loadInputValue(DstReg, B, ArgType))
3375       return false;
3376   } else {
3377     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3378     if (!loadInputValue(TmpReg, B, ArgType))
3379       return false;
3380     B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3381   }
3382 
3383   MI.eraseFromParent();
3384   return true;
3385 }
3386 
3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3388                                                      int64_t Offset) const {
3389   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3390   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3391 
3392   // TODO: If we passed in the base kernel offset we could have a better
3393   // alignment than 4, but we don't really need it.
3394   if (!loadInputValue(KernArgReg, B,
3395                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3396     llvm_unreachable("failed to find kernarg segment ptr");
3397 
3398   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3399   // TODO: Should get nuw
3400   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3401 }
3402 
3403 /// Legalize a value that's loaded from kernel arguments. This is only used by
3404 /// legacy intrinsics.
3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3406                                                       MachineIRBuilder &B,
3407                                                       uint64_t Offset,
3408                                                       Align Alignment) const {
3409   Register DstReg = MI.getOperand(0).getReg();
3410 
3411   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3412          "unexpected kernarg parameter type");
3413 
3414   Register Ptr = getKernargParameterPtr(B, Offset);
3415   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3416   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3417               MachineMemOperand::MODereferenceable |
3418                   MachineMemOperand::MOInvariant);
3419   MI.eraseFromParent();
3420   return true;
3421 }
3422 
3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3424                                        MachineRegisterInfo &MRI,
3425                                        MachineIRBuilder &B) const {
3426   Register Dst = MI.getOperand(0).getReg();
3427   LLT DstTy = MRI.getType(Dst);
3428   LLT S16 = LLT::scalar(16);
3429   LLT S32 = LLT::scalar(32);
3430   LLT S64 = LLT::scalar(64);
3431 
3432   if (DstTy == S16)
3433     return legalizeFDIV16(MI, MRI, B);
3434   if (DstTy == S32)
3435     return legalizeFDIV32(MI, MRI, B);
3436   if (DstTy == S64)
3437     return legalizeFDIV64(MI, MRI, B);
3438 
3439   return false;
3440 }
3441 
3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3443                                                         Register DstDivReg,
3444                                                         Register DstRemReg,
3445                                                         Register X,
3446                                                         Register Y) const {
3447   const LLT S1 = LLT::scalar(1);
3448   const LLT S32 = LLT::scalar(32);
3449 
3450   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3451   // algorithm used here.
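  // In brief: approximate 2**32 / Y using the f32 reciprocal, refine it with
  // one Newton-Raphson step, compute Q = umulh(X, Z) and R = X - Q * Y, then
  // apply up to two conditional corrections to the quotient and remainder.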
3452 
3453   // Initial estimate of inv(y).
3454   auto FloatY = B.buildUITOFP(S32, Y);
3455   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3456   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3457   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3458   auto Z = B.buildFPTOUI(S32, ScaledY);
3459 
3460   // One round of UNR.
3461   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3462   auto NegYZ = B.buildMul(S32, NegY, Z);
3463   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3464 
3465   // Quotient/remainder estimate.
3466   auto Q = B.buildUMulH(S32, X, Z);
3467   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3468 
3469   // First quotient/remainder refinement.
3470   auto One = B.buildConstant(S32, 1);
3471   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3472   if (DstDivReg)
3473     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3474   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3475 
3476   // Second quotient/remainder refinement.
3477   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3478   if (DstDivReg)
3479     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3480 
3481   if (DstRemReg)
3482     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3483 }
3484 
3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3486 //
3487 // Return lo, hi of result
3488 //
3489 // %cvt.lo = G_UITOFP Val.lo
3490 // %cvt.hi = G_UITOFP Val.hi
3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3494 // %mul2 = G_FMUL %mul1, 2**(-32)
3495 // %trunc = G_INTRINSIC_TRUNC %mul2
3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
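// The result is effectively an estimate of 2**64 / Val returned as {lo, hi}
// 32-bit halves; 0x5f7ffffc is just below 2**64 as a float, and 0x2f800000 /
// 0xcf800000 are 2**-32 and -(2**32) used to split the value into halves.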
3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3499                                                        Register Val) {
3500   const LLT S32 = LLT::scalar(32);
3501   auto Unmerge = B.buildUnmerge(S32, Val);
3502 
3503   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3504   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3505 
3506   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3507                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3508 
3509   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3510   auto Mul1 =
3511       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3512 
3513   // 2**(-32)
3514   auto Mul2 =
3515       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3516   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3517 
3518   // -(2**32)
3519   auto Mad2 = B.buildFMAD(S32, Trunc,
3520                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3521 
3522   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3523   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3524 
3525   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3526 }
3527 
3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3529                                                         Register DstDivReg,
3530                                                         Register DstRemReg,
3531                                                         Register Numer,
3532                                                         Register Denom) const {
3533   const LLT S32 = LLT::scalar(32);
3534   const LLT S64 = LLT::scalar(64);
3535   const LLT S1 = LLT::scalar(1);
3536   Register RcpLo, RcpHi;
3537 
3538   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3539 
3540   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
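  // Rcp is a fixed-point estimate of 2**64 / Denom. The next two blocks are
  // Newton-Raphson style refinements (r += umulh(r, -Denom * r)), producing
  // Add1 and then Add2, before the quotient estimate MulHi3 = umulh(Numer, r).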
3541 
3542   auto Zero64 = B.buildConstant(S64, 0);
3543   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3544 
3545   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3546   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3547 
3548   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3549   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3550   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3551 
3552   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3553   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3554   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3555 
3556   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3557   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3558   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3559   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3560   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3561 
3562   auto Zero32 = B.buildConstant(S32, 0);
3563   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3564   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3565   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3566 
3567   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3568   Register NumerLo = UnmergeNumer.getReg(0);
3569   Register NumerHi = UnmergeNumer.getReg(1);
3570 
3571   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3572   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3573   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3574   Register Mul3_Lo = UnmergeMul3.getReg(0);
3575   Register Mul3_Hi = UnmergeMul3.getReg(1);
3576   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3577   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3578   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3579   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3580 
3581   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3582   Register DenomLo = UnmergeDenom.getReg(0);
3583   Register DenomHi = UnmergeDenom.getReg(1);
3584 
3585   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3586   auto C1 = B.buildSExt(S32, CmpHi);
3587 
3588   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3589   auto C2 = B.buildSExt(S32, CmpLo);
3590 
3591   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3592   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3593 
3594   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3595   // Currently the control flow is unconditional and we have 4 selects after the
3596   // potential endif to substitute PHIs.
3597 
3598   // if C3 != 0 ...
3599   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3600   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3601   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3602   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3603 
3604   auto One64 = B.buildConstant(S64, 1);
3605   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3606 
3607   auto C4 =
3608       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3609   auto C5 =
3610       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3611   auto C6 = B.buildSelect(
3612       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3613 
3614   // if (C6 != 0)
3615   auto Add4 = B.buildAdd(S64, Add3, One64);
3616   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3617 
3618   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3619   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3620   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3621 
3622   // endif C6
3623   // endif C3
3624 
3625   if (DstDivReg) {
3626     auto Sel1 = B.buildSelect(
3627         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3628     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3629                   Sel1, MulHi3);
3630   }
3631 
3632   if (DstRemReg) {
3633     auto Sel2 = B.buildSelect(
3634         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3635     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3636                   Sel2, Sub1);
3637   }
3638 }
3639 
3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3641                                                   MachineRegisterInfo &MRI,
3642                                                   MachineIRBuilder &B) const {
3643   Register DstDivReg, DstRemReg;
3644   switch (MI.getOpcode()) {
3645   default:
3646     llvm_unreachable("Unexpected opcode!");
3647   case AMDGPU::G_UDIV: {
3648     DstDivReg = MI.getOperand(0).getReg();
3649     break;
3650   }
3651   case AMDGPU::G_UREM: {
3652     DstRemReg = MI.getOperand(0).getReg();
3653     break;
3654   }
3655   case AMDGPU::G_UDIVREM: {
3656     DstDivReg = MI.getOperand(0).getReg();
3657     DstRemReg = MI.getOperand(1).getReg();
3658     break;
3659   }
3660   }
3661 
3662   const LLT S64 = LLT::scalar(64);
3663   const LLT S32 = LLT::scalar(32);
3664   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3665   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3666   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3667   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3668 
3669   if (Ty == S32)
3670     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3671   else if (Ty == S64)
3672     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3673   else
3674     return false;
3675 
3676   MI.eraseFromParent();
3677   return true;
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3681                                                 MachineRegisterInfo &MRI,
3682                                                 MachineIRBuilder &B) const {
3683   const LLT S64 = LLT::scalar(64);
3684   const LLT S32 = LLT::scalar(32);
3685 
3686   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3687   if (Ty != S32 && Ty != S64)
3688     return false;
3689 
3690   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3691   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3692   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3693 
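  // Take absolute values: with sign = x >> (bitwidth - 1), (x + sign) ^ sign
  // is |x|. Do the unsigned division on the absolute values, then restore the
  // signs of the quotient/remainder afterwards.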
3694   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3695   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3696   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3697 
3698   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3699   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3700 
3701   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3702   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3703 
3704   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3705   switch (MI.getOpcode()) {
3706   default:
3707     llvm_unreachable("Unexpected opcode!");
3708   case AMDGPU::G_SDIV: {
3709     DstDivReg = MI.getOperand(0).getReg();
3710     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711     break;
3712   }
3713   case AMDGPU::G_SREM: {
3714     DstRemReg = MI.getOperand(0).getReg();
3715     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3716     break;
3717   }
3718   case AMDGPU::G_SDIVREM: {
3719     DstDivReg = MI.getOperand(0).getReg();
3720     DstRemReg = MI.getOperand(1).getReg();
3721     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3722     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723     break;
3724   }
3725   }
3726 
3727   if (Ty == S32)
3728     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3729   else
3730     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3731 
3732   if (DstDivReg) {
3733     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3734     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3735     B.buildSub(DstDivReg, SignXor, Sign);
3736   }
3737 
3738   if (DstRemReg) {
3739     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3740     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3741     B.buildSub(DstRemReg, SignXor, Sign);
3742   }
3743 
3744   MI.eraseFromParent();
3745   return true;
3746 }
3747 
3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3749                                                  MachineRegisterInfo &MRI,
3750                                                  MachineIRBuilder &B) const {
3751   Register Res = MI.getOperand(0).getReg();
3752   Register LHS = MI.getOperand(1).getReg();
3753   Register RHS = MI.getOperand(2).getReg();
3754   uint16_t Flags = MI.getFlags();
3755   LLT ResTy = MRI.getType(Res);
3756 
3757   const MachineFunction &MF = B.getMF();
3758   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3759                             MI.getFlag(MachineInstr::FmAfn);
3760 
3761   if (!AllowInaccurateRcp)
3762     return false;
3763 
3764   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3765     // 1 / x -> RCP(x)
3766     if (CLHS->isExactlyValue(1.0)) {
3767       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3768         .addUse(RHS)
3769         .setMIFlags(Flags);
3770 
3771       MI.eraseFromParent();
3772       return true;
3773     }
3774 
3775     // -1 / x -> RCP( FNEG(x) )
3776     if (CLHS->isExactlyValue(-1.0)) {
3777       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3778       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3779         .addUse(FNeg.getReg(0))
3780         .setMIFlags(Flags);
3781 
3782       MI.eraseFromParent();
3783       return true;
3784     }
3785   }
3786 
3787   // x / y -> x * (1.0 / y)
3788   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3789     .addUse(RHS)
3790     .setMIFlags(Flags);
3791   B.buildFMul(Res, LHS, RCP, Flags);
3792 
3793   MI.eraseFromParent();
3794   return true;
3795 }
3796 
3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3798                                                    MachineRegisterInfo &MRI,
3799                                                    MachineIRBuilder &B) const {
3800   Register Res = MI.getOperand(0).getReg();
3801   Register X = MI.getOperand(1).getReg();
3802   Register Y = MI.getOperand(2).getReg();
3803   uint16_t Flags = MI.getFlags();
3804   LLT ResTy = MRI.getType(Res);
3805 
3806   const MachineFunction &MF = B.getMF();
3807   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3808                             MI.getFlag(MachineInstr::FmAfn);
3809 
3810   if (!AllowInaccurateRcp)
3811     return false;
3812 
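  // x / y is approximated as x * rcp(y), with two Newton-Raphson refinements
  // r = r * (2 - y * r) expressed as FMAs, plus one final residual correction
  // fma(x - y * ret, r, ret).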
3813   auto NegY = B.buildFNeg(ResTy, Y);
3814   auto One = B.buildFConstant(ResTy, 1.0);
3815 
3816   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3817     .addUse(Y)
3818     .setMIFlags(Flags);
3819 
3820   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3821   R = B.buildFMA(ResTy, Tmp0, R, R);
3822 
3823   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3824   R = B.buildFMA(ResTy, Tmp1, R, R);
3825 
3826   auto Ret = B.buildFMul(ResTy, X, R);
3827   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3828 
3829   B.buildFMA(Res, Tmp2, R, Ret);
3830   MI.eraseFromParent();
3831   return true;
3832 }
3833 
3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3835                                          MachineRegisterInfo &MRI,
3836                                          MachineIRBuilder &B) const {
3837   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3838     return true;
3839 
3840   Register Res = MI.getOperand(0).getReg();
3841   Register LHS = MI.getOperand(1).getReg();
3842   Register RHS = MI.getOperand(2).getReg();
3843 
3844   uint16_t Flags = MI.getFlags();
3845 
3846   LLT S16 = LLT::scalar(16);
3847   LLT S32 = LLT::scalar(32);
3848 
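  // Do the (approximate) division in f32 (extend, rcp, multiply, truncate) and
  // let v_div_fixup_f16 patch up the special cases.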
3849   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3850   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3851 
3852   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3853     .addUse(RHSExt.getReg(0))
3854     .setMIFlags(Flags);
3855 
3856   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3857   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3858 
3859   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3860     .addUse(RDst.getReg(0))
3861     .addUse(RHS)
3862     .addUse(LHS)
3863     .setMIFlags(Flags);
3864 
3865   MI.eraseFromParent();
3866   return true;
3867 }
3868 
3869 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3870 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3871 // that enable it; when 'Enable' is false, restore the function's default mode.
3872                                MachineIRBuilder &B,
3873                                const GCNSubtarget &ST,
3874                                AMDGPU::SIModeRegisterDefaults Mode) {
3875   // Set SP denorm mode to this value.
3876   unsigned SPDenormMode =
3877     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3878 
3879   if (ST.hasDenormModeInst()) {
3880     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3881     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3882 
3883     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3884     B.buildInstr(AMDGPU::S_DENORM_MODE)
3885       .addImm(NewDenormModeValue);
3886 
3887   } else {
3888     // Select FP32 bit field in mode register.
3889     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3890                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3891                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
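    // This is hwreg(HW_REG_MODE, 4, 2): a 2-bit field at offset 4, i.e. the
    // FP32 denorm control bits MODE[5:4].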
3892 
3893     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3894       .addImm(SPDenormMode)
3895       .addImm(SPDenormModeBitField);
3896   }
3897 }
3898 
3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3900                                          MachineRegisterInfo &MRI,
3901                                          MachineIRBuilder &B) const {
3902   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3903     return true;
3904 
3905   Register Res = MI.getOperand(0).getReg();
3906   Register LHS = MI.getOperand(1).getReg();
3907   Register RHS = MI.getOperand(2).getReg();
3908   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3909   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3910 
3911   uint16_t Flags = MI.getFlags();
3912 
3913   LLT S32 = LLT::scalar(32);
3914   LLT S1 = LLT::scalar(1);
3915 
3916   auto One = B.buildFConstant(S32, 1.0f);
3917 
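  // Accurate f32 division: scale the operands with div_scale, refine the
  // reciprocal and quotient with a chain of FMAs, then div_fmas and div_fixup
  // produce the final result and handle the special cases.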
3918   auto DenominatorScaled =
3919     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3920       .addUse(LHS)
3921       .addUse(RHS)
3922       .addImm(0)
3923       .setMIFlags(Flags);
3924   auto NumeratorScaled =
3925     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3926       .addUse(LHS)
3927       .addUse(RHS)
3928       .addImm(1)
3929       .setMIFlags(Flags);
3930 
3931   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3932     .addUse(DenominatorScaled.getReg(0))
3933     .setMIFlags(Flags);
3934   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3935 
3936   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3937   // aren't modeled as reading it.
3938   if (!Mode.allFP32Denormals())
3939     toggleSPDenormMode(true, B, ST, Mode);
3940 
3941   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3942   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3943   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3944   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3945   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3946   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3947 
3948   if (!Mode.allFP32Denormals())
3949     toggleSPDenormMode(false, B, ST, Mode);
3950 
3951   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3952     .addUse(Fma4.getReg(0))
3953     .addUse(Fma1.getReg(0))
3954     .addUse(Fma3.getReg(0))
3955     .addUse(NumeratorScaled.getReg(1))
3956     .setMIFlags(Flags);
3957 
3958   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3959     .addUse(Fmas.getReg(0))
3960     .addUse(RHS)
3961     .addUse(LHS)
3962     .setMIFlags(Flags);
3963 
3964   MI.eraseFromParent();
3965   return true;
3966 }
3967 
3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3969                                          MachineRegisterInfo &MRI,
3970                                          MachineIRBuilder &B) const {
3971   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3972     return true;
3973 
3974   Register Res = MI.getOperand(0).getReg();
3975   Register LHS = MI.getOperand(1).getReg();
3976   Register RHS = MI.getOperand(2).getReg();
3977 
3978   uint16_t Flags = MI.getFlags();
3979 
3980   LLT S64 = LLT::scalar(64);
3981   LLT S1 = LLT::scalar(1);
3982 
3983   auto One = B.buildFConstant(S64, 1.0);
3984 
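  // Same overall structure as the f32 path: div_scale the operands, refine
  // with FMAs, then div_fmas + div_fixup.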
3985   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3986     .addUse(LHS)
3987     .addUse(RHS)
3988     .addImm(0)
3989     .setMIFlags(Flags);
3990 
3991   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3992 
3993   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3994     .addUse(DivScale0.getReg(0))
3995     .setMIFlags(Flags);
3996 
3997   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3998   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3999   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4000 
4001   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4002     .addUse(LHS)
4003     .addUse(RHS)
4004     .addImm(1)
4005     .setMIFlags(Flags);
4006 
4007   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4008   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4009   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4010 
4011   Register Scale;
4012   if (!ST.hasUsableDivScaleConditionOutput()) {
4013     // Workaround a hardware bug on SI where the condition output from div_scale
4014     // is not usable.
4015 
4016     LLT S32 = LLT::scalar(32);
4017 
4018     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4019     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4020     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4021     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4022 
4023     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4024                               Scale1Unmerge.getReg(1));
4025     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4026                               Scale0Unmerge.getReg(1));
4027     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4028   } else {
4029     Scale = DivScale1.getReg(1);
4030   }
4031 
4032   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4033     .addUse(Fma4.getReg(0))
4034     .addUse(Fma3.getReg(0))
4035     .addUse(Mul.getReg(0))
4036     .addUse(Scale)
4037     .setMIFlags(Flags);
4038 
4039   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4040     .addUse(Fmas.getReg(0))
4041     .addUse(RHS)
4042     .addUse(LHS)
4043     .setMIFlags(Flags);
4044 
4045   MI.eraseFromParent();
4046   return true;
4047 }
4048 
4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4050                                                  MachineRegisterInfo &MRI,
4051                                                  MachineIRBuilder &B) const {
4052   Register Res = MI.getOperand(0).getReg();
4053   Register LHS = MI.getOperand(2).getReg();
4054   Register RHS = MI.getOperand(3).getReg();
4055   uint16_t Flags = MI.getFlags();
4056 
4057   LLT S32 = LLT::scalar(32);
4058   LLT S1 = LLT::scalar(1);
4059 
4060   auto Abs = B.buildFAbs(S32, RHS, Flags);
4061   const APFloat C0Val(1.0f);
4062 
4063   auto C0 = B.buildConstant(S32, 0x6f800000);
4064   auto C1 = B.buildConstant(S32, 0x2f800000);
4065   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4066 
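  // If |RHS| > 2**96 (0x6f800000), pre-scale the denominator by 2**-32
  // (0x2f800000) before taking the reciprocal and scale the quotient back by
  // the same factor; otherwise the scale is 1.0. Presumably this keeps the
  // reciprocal and the intermediate product in range for huge denominators.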
4067   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4068   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4069 
4070   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4071 
4072   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4073     .addUse(Mul0.getReg(0))
4074     .setMIFlags(Flags);
4075 
4076   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4077 
4078   B.buildFMul(Res, Sel, Mul1, Flags);
4079 
4080   MI.eraseFromParent();
4081   return true;
4082 }
4083 
4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4085 // FIXME: Why do we handle this one but not other removed instructions?
4086 //
4087 // Reciprocal square root.  The clamp prevents infinite results, clamping
4088 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4089 // +-max_float.
4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4091                                                     MachineRegisterInfo &MRI,
4092                                                     MachineIRBuilder &B) const {
4093   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4094     return true;
4095 
4096   Register Dst = MI.getOperand(0).getReg();
4097   Register Src = MI.getOperand(2).getReg();
4098   auto Flags = MI.getFlags();
4099 
4100   LLT Ty = MRI.getType(Dst);
4101 
4102   const fltSemantics *FltSemantics;
4103   if (Ty == LLT::scalar(32))
4104     FltSemantics = &APFloat::IEEEsingle();
4105   else if (Ty == LLT::scalar(64))
4106     FltSemantics = &APFloat::IEEEdouble();
4107   else
4108     return false;
4109 
4110   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4111     .addUse(Src)
4112     .setMIFlags(Flags);
4113 
4114   // We don't need to concern ourselves with the snan handling difference, since
4115   // the rsq already quieted (or not) the value; use the min that directly selects.
4116   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4117   const bool UseIEEE = MFI->getMode().IEEE;
4118 
4119   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4120   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4122 
4123   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4124 
4125   if (UseIEEE)
4126     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4127   else
4128     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129   MI.eraseFromParent();
4130   return true;
4131 }
4132 
4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4134   switch (IID) {
4135   case Intrinsic::amdgcn_ds_fadd:
4136     return AMDGPU::G_ATOMICRMW_FADD;
4137   case Intrinsic::amdgcn_ds_fmin:
4138     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4139   case Intrinsic::amdgcn_ds_fmax:
4140     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4141   default:
4142     llvm_unreachable("not a DS FP intrinsic");
4143   }
4144 }
4145 
4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4147                                                       MachineInstr &MI,
4148                                                       Intrinsic::ID IID) const {
4149   GISelChangeObserver &Observer = Helper.Observer;
4150   Observer.changingInstr(MI);
4151 
4152   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4153 
4154   // The remaining operands were used to set fields in the MemOperand on
4155   // construction.
4156   for (int I = 6; I > 3; --I)
4157     MI.removeOperand(I);
4158 
4159   MI.removeOperand(1); // Remove the intrinsic ID.
4160   Observer.changedInstr(MI);
4161   return true;
4162 }
4163 
4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4165                                             MachineRegisterInfo &MRI,
4166                                             MachineIRBuilder &B) const {
4167   uint64_t Offset =
4168     ST.getTargetLowering()->getImplicitParameterOffset(
4169       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4170   LLT DstTy = MRI.getType(DstReg);
4171   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4172 
4173   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4174   if (!loadInputValue(KernargPtrReg, B,
4175                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4176     return false;
4177 
4178   // FIXME: This should be nuw
4179   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4180   return true;
4181 }
4182 
4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4184                                                  MachineRegisterInfo &MRI,
4185                                                  MachineIRBuilder &B) const {
4186   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4187   if (!MFI->isEntryFunction()) {
4188     return legalizePreloadedArgIntrin(MI, MRI, B,
4189                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4190   }
4191 
4192   Register DstReg = MI.getOperand(0).getReg();
4193   if (!getImplicitArgPtr(DstReg, MRI, B))
4194     return false;
4195 
4196   MI.eraseFromParent();
4197   return true;
4198 }
4199 
4200 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4201                                               MachineRegisterInfo &MRI,
4202                                               MachineIRBuilder &B,
4203                                               unsigned AddrSpace) const {
4204   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4205   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4206   Register Hi32 = Unmerge.getReg(1);
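  // A flat address lies in the queried segment iff its high 32 bits equal
  // that segment's aperture base.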
4207 
4208   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4209   MI.eraseFromParent();
4210   return true;
4211 }
4212 
4213 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4214 // offset (the offset that is included in bounds checking and swizzling, to be
4215 // split between the instruction's voffset and immoffset fields) and soffset
4216 // (the offset that is excluded from bounds checking and swizzling, to go in
4217 // the instruction's soffset field).  This function takes the first kind of
4218 // offset and figures out how to split it between voffset and immoffset.
4219 std::pair<Register, unsigned>
4220 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4221                                         Register OrigOffset) const {
4222   const unsigned MaxImm = 4095;
4223   Register BaseReg;
4224   unsigned ImmOffset;
4225   const LLT S32 = LLT::scalar(32);
4226   MachineRegisterInfo &MRI = *B.getMRI();
4227 
4228   std::tie(BaseReg, ImmOffset) =
4229       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4230 
4231   // If BaseReg is a pointer, convert it to int.
4232   if (MRI.getType(BaseReg).isPointer())
4233     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4234 
4235   // If the immediate value is too big for the immoffset field, keep only its
4236   // low 12 bits there and move the remainder (a multiple of 4096) into the
4237   // copy/add for the voffset field, so that it stands more chance of being
4238   // CSEd with the copy/add for another similar load/store.
4239   // However, do not do that rounding down to a multiple of 4096 if that is a
4240   // negative number, as it appears to be illegal to have a negative offset
4241   // in the vgpr, even if adding the immediate offset makes it positive.
4242   unsigned Overflow = ImmOffset & ~MaxImm;
4243   ImmOffset -= Overflow;
4244   if ((int32_t)Overflow < 0) {
4245     Overflow += ImmOffset;
4246     ImmOffset = 0;
4247   }
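  // For example, a constant offset of 5000 is split into ImmOffset = 904 and
  // an extra 4096 added into the voffset value below.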
4248 
4249   if (Overflow != 0) {
4250     if (!BaseReg) {
4251       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4252     } else {
4253       auto OverflowVal = B.buildConstant(S32, Overflow);
4254       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4255     }
4256   }
4257 
4258   if (!BaseReg)
4259     BaseReg = B.buildConstant(S32, 0).getReg(0);
4260 
4261   return std::make_pair(BaseReg, ImmOffset);
4262 }
4263 
4264 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4265 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4266                                           Register VOffset, Register SOffset,
4267                                           unsigned ImmOffset, Register VIndex,
4268                                           MachineRegisterInfo &MRI) const {
4269   Optional<ValueAndVReg> MaybeVOffsetVal =
4270       getIConstantVRegValWithLookThrough(VOffset, MRI);
4271   Optional<ValueAndVReg> MaybeSOffsetVal =
4272       getIConstantVRegValWithLookThrough(SOffset, MRI);
4273   Optional<ValueAndVReg> MaybeVIndexVal =
4274       getIConstantVRegValWithLookThrough(VIndex, MRI);
4275   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4276   // update the MMO with that offset. The stride is unknown so we can only do
4277   // this if VIndex is constant 0.
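       // For illustration (arbitrary values): voffset = 16, soffset = 32,
       // immoffset = 4 and vindex = 0 fold into a single MMO offset of 52.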
4278   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4279       MaybeVIndexVal->Value == 0) {
4280     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4281                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4282     MMO->setOffset(TotalOffset);
4283   } else {
4284     // We don't have a constant combined offset to use in the MMO. Give up.
4285     MMO->setValue((Value *)nullptr);
4286   }
4287 }
4288 
4289 /// Handle register layout difference for f16 images for some subtargets.
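     /// Roughly: on unpacked-D16 subtargets each s16 element is any-extended
     /// into its own s32 lane, e.g. <4 x s16> becomes <4 x s32>; subtargets
     /// with the image-store D16 bug instead get padded out to a whole number
     /// of dwords before the store.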
4290 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4291                                              MachineRegisterInfo &MRI,
4292                                              Register Reg,
4293                                              bool ImageStore) const {
4294   const LLT S16 = LLT::scalar(16);
4295   const LLT S32 = LLT::scalar(32);
4296   LLT StoreVT = MRI.getType(Reg);
4297   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4298 
4299   if (ST.hasUnpackedD16VMem()) {
4300     auto Unmerge = B.buildUnmerge(S16, Reg);
4301 
4302     SmallVector<Register, 4> WideRegs;
4303     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4304       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4305 
4306     int NumElts = StoreVT.getNumElements();
4307 
4308     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4309         .getReg(0);
4310   }
4311 
4312   if (ImageStore && ST.hasImageStoreD16Bug()) {
4313     if (StoreVT.getNumElements() == 2) {
4314       SmallVector<Register, 4> PackedRegs;
4315       Reg = B.buildBitcast(S32, Reg).getReg(0);
4316       PackedRegs.push_back(Reg);
4317       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4318       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4319           .getReg(0);
4320     }
4321 
4322     if (StoreVT.getNumElements() == 3) {
4323       SmallVector<Register, 4> PackedRegs;
4324       auto Unmerge = B.buildUnmerge(S16, Reg);
4325       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4326         PackedRegs.push_back(Unmerge.getReg(I));
4327       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4328       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4329       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4330     }
4331 
4332     if (StoreVT.getNumElements() == 4) {
4333       SmallVector<Register, 4> PackedRegs;
4334       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4335       auto Unmerge = B.buildUnmerge(S32, Reg);
4336       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4337         PackedRegs.push_back(Unmerge.getReg(I));
4338       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4339       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4340           .getReg(0);
4341     }
4342 
4343     llvm_unreachable("invalid data type");
4344   }
4345 
4346   if (StoreVT == LLT::fixed_vector(3, S16)) {
4347     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4348               .getReg(0);
4349   }
4350   return Reg;
4351 }
4352 
4353 Register AMDGPULegalizerInfo::fixStoreSourceType(
4354   MachineIRBuilder &B, Register VData, bool IsFormat) const {
4355   MachineRegisterInfo *MRI = B.getMRI();
4356   LLT Ty = MRI->getType(VData);
4357 
4358   const LLT S16 = LLT::scalar(16);
4359 
4360   // Fixup illegal register types for i8 stores.
4361   if (Ty == LLT::scalar(8) || Ty == S16) {
4362     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4363     return AnyExt;
4364   }
4365 
4366   if (Ty.isVector()) {
4367     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4368       if (IsFormat)
4369         return handleD16VData(B, *MRI, VData);
4370     }
4371   }
4372 
4373   return VData;
4374 }
4375 
4376 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4377                                               MachineRegisterInfo &MRI,
4378                                               MachineIRBuilder &B,
4379                                               bool IsTyped,
4380                                               bool IsFormat) const {
4381   Register VData = MI.getOperand(1).getReg();
4382   LLT Ty = MRI.getType(VData);
4383   LLT EltTy = Ty.getScalarType();
4384   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4385   const LLT S32 = LLT::scalar(32);
4386 
4387   VData = fixStoreSourceType(B, VData, IsFormat);
4388   Register RSrc = MI.getOperand(2).getReg();
4389 
4390   MachineMemOperand *MMO = *MI.memoperands_begin();
4391   const int MemSize = MMO->getSize();
4392 
4393   unsigned ImmOffset;
4394 
4395   // The typed intrinsics add an immediate after the registers.
4396   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4397 
4398   // The struct intrinsic variants add one additional operand over raw.
4399   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4400   Register VIndex;
4401   int OpOffset = 0;
4402   if (HasVIndex) {
4403     VIndex = MI.getOperand(3).getReg();
4404     OpOffset = 1;
4405   } else {
4406     VIndex = B.buildConstant(S32, 0).getReg(0);
4407   }
4408 
4409   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4410   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4411 
4412   unsigned Format = 0;
4413   if (IsTyped) {
4414     Format = MI.getOperand(5 + OpOffset).getImm();
4415     ++OpOffset;
4416   }
4417 
4418   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4419 
4420   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4421   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4422 
4423   unsigned Opc;
4424   if (IsTyped) {
4425     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4426                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4427   } else if (IsFormat) {
4428     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4429                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4430   } else {
4431     switch (MemSize) {
4432     case 1:
4433       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4434       break;
4435     case 2:
4436       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4437       break;
4438     default:
4439       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4440       break;
4441     }
4442   }
4443 
4444   auto MIB = B.buildInstr(Opc)
4445     .addUse(VData)              // vdata
4446     .addUse(RSrc)               // rsrc
4447     .addUse(VIndex)             // vindex
4448     .addUse(VOffset)            // voffset
4449     .addUse(SOffset)            // soffset
4450     .addImm(ImmOffset);         // offset(imm)
4451 
4452   if (IsTyped)
4453     MIB.addImm(Format);
4454 
4455   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4456      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4457      .addMemOperand(MMO);
4458 
4459   MI.eraseFromParent();
4460   return true;
4461 }
4462 
4463 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4464                                              MachineRegisterInfo &MRI,
4465                                              MachineIRBuilder &B,
4466                                              bool IsFormat,
4467                                              bool IsTyped) const {
4468   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4469   MachineMemOperand *MMO = *MI.memoperands_begin();
4470   const LLT MemTy = MMO->getMemoryType();
4471   const LLT S32 = LLT::scalar(32);
4472 
4473   Register Dst = MI.getOperand(0).getReg();
4474   Register RSrc = MI.getOperand(2).getReg();
4475 
4476   // The typed intrinsics add an immediate after the registers.
4477   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4478 
4479   // The struct intrinsic variants add one additional operand over raw.
4480   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4481   Register VIndex;
4482   int OpOffset = 0;
4483   if (HasVIndex) {
4484     VIndex = MI.getOperand(3).getReg();
4485     OpOffset = 1;
4486   } else {
4487     VIndex = B.buildConstant(S32, 0).getReg(0);
4488   }
4489 
4490   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4491   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4492 
4493   unsigned Format = 0;
4494   if (IsTyped) {
4495     Format = MI.getOperand(5 + OpOffset).getImm();
4496     ++OpOffset;
4497   }
4498 
4499   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4500   unsigned ImmOffset;
4501 
4502   LLT Ty = MRI.getType(Dst);
4503   LLT EltTy = Ty.getScalarType();
4504   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4505   const bool Unpacked = ST.hasUnpackedD16VMem();
4506 
4507   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4508   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4509 
4510   unsigned Opc;
4511 
4512   if (IsTyped) {
4513     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4514                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4515   } else if (IsFormat) {
4516     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4517                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4518   } else {
4519     switch (MemTy.getSizeInBits()) {
4520     case 8:
4521       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4522       break;
4523     case 16:
4524       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4525       break;
4526     default:
4527       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4528       break;
4529     }
4530   }
4531 
4532   Register LoadDstReg;
4533 
4534   bool IsExtLoad =
4535       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4536   LLT UnpackedTy = Ty.changeElementSize(32);
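       // Roughly: sub-dword loads and scalar d16 loads are produced into a full
       // s32 and truncated below, while unpacked d16 vector results come back
       // one element per dword and are repacked after the load.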
4537 
4538   if (IsExtLoad)
4539     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4540   else if (Unpacked && IsD16 && Ty.isVector())
4541     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4542   else
4543     LoadDstReg = Dst;
4544 
4545   auto MIB = B.buildInstr(Opc)
4546     .addDef(LoadDstReg)         // vdata
4547     .addUse(RSrc)               // rsrc
4548     .addUse(VIndex)             // vindex
4549     .addUse(VOffset)            // voffset
4550     .addUse(SOffset)            // soffset
4551     .addImm(ImmOffset);         // offset(imm)
4552 
4553   if (IsTyped)
4554     MIB.addImm(Format);
4555 
4556   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4557      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4558      .addMemOperand(MMO);
4559 
4560   if (LoadDstReg != Dst) {
4561     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4562 
4563     // The result was widened for the extending load; truncate it back down.
4564     if (IsExtLoad)
4565       B.buildTrunc(Dst, LoadDstReg);
4566     else {
4567       // Repack to original 16-bit vector result
4568       // FIXME: G_TRUNC should work, but legalization currently fails
4569       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4570       SmallVector<Register, 4> Repack;
4571       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4572         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4573       B.buildMerge(Dst, Repack);
4574     }
4575   }
4576 
4577   MI.eraseFromParent();
4578   return true;
4579 }
4580 
4581 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4582                                                MachineIRBuilder &B,
4583                                                bool IsInc) const {
4584   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4585                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4586   B.buildInstr(Opc)
4587     .addDef(MI.getOperand(0).getReg())
4588     .addUse(MI.getOperand(2).getReg())
4589     .addUse(MI.getOperand(3).getReg())
4590     .cloneMemRefs(MI);
4591   MI.eraseFromParent();
4592   return true;
4593 }
4594 
4595 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4596   switch (IntrID) {
4597   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4598   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4599     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4600   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4601   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4602     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4603   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4604   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4605     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4606   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4607   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4608     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4609   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4610   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4611     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4612   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4613   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4614     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4615   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4616   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4617     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4618   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4619   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4620     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4621   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4622   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4623     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4624   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4625   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4626     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4627   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4628   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4629     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4630   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4631   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4632     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4633   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4634   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4635     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4636   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4637   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4638     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4639   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4640   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4641     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4642   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4643   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4644     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4645   default:
4646     llvm_unreachable("unhandled atomic opcode");
4647   }
4648 }
4649 
4650 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4651                                                MachineIRBuilder &B,
4652                                                Intrinsic::ID IID) const {
4653   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4654                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4655   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4656 
4657   Register Dst;
4658 
4659   int OpOffset = 0;
4660   if (HasReturn) {
4661     // A few FP atomics do not support return values.
4662     Dst = MI.getOperand(0).getReg();
4663   } else {
4664     OpOffset = -1;
4665   }
4666 
4667   Register VData = MI.getOperand(2 + OpOffset).getReg();
4668   Register CmpVal;
4669 
4670   if (IsCmpSwap) {
4671     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4672     ++OpOffset;
4673   }
4674 
4675   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4676   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
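       // For illustration: a struct cmpswap with a return value carries
       // dst, intrinsic id, src, cmp, rsrc, vindex, voffset, soffset and aux,
       // i.e. 9 operands, matching (8 + HasReturn) above.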
4677 
4678   // The struct intrinsic variants add one additional operand over raw.
4679   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4680   Register VIndex;
4681   if (HasVIndex) {
4682     VIndex = MI.getOperand(4 + OpOffset).getReg();
4683     ++OpOffset;
4684   } else {
4685     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4686   }
4687 
4688   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4689   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4690   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4691 
4692   MachineMemOperand *MMO = *MI.memoperands_begin();
4693 
4694   unsigned ImmOffset;
4695   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4696   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4697 
4698   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4699 
4700   if (HasReturn)
4701     MIB.addDef(Dst);
4702 
4703   MIB.addUse(VData); // vdata
4704 
4705   if (IsCmpSwap)
4706     MIB.addReg(CmpVal);
4707 
4708   MIB.addUse(RSrc)               // rsrc
4709      .addUse(VIndex)             // vindex
4710      .addUse(VOffset)            // voffset
4711      .addUse(SOffset)            // soffset
4712      .addImm(ImmOffset)          // offset(imm)
4713      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4714      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4715      .addMemOperand(MMO);
4716 
4717   MI.eraseFromParent();
4718   return true;
4719 }
4720 
4721 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4722 /// vector with s16 typed elements.
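     /// For illustration: three s16 coordinates (x, y, z) are emitted as two
     /// <2 x s16> registers, (x, y) and (z, undef); 32-bit operands that must
     /// stay unpacked are bitcast to <2 x s16> instead.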
4723 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4724                                       SmallVectorImpl<Register> &PackedAddrs,
4725                                       unsigned ArgOffset,
4726                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4727                                       bool IsA16, bool IsG16) {
4728   const LLT S16 = LLT::scalar(16);
4729   const LLT V2S16 = LLT::fixed_vector(2, 16);
4730   auto EndIdx = Intr->VAddrEnd;
4731 
4732   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4733     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4734     if (!SrcOp.isReg())
4735       continue; // _L to _LZ may have eliminated this.
4736 
4737     Register AddrReg = SrcOp.getReg();
4738 
4739     if ((I < Intr->GradientStart) ||
4740         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4741         (I >= Intr->CoordStart && !IsA16)) {
4742       if ((I < Intr->GradientStart) && IsA16 &&
4743           (B.getMRI()->getType(AddrReg) == S16)) {
4744         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4745         // Special handling of bias when A16 is on. Bias is of type half but
4746         // occupies a full 32 bits.
4747         PackedAddrs.push_back(
4748             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4749                 .getReg(0));
4750       } else {
4751         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4752                "Bias needs to be converted to 16 bit in A16 mode");
4753         // Handle any gradient or coordinate operands that should not be packed
4754         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4755         PackedAddrs.push_back(AddrReg);
4756       }
4757     } else {
4758       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4759       // derivatives dx/dh and dx/dv are packed with undef.
4760       if (((I + 1) >= EndIdx) ||
4761           ((Intr->NumGradients / 2) % 2 == 1 &&
4762            (I == static_cast<unsigned>(Intr->GradientStart +
4763                                        (Intr->NumGradients / 2) - 1) ||
4764             I == static_cast<unsigned>(Intr->GradientStart +
4765                                        Intr->NumGradients - 1))) ||
4766           // Check for _L to _LZ optimization
4767           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4768         PackedAddrs.push_back(
4769             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4770                 .getReg(0));
4771       } else {
4772         PackedAddrs.push_back(
4773             B.buildBuildVector(
4774                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4775                 .getReg(0));
4776         ++I;
4777       }
4778     }
4779   }
4780 }
4781 
4782 /// Convert from separate vaddr components to a single vector address register,
4783 /// and replace the remaining operands with $noreg.
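     /// For illustration: three s32 address operands become a single
     /// <3 x s32> build_vector in the first vaddr slot, with the remaining
     /// slots set to $noreg; address counts above eight that are not a power
     /// of two are padded with undef up to the next power of two (e.g. 9
     /// becomes 16).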
4784 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4785                                      int DimIdx, int NumVAddrs) {
4786   const LLT S32 = LLT::scalar(32);
4787 
4788   SmallVector<Register, 8> AddrRegs;
4789   for (int I = 0; I != NumVAddrs; ++I) {
4790     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4791     if (SrcOp.isReg()) {
4792       AddrRegs.push_back(SrcOp.getReg());
4793       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4794     }
4795   }
4796 
4797   int NumAddrRegs = AddrRegs.size();
4798   if (NumAddrRegs != 1) {
4799     // Above 8 elements round up to next power of 2 (i.e. 16).
4800     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4801       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4802       auto Undef = B.buildUndef(S32);
4803       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4804       NumAddrRegs = RoundedNumRegs;
4805     }
4806 
4807     auto VAddr =
4808         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4809     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4810   }
4811 
4812   for (int I = 1; I != NumVAddrs; ++I) {
4813     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4814     if (SrcOp.isReg())
4815       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4816   }
4817 }
4818 
4819 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4820 ///
4821 /// Depending on the subtarget, load/store with 16-bit element data need to be
4822 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4823 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4824 /// registers.
4825 ///
4826 /// We don't want to directly select image instructions just yet, but also want
4827 /// to expose all register repacking to the legalizer/combiners. We also don't
4828 /// want a selected instruction entering RegBankSelect. In order to avoid
4829 /// defining a multitude of intermediate image instructions, directly hack on
4830 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4831 /// padding now unnecessary arguments with $noreg.
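     /// For illustration: with 16-bit addresses on an A16-capable subtarget,
     /// the coordinate operands are packed into <2 x s16> registers and a
     /// trailing flags immediate is appended (bit 0 = a16, bit 1 = g16).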
4832 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4833     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4834     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4835 
4836   const unsigned NumDefs = MI.getNumExplicitDefs();
4837   const unsigned ArgOffset = NumDefs + 1;
4838   bool IsTFE = NumDefs == 2;
4839   // We are only processing the operands of d16 image operations on subtargets
4840   // that use the unpacked register layout, or need to repack the TFE result.
4841 
4842   // TODO: Do we need to guard against already legalized intrinsics?
4843   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4844       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4845 
4846   MachineRegisterInfo *MRI = B.getMRI();
4847   const LLT S32 = LLT::scalar(32);
4848   const LLT S16 = LLT::scalar(16);
4849   const LLT V2S16 = LLT::fixed_vector(2, 16);
4850 
4851   unsigned DMask = 0;
4852   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4853   LLT Ty = MRI->getType(VData);
4854 
4855   // Check for 16-bit addresses and pack them if so.
4856   LLT GradTy =
4857       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4858   LLT AddrTy =
4859       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4860   const bool IsG16 = GradTy == S16;
4861   const bool IsA16 = AddrTy == S16;
4862   const bool IsD16 = Ty.getScalarType() == S16;
4863 
4864   int DMaskLanes = 0;
4865   if (!BaseOpcode->Atomic) {
4866     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4867     if (BaseOpcode->Gather4) {
4868       DMaskLanes = 4;
4869     } else if (DMask != 0) {
4870       DMaskLanes = countPopulation(DMask);
4871     } else if (!IsTFE && !BaseOpcode->Store) {
4872       // If dmask is 0, this is a no-op load. This can be eliminated.
4873       B.buildUndef(MI.getOperand(0));
4874       MI.eraseFromParent();
4875       return true;
4876     }
4877   }
4878 
4879   Observer.changingInstr(MI);
4880   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4881 
4882   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4883                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4884   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4885                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4886   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4887 
4888   // Track that we legalized this
4889   MI.setDesc(B.getTII().get(NewOpcode));
4890 
4891   // Expecting to get an error flag since TFE is on and dmask is 0. Force
4892   // dmask to be at least 1, otherwise the instruction will fail.
4893   if (IsTFE && DMask == 0) {
4894     DMask = 0x1;
4895     DMaskLanes = 1;
4896     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4897   }
4898 
4899   if (BaseOpcode->Atomic) {
4900     Register VData0 = MI.getOperand(2).getReg();
4901     LLT Ty = MRI->getType(VData0);
4902 
4903     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4904     if (Ty.isVector())
4905       return false;
4906 
4907     if (BaseOpcode->AtomicX2) {
4908       Register VData1 = MI.getOperand(3).getReg();
4909       // The two values are packed in one register.
4910       LLT PackedTy = LLT::fixed_vector(2, Ty);
4911       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4912       MI.getOperand(2).setReg(Concat.getReg(0));
4913       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4914     }
4915   }
4916 
4917   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4918 
4919   // Rewrite the addressing register layout before doing anything else.
4920   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4921     // 16-bit gradients are supported, but they are tied to the A16 control,
4922     // so both gradients and addresses must be 16 bit.
4923     return false;
4924   }
4925 
4926   if (IsA16 && !ST.hasA16()) {
4927     // A16 not supported
4928     return false;
4929   }
4930 
4931   if (IsA16 || IsG16) {
4932     if (Intr->NumVAddrs > 1) {
4933       SmallVector<Register, 4> PackedRegs;
4934 
4935       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4936                                 IsG16);
4937 
4938       // See also below in the non-a16 branch
4939       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4940                           PackedRegs.size() <= ST.getNSAMaxSize();
4941 
4942       if (!UseNSA && PackedRegs.size() > 1) {
4943         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4944         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4945         PackedRegs[0] = Concat.getReg(0);
4946         PackedRegs.resize(1);
4947       }
4948 
4949       const unsigned NumPacked = PackedRegs.size();
4950       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4951         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4952         if (!SrcOp.isReg()) {
4953           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4954           continue;
4955         }
4956 
4957         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4958 
4959         if (I - Intr->VAddrStart < NumPacked)
4960           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4961         else
4962           SrcOp.setReg(AMDGPU::NoRegister);
4963       }
4964     }
4965   } else {
4966     // If the register allocator cannot place the address registers contiguously
4967     // without introducing moves, then using the non-sequential address encoding
4968     // is always preferable, since it saves VALU instructions and is usually a
4969     // wash in terms of code size or even better.
4970     //
4971     // However, we currently have no way of hinting to the register allocator
4972     // that MIMG addresses should be placed contiguously when it is possible to
4973     // do so, so force non-NSA for the common 2-address case as a heuristic.
4974     //
4975     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4976     // allocation when possible.
4977     //
4978     // TODO: we can actually allow partial NSA where the final register is a
4979     // contiguous set of the remaining addresses.
4980     // This could help where there are more addresses than supported.
4981     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4982                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4983 
4984     if (!UseNSA && Intr->NumVAddrs > 1)
4985       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4986                                Intr->NumVAddrs);
4987   }
4988 
4989   int Flags = 0;
4990   if (IsA16)
4991     Flags |= 1;
4992   if (IsG16)
4993     Flags |= 2;
4994   MI.addOperand(MachineOperand::CreateImm(Flags));
4995 
4996   if (BaseOpcode->Store) { // No TFE for stores?
4997     // TODO: Handle dmask trim
4998     if (!Ty.isVector() || !IsD16)
4999       return true;
5000 
5001     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5002     if (RepackedReg != VData) {
5003       MI.getOperand(1).setReg(RepackedReg);
5004     }
5005 
5006     return true;
5007   }
5008 
5009   Register DstReg = MI.getOperand(0).getReg();
5010   const LLT EltTy = Ty.getScalarType();
5011   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5012 
5013   // Confirm that the return type is large enough for the dmask specified
5014   if (NumElts < DMaskLanes)
5015     return false;
5016 
5017   if (NumElts > 4 || DMaskLanes > 4)
5018     return false;
5019 
5020   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5021   const LLT AdjustedTy =
5022       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5023 
5024   // The raw dword aligned data component of the load. The only legal cases
5025   // where this matters should be when using the packed D16 format, for
5026   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5027   LLT RoundedTy;
5028 
5029   // S32 vector to cover all the data, plus the TFE result element.
5030   LLT TFETy;
5031 
5032   // Register type to use for each loaded component. Will be S32 or V2S16.
5033   LLT RegTy;
5034 
5035   if (IsD16 && ST.hasUnpackedD16VMem()) {
5036     RoundedTy =
5037         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5038     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5039     RegTy = S32;
5040   } else {
5041     unsigned EltSize = EltTy.getSizeInBits();
5042     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5043     unsigned RoundedSize = 32 * RoundedElts;
5044     RoundedTy = LLT::scalarOrVector(
5045         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5046     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5047     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5048   }
5049 
5050   // The return type does not need adjustment.
5051   // TODO: Should we change s16 case to s32 or <2 x s16>?
5052   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5053     return true;
5054 
5055   Register Dst1Reg;
5056 
5057   // Insert after the instruction.
5058   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5059 
5060   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5061   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5062   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5063   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5064 
5065   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5066 
5067   MI.getOperand(0).setReg(NewResultReg);
5068 
5069   // In the IR, TFE is supposed to be used with a 2 element struct return
5070   // type. The instruction really returns these two values in one contiguous
5071   // register, with one additional dword beyond the loaded data. Rewrite the
5072   // return type to use a single register result.
5073 
5074   if (IsTFE) {
5075     Dst1Reg = MI.getOperand(1).getReg();
5076     if (MRI->getType(Dst1Reg) != S32)
5077       return false;
5078 
5079     // TODO: Make sure the TFE operand bit is set.
5080     MI.removeOperand(1);
5081 
5082     // Handle the easy case that requires no repack instructions.
5083     if (Ty == S32) {
5084       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5085       return true;
5086     }
5087   }
5088 
5089   // Now figure out how to copy the new result register back into the old
5090   // result.
5091   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5092 
5093   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5094 
5095   if (ResultNumRegs == 1) {
5096     assert(!IsTFE);
5097     ResultRegs[0] = NewResultReg;
5098   } else {
5099     // We have to repack into a new vector of some kind.
5100     for (int I = 0; I != NumDataRegs; ++I)
5101       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5102     B.buildUnmerge(ResultRegs, NewResultReg);
5103 
5104     // Drop the final TFE element to get the data part. The TFE result is
5105     // directly written to the right place already.
5106     if (IsTFE)
5107       ResultRegs.resize(NumDataRegs);
5108   }
5109 
5110   // For an s16 scalar result, the loaded data component is s32 regardless of
5111   // packed vs. unpacked, so form the result with a truncate.
5112   if (IsD16 && !Ty.isVector()) {
5113     B.buildTrunc(DstReg, ResultRegs[0]);
5114     return true;
5115   }
5116 
5117   // Avoid a build/concat_vector of 1 entry.
5118   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5119     B.buildBitcast(DstReg, ResultRegs[0]);
5120     return true;
5121   }
5122 
5123   assert(Ty.isVector());
5124 
5125   if (IsD16) {
5126     // For packed D16 results with TFE enabled, all the data components are
5127     // S32. Cast back to the expected type.
5128     //
5129     // TODO: We don't really need to use s32 load elements. We would only
5130     // need one cast for the TFE result if a multiple of v2s16 was used.
5131     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5132       for (Register &Reg : ResultRegs)
5133         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5134     } else if (ST.hasUnpackedD16VMem()) {
5135       for (Register &Reg : ResultRegs)
5136         Reg = B.buildTrunc(S16, Reg).getReg(0);
5137     }
5138   }
5139 
5140   auto padWithUndef = [&](LLT Ty, int NumElts) {
5141     if (NumElts == 0)
5142       return;
5143     Register Undef = B.buildUndef(Ty).getReg(0);
5144     for (int I = 0; I != NumElts; ++I)
5145       ResultRegs.push_back(Undef);
5146   };
5147 
5148   // Pad out any elements eliminated due to the dmask.
5149   LLT ResTy = MRI->getType(ResultRegs[0]);
5150   if (!ResTy.isVector()) {
5151     padWithUndef(ResTy, NumElts - ResultRegs.size());
5152     B.buildBuildVector(DstReg, ResultRegs);
5153     return true;
5154   }
5155 
5156   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5157   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5158 
5159   // Deal with the one annoying legal case.
5160   const LLT V3S16 = LLT::fixed_vector(3, 16);
5161   if (Ty == V3S16) {
5162     if (IsTFE) {
5163       if (ResultRegs.size() == 1) {
5164         NewResultReg = ResultRegs[0];
5165       } else if (ResultRegs.size() == 2) {
5166         LLT V4S16 = LLT::fixed_vector(4, 16);
5167         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5168       } else {
5169         return false;
5170       }
5171     }
5172 
5173     if (MRI->getType(DstReg).getNumElements() <
5174         MRI->getType(NewResultReg).getNumElements()) {
5175       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5176     } else {
5177       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5178     }
5179     return true;
5180   }
5181 
5182   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5183   B.buildConcatVectors(DstReg, ResultRegs);
5184   return true;
5185 }
5186 
5187 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5188   LegalizerHelper &Helper, MachineInstr &MI) const {
5189   MachineIRBuilder &B = Helper.MIRBuilder;
5190   GISelChangeObserver &Observer = Helper.Observer;
5191 
5192   Register Dst = MI.getOperand(0).getReg();
5193   LLT Ty = B.getMRI()->getType(Dst);
5194   unsigned Size = Ty.getSizeInBits();
5195   MachineFunction &MF = B.getMF();
5196 
5197   Observer.changingInstr(MI);
5198 
5199   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5200     Ty = getBitcastRegisterType(Ty);
5201     Helper.bitcastDst(MI, Ty, 0);
5202     Dst = MI.getOperand(0).getReg();
5203     B.setInsertPt(B.getMBB(), MI);
5204   }
5205 
5206   // FIXME: We don't really need this intermediate instruction. The intrinsic
5207   // should be fixed to have a memory operand. Since it's readnone, we're not
5208   // allowed to add one.
5209   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5210   MI.removeOperand(1); // Remove intrinsic ID
5211 
5212   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5213   // TODO: Should this use datalayout alignment?
5214   const unsigned MemSize = (Size + 7) / 8;
5215   const Align MemAlign(4);
5216   MachineMemOperand *MMO = MF.getMachineMemOperand(
5217       MachinePointerInfo(),
5218       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5219           MachineMemOperand::MOInvariant,
5220       MemSize, MemAlign);
5221   MI.addMemOperand(MF, MMO);
5222 
5223   // There are no 96-bit result scalar loads, but widening to 128-bit should
5224   // always be legal. We may need to restore this to a 96-bit result if it turns
5225   // out this needs to be converted to a vector load during RegBankSelect.
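       // For illustration: a <3 x s32> result is widened to <4 x s32> here, and
       // an s96 result to s128.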
5226   if (!isPowerOf2_32(Size)) {
5227     if (Ty.isVector())
5228       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5229     else
5230       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5231   }
5232 
5233   Observer.changedInstr(MI);
5234   return true;
5235 }
5236 
5237 // TODO: Move to selection
5238 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5239                                                 MachineRegisterInfo &MRI,
5240                                                 MachineIRBuilder &B) const {
5241   if (!ST.isTrapHandlerEnabled() ||
5242       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5243     return legalizeTrapEndpgm(MI, MRI, B);
5244 
5245   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5246     switch (*HsaAbiVer) {
5247     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5248     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5249       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5250     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5251     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5252       return ST.supportsGetDoorbellID() ?
5253           legalizeTrapHsa(MI, MRI, B) :
5254           legalizeTrapHsaQueuePtr(MI, MRI, B);
5255     }
5256   }
5257 
5258   llvm_unreachable("Unknown trap handler");
5259 }
5260 
5261 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5262     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5263   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5264   MI.eraseFromParent();
5265   return true;
5266 }
5267 
5268 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5269     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5270   MachineFunction &MF = B.getMF();
5271   const LLT S64 = LLT::scalar(64);
5272 
5273   Register SGPR01(AMDGPU::SGPR0_SGPR1);
5274   // For code object version 5, queue_ptr is passed through implicit kernarg.
5275   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5276     AMDGPUTargetLowering::ImplicitParameter Param =
5277         AMDGPUTargetLowering::QUEUE_PTR;
5278     uint64_t Offset =
5279         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5280 
5281     Register KernargPtrReg = MRI.createGenericVirtualRegister(
5282         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5283 
5284     if (!loadInputValue(KernargPtrReg, B,
5285                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5286       return false;
5287 
5288     // TODO: can we be smarter about machine pointer info?
5289     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5290     MachineMemOperand *MMO = MF.getMachineMemOperand(
5291         PtrInfo,
5292         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5293             MachineMemOperand::MOInvariant,
5294         LLT::scalar(64), commonAlignment(Align(64), Offset));
5295 
5296     // Pointer address
5297     Register LoadAddr = MRI.createGenericVirtualRegister(
5298         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5299     B.buildPtrAdd(LoadAddr, KernargPtrReg,
5300                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5301     // Load address
5302     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5303     B.buildCopy(SGPR01, Temp);
5304     B.buildInstr(AMDGPU::S_TRAP)
5305         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5306         .addReg(SGPR01, RegState::Implicit);
5307     MI.eraseFromParent();
5308     return true;
5309   }
5310 
5311   // Pass queue pointer to trap handler as input, and insert trap instruction
5312   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5313   Register LiveIn =
5314     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5315   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5316     return false;
5317 
5318   B.buildCopy(SGPR01, LiveIn);
5319   B.buildInstr(AMDGPU::S_TRAP)
5320       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5321       .addReg(SGPR01, RegState::Implicit);
5322 
5323   MI.eraseFromParent();
5324   return true;
5325 }
5326 
5327 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5328     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5329   B.buildInstr(AMDGPU::S_TRAP)
5330       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5331   MI.eraseFromParent();
5332   return true;
5333 }
5334 
5335 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5336     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5337   // If this is a non-HSA path or the trap handler is disabled, report a
5338   // warning accordingly.
5339   if (!ST.isTrapHandlerEnabled() ||
5340       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5341     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5342                                      "debugtrap handler not supported",
5343                                      MI.getDebugLoc(), DS_Warning);
5344     LLVMContext &Ctx = B.getMF().getFunction().getContext();
5345     Ctx.diagnose(NoTrap);
5346   } else {
5347     // Insert debug-trap instruction
5348     B.buildInstr(AMDGPU::S_TRAP)
5349         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5350   }
5351 
5352   MI.eraseFromParent();
5353   return true;
5354 }
5355 
5356 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5357                                                MachineIRBuilder &B) const {
5358   MachineRegisterInfo &MRI = *B.getMRI();
5359   const LLT S16 = LLT::scalar(16);
5360   const LLT S32 = LLT::scalar(32);
5361   const LLT V2S16 = LLT::fixed_vector(2, 16);
5362   const LLT V3S32 = LLT::fixed_vector(3, 32);
5363 
5364   Register DstReg = MI.getOperand(0).getReg();
5365   Register NodePtr = MI.getOperand(2).getReg();
5366   Register RayExtent = MI.getOperand(3).getReg();
5367   Register RayOrigin = MI.getOperand(4).getReg();
5368   Register RayDir = MI.getOperand(5).getReg();
5369   Register RayInvDir = MI.getOperand(6).getReg();
5370   Register TDescr = MI.getOperand(7).getReg();
5371 
5372   if (!ST.hasGFX10_AEncoding()) {
5373     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5374                                         "intrinsic not supported on subtarget",
5375                                         MI.getDebugLoc());
5376     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5377     return false;
5378   }
5379 
5380   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5381   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5382   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5383   const unsigned NumVDataDwords = 4;
5384   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5385   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5386   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
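       // For illustration: a 64-bit node pointer with 32-bit ray data uses
       // 2 + 1 + 3 * 3 = 12 vaddr dwords (node ptr, ray extent, then origin,
       // dir and inverse dir); with a16 the dir/inv_dir halves pack into
       // 3 dwords, giving 9.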
5387   const unsigned BaseOpcodes[2][2] = {
5388       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5389       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5390        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5391   int Opcode;
5392   if (UseNSA) {
5393     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5394                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5395                                                : AMDGPU::MIMGEncGfx10NSA,
5396                                    NumVDataDwords, NumVAddrDwords);
5397   } else {
5398     Opcode = AMDGPU::getMIMGOpcode(
5399         BaseOpcodes[Is64][IsA16],
5400         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5401         NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
5402   }
5403   assert(Opcode != -1);
5404 
5405   SmallVector<Register, 12> Ops;
5406   if (UseNSA && IsGFX11Plus) {
5407     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5408       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5409       auto Merged = B.buildMerge(
5410           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5411       Ops.push_back(Merged.getReg(0));
5412     };
5413 
5414     Ops.push_back(NodePtr);
5415     Ops.push_back(RayExtent);
5416     packLanes(RayOrigin);
5417 
5418     if (IsA16) {
5419       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5420       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5421       auto MergedDir = B.buildMerge(
5422           V3S32,
5423           {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
5424                                                     UnmergeRayDir.getReg(0)}))
5425                .getReg(0),
5426            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
5427                                                     UnmergeRayDir.getReg(1)}))
5428                .getReg(0),
5429            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
5430                                                     UnmergeRayDir.getReg(2)}))
5431                .getReg(0)});
5432       Ops.push_back(MergedDir.getReg(0));
5433     } else {
5434       packLanes(RayDir);
5435       packLanes(RayInvDir);
5436     }
5437   } else {
5438     if (Is64) {
5439       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5440       Ops.push_back(Unmerge.getReg(0));
5441       Ops.push_back(Unmerge.getReg(1));
5442     } else {
5443       Ops.push_back(NodePtr);
5444     }
5445     Ops.push_back(RayExtent);
5446 
5447     auto packLanes = [&Ops, &S32, &B](Register Src) {
5448       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5449       Ops.push_back(Unmerge.getReg(0));
5450       Ops.push_back(Unmerge.getReg(1));
5451       Ops.push_back(Unmerge.getReg(2));
5452     };
5453 
5454     packLanes(RayOrigin);
5455     if (IsA16) {
5456       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5457       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5458       Register R1 = MRI.createGenericVirtualRegister(S32);
5459       Register R2 = MRI.createGenericVirtualRegister(S32);
5460       Register R3 = MRI.createGenericVirtualRegister(S32);
5461       B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5462       B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5463       B.buildMerge(R3,
5464                    {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5465       Ops.push_back(R1);
5466       Ops.push_back(R2);
5467       Ops.push_back(R3);
5468     } else {
5469       packLanes(RayDir);
5470       packLanes(RayInvDir);
5471     }
5472   }
5473 
5474   if (!UseNSA) {
5475     // Build a single vector containing all the operands prepared so far.
5476     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5477     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5478     Ops.clear();
5479     Ops.push_back(MergedOps);
5480   }
5481 
5482   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5483     .addDef(DstReg)
5484     .addImm(Opcode);
5485 
5486   for (Register R : Ops) {
5487     MIB.addUse(R);
5488   }
5489 
5490   MIB.addUse(TDescr)
5491      .addImm(IsA16 ? 1 : 0)
5492      .cloneMemRefs(MI);
5493 
5494   MI.eraseFromParent();
5495   return true;
5496 }
5497 
5498 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5499                                                MachineIRBuilder &B) const {
5500   unsigned Opc;
5501   int RoundMode = MI.getOperand(2).getImm();
5502 
5503   if (RoundMode == (int)RoundingMode::TowardPositive)
5504     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5505   else if (RoundMode == (int)RoundingMode::TowardNegative)
5506     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5507   else
5508     return false;
5509 
5510   B.buildInstr(Opc)
5511       .addDef(MI.getOperand(0).getReg())
5512       .addUse(MI.getOperand(1).getReg());
5513 
5514   MI.eraseFromParent();
5515 
5516   return true;
5517 }
5518 
5519 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5520                                             MachineInstr &MI) const {
5521   MachineIRBuilder &B = Helper.MIRBuilder;
5522   MachineRegisterInfo &MRI = *B.getMRI();
5523 
5524   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
5525   auto IntrID = MI.getIntrinsicID();
5526   switch (IntrID) {
5527   case Intrinsic::amdgcn_if:
5528   case Intrinsic::amdgcn_else: {
5529     MachineInstr *Br = nullptr;
5530     MachineBasicBlock *UncondBrTarget = nullptr;
5531     bool Negated = false;
5532     if (MachineInstr *BrCond =
5533             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5534       const SIRegisterInfo *TRI
5535         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5536 
5537       Register Def = MI.getOperand(1).getReg();
5538       Register Use = MI.getOperand(3).getReg();
5539 
5540       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5541 
5542       if (Negated)
5543         std::swap(CondBrTarget, UncondBrTarget);
5544 
5545       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5546       if (IntrID == Intrinsic::amdgcn_if) {
5547         B.buildInstr(AMDGPU::SI_IF)
5548           .addDef(Def)
5549           .addUse(Use)
5550           .addMBB(UncondBrTarget);
5551       } else {
5552         B.buildInstr(AMDGPU::SI_ELSE)
5553             .addDef(Def)
5554             .addUse(Use)
5555             .addMBB(UncondBrTarget);
5556       }
5557 
5558       if (Br) {
5559         Br->getOperand(0).setMBB(CondBrTarget);
5560       } else {
5561         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5562         // since we're swapping branch targets it needs to be reinserted.
5563         // FIXME: IRTranslator should probably not do this
5564         B.buildBr(*CondBrTarget);
5565       }
5566 
5567       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5568       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5569       MI.eraseFromParent();
5570       BrCond->eraseFromParent();
5571       return true;
5572     }
5573 
5574     return false;
5575   }
5576   case Intrinsic::amdgcn_loop: {
5577     MachineInstr *Br = nullptr;
5578     MachineBasicBlock *UncondBrTarget = nullptr;
5579     bool Negated = false;
5580     if (MachineInstr *BrCond =
5581             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5582       const SIRegisterInfo *TRI
5583         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5584 
5585       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5586       Register Reg = MI.getOperand(2).getReg();
5587 
5588       if (Negated)
5589         std::swap(CondBrTarget, UncondBrTarget);
5590 
5591       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5592       B.buildInstr(AMDGPU::SI_LOOP)
5593         .addUse(Reg)
5594         .addMBB(UncondBrTarget);
5595 
5596       if (Br)
5597         Br->getOperand(0).setMBB(CondBrTarget);
5598       else
5599         B.buildBr(*CondBrTarget);
5600 
5601       MI.eraseFromParent();
5602       BrCond->eraseFromParent();
5603       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5604       return true;
5605     }
5606 
5607     return false;
5608   }
5609   case Intrinsic::amdgcn_kernarg_segment_ptr:
5610     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5611       // This only makes sense to call in a kernel, so just lower to null.
5612       B.buildConstant(MI.getOperand(0).getReg(), 0);
5613       MI.eraseFromParent();
5614       return true;
5615     }
5616 
5617     return legalizePreloadedArgIntrin(
5618       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5619   case Intrinsic::amdgcn_implicitarg_ptr:
5620     return legalizeImplicitArgPtr(MI, MRI, B);
5621   case Intrinsic::amdgcn_workitem_id_x:
5622     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5623                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5624   case Intrinsic::amdgcn_workitem_id_y:
5625     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5626                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5627   case Intrinsic::amdgcn_workitem_id_z:
5628     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5629                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5630   case Intrinsic::amdgcn_workgroup_id_x:
5631     return legalizePreloadedArgIntrin(MI, MRI, B,
5632                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5633   case Intrinsic::amdgcn_workgroup_id_y:
5634     return legalizePreloadedArgIntrin(MI, MRI, B,
5635                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5636   case Intrinsic::amdgcn_workgroup_id_z:
5637     return legalizePreloadedArgIntrin(MI, MRI, B,
5638                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5639   case Intrinsic::amdgcn_dispatch_ptr:
5640     return legalizePreloadedArgIntrin(MI, MRI, B,
5641                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5642   case Intrinsic::amdgcn_queue_ptr:
5643     return legalizePreloadedArgIntrin(MI, MRI, B,
5644                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5645   case Intrinsic::amdgcn_implicit_buffer_ptr:
5646     return legalizePreloadedArgIntrin(
5647       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5648   case Intrinsic::amdgcn_dispatch_id:
5649     return legalizePreloadedArgIntrin(MI, MRI, B,
5650                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5651   case Intrinsic::r600_read_ngroups_x:
5652     // TODO: Emit error for hsa
5653     return legalizeKernargMemParameter(MI, B,
5654                                        SI::KernelInputOffsets::NGROUPS_X);
5655   case Intrinsic::r600_read_ngroups_y:
5656     return legalizeKernargMemParameter(MI, B,
5657                                        SI::KernelInputOffsets::NGROUPS_Y);
5658   case Intrinsic::r600_read_ngroups_z:
5659     return legalizeKernargMemParameter(MI, B,
5660                                        SI::KernelInputOffsets::NGROUPS_Z);
5661   case Intrinsic::r600_read_local_size_x:
5662     // TODO: Could insert G_ASSERT_ZEXT from s16
5663     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5664   case Intrinsic::r600_read_local_size_y:
5665     // TODO: Could insert G_ASSERT_ZEXT from s16
5666     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5667   case Intrinsic::r600_read_local_size_z:
5668     // TODO: Could insert G_ASSERT_ZEXT from s16
5669     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5670   case Intrinsic::r600_read_global_size_x:
5671     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5672   case Intrinsic::r600_read_global_size_y:
5673     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5674   case Intrinsic::r600_read_global_size_z:
5675     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5676   case Intrinsic::amdgcn_fdiv_fast:
5677     return legalizeFDIVFastIntrin(MI, MRI, B);
5678   case Intrinsic::amdgcn_is_shared:
5679     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5680   case Intrinsic::amdgcn_is_private:
5681     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
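  // The wavefront size is a fixed property of the subtarget, so this folds
  // directly to a constant.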
5682   case Intrinsic::amdgcn_wavefrontsize: {
5683     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5684     MI.eraseFromParent();
5685     return true;
5686   }
5687   case Intrinsic::amdgcn_s_buffer_load:
5688     return legalizeSBufferLoad(Helper, MI);
5689   case Intrinsic::amdgcn_raw_buffer_store:
5690   case Intrinsic::amdgcn_struct_buffer_store:
5691     return legalizeBufferStore(MI, MRI, B, false, false);
5692   case Intrinsic::amdgcn_raw_buffer_store_format:
5693   case Intrinsic::amdgcn_struct_buffer_store_format:
5694     return legalizeBufferStore(MI, MRI, B, false, true);
5695   case Intrinsic::amdgcn_raw_tbuffer_store:
5696   case Intrinsic::amdgcn_struct_tbuffer_store:
5697     return legalizeBufferStore(MI, MRI, B, true, true);
5698   case Intrinsic::amdgcn_raw_buffer_load:
5699   case Intrinsic::amdgcn_struct_buffer_load:
5700     return legalizeBufferLoad(MI, MRI, B, false, false);
5701   case Intrinsic::amdgcn_raw_buffer_load_format:
5702   case Intrinsic::amdgcn_struct_buffer_load_format:
5703     return legalizeBufferLoad(MI, MRI, B, true, false);
5704   case Intrinsic::amdgcn_raw_tbuffer_load:
5705   case Intrinsic::amdgcn_struct_tbuffer_load:
5706     return legalizeBufferLoad(MI, MRI, B, true, true);
5707   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5708   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5709   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5710   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5711   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5712   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5713   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5714   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5715   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5716   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5717   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5718   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5719   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5720   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5721   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5722   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5723   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5724   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5725   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5726   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5727   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5728   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5729   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5730   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5731   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5732   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5733   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5734   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5735   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5736   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5737     return legalizeBufferAtomic(MI, B, IntrID);
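  // FP-add buffer atomics need an extra check: subtargets without a returning
  // form of the instruction can only use it when the result is unused, so a
  // live result is diagnosed and replaced with undef below.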
5738   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5739   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5740     Register DstReg = MI.getOperand(0).getReg();
5741     if (!MRI.use_empty(DstReg) &&
5742         !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
5743       Function &F = B.getMF().getFunction();
5744       DiagnosticInfoUnsupported NoFpRet(
5745           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5746           DS_Error);
5747       F.getContext().diagnose(NoFpRet);
5748       B.buildUndef(DstReg);
5749       MI.eraseFromParent();
5750       return true;
5751     }
5752 
5753     return legalizeBufferAtomic(MI, B, IntrID);
5754   }
5755   case Intrinsic::amdgcn_atomic_inc:
5756     return legalizeAtomicIncDec(MI, B, true);
5757   case Intrinsic::amdgcn_atomic_dec:
5758     return legalizeAtomicIncDec(MI, B, false);
5759   case Intrinsic::trap:
5760     return legalizeTrapIntrinsic(MI, MRI, B);
5761   case Intrinsic::debugtrap:
5762     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5763   case Intrinsic::amdgcn_rsq_clamp:
5764     return legalizeRsqClampIntrinsic(MI, MRI, B);
5765   case Intrinsic::amdgcn_ds_fadd:
5766   case Intrinsic::amdgcn_ds_fmin:
5767   case Intrinsic::amdgcn_ds_fmax:
5768     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5769   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5770     return legalizeBVHIntrinsic(MI, B);
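  // All image-dimension intrinsics share a common legalization path keyed off
  // the ImageDimIntrinsicInfo table; anything else needs no further work here.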
5771   default: {
5772     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5773             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5774       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5775     return true;
5776   }
5777   }
5778 
5779   return true;
5780 }
5781