xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (revision 85c17e40926132575d1b98ca1a36b8394fe511cd)
1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements the AMDGPU-specific parts of InstCombine: folds and
11 // simplifications of calls to AMDGPU intrinsics. It uses the target's detailed
12 // information to give more precise results, while the target-independent
13 // InstCombine handles the generic instructions.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 #include <optional>
24 
25 using namespace llvm;
26 using namespace llvm::PatternMatch;
27 
28 #define DEBUG_TYPE "AMDGPUtti"
29 
30 namespace {
31 
32 struct AMDGPUImageDMaskIntrinsic {
33   unsigned Intr;
34 };
35 
36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37 #include "InstCombineTables.inc"
38 
39 } // end anonymous namespace
40 
41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42 //
43 // A single NaN input is folded to minnum, so we rely on that folding for
44 // handling NaNs.
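//
// With NaNs excluded, the median of three values is the larger of the two
// values that are not the overall maximum, which is what the comparisons
// against Max3 below compute.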
45 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46                            const APFloat &Src2) {
47   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
48 
49   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
50   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51   if (Cmp0 == APFloat::cmpEqual)
52     return maxnum(Src1, Src2);
53 
54   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
55   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56   if (Cmp1 == APFloat::cmpEqual)
57     return maxnum(Src0, Src2);
58 
59   return maxnum(Src0, Src1);
60 }
61 
62 // Check if a value can be converted to a 16-bit value without losing
63 // precision.
64 // The value is expected to be either a float (IsFloat = true) or an unsigned
65 // integer (IsFloat = false).
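// For example (illustrative): a constant float 2.0 or an fpext from half can
// be narrowed safely, while a constant like 1.0e10 does not fit in half and is
// rejected by the LosesInfo check below.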
66 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67   Type *VTy = V.getType();
68   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
69     // The value is already 16-bit, so we don't want to convert to 16-bit again!
70     return false;
71   }
72   if (IsFloat) {
73     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
74       // We need to check that casting the value down to a half does not lose
75       // precision.
76       APFloat FloatValue(ConstFloat->getValueAPF());
77       bool LosesInfo = true;
78       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
79                          &LosesInfo);
80       return !LosesInfo;
81     }
82   } else {
83     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
84       // We need to check that casting the value down to an i16 does not lose
85       // precision.
86       APInt IntValue(ConstInt->getValue());
87       return IntValue.getActiveBits() <= 16;
88     }
89   }
90 
91   Value *CastSrc;
92   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
93                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
94   if (IsExt) {
95     Type *CastSrcTy = CastSrc->getType();
96     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
97       return true;
98   }
99 
100   return false;
101 }
102 
103 // Convert a value to 16-bit.
104 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105   Type *VTy = V.getType();
106   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
107     return cast<Instruction>(&V)->getOperand(0);
108   if (VTy->isIntegerTy())
109     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
110   if (VTy->isFloatingPointTy())
111     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
112 
113   llvm_unreachable("Should never be called!");
114 }
115 
116 /// Applies Func to the argument list and overloaded types taken from OldIntr,
117 /// creates a call to intrinsic NewIntr with the modified arguments, and
118 /// replaces InstToReplace with this newly created intrinsic call.
119 static std::optional<Instruction *> modifyIntrinsicCall(
120     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121     InstCombiner &IC,
122     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123         Func) {
124   SmallVector<Type *, 4> ArgTys;
125   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
126     return std::nullopt;
127 
128   SmallVector<Value *, 8> Args(OldIntr.args());
129 
130   // Modify arguments and types
131   Func(Args, ArgTys);
132 
133   CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
134   NewCall->takeName(&OldIntr);
135   NewCall->copyMetadata(OldIntr);
136   if (isa<FPMathOperator>(NewCall))
137     NewCall->copyFastMathFlags(&OldIntr);
138 
139   // Erase and replace uses
140   if (!InstToReplace.getType()->isVoidTy())
141     IC.replaceInstUsesWith(InstToReplace, NewCall);
142 
143   bool RemoveOldIntr = &OldIntr != &InstToReplace;
144 
145   auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
146   if (RemoveOldIntr)
147     IC.eraseInstFromFunction(OldIntr);
148 
149   return RetValue;
150 }
151 
152 static std::optional<Instruction *>
153 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
154                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
155                              IntrinsicInst &II, InstCombiner &IC) {
156   // Optimize _L to _LZ when _L is zero
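  // For example (illustrative, other operands elided):
  //   llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
  //     -> llvm.amdgcn.image.sample.lz.2d(...)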
157   if (const auto *LZMappingInfo =
158           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
159     if (auto *ConstantLod =
160             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
161       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
162         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
163             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
164                                                      ImageDimIntr->Dim);
165         return modifyIntrinsicCall(
166             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
167               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
168             });
169       }
170     }
171   }
172 
173   // Optimize _mip away when 'lod' is zero
174   if (const auto *MIPMappingInfo =
175           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
176     if (auto *ConstantMip =
177             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
178       if (ConstantMip->isZero()) {
179         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
180             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
181                                                      ImageDimIntr->Dim);
182         return modifyIntrinsicCall(
183             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
184               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
185             });
186       }
187     }
188   }
189 
190   // Optimize _bias away when 'bias' is zero
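  // For example (illustrative): llvm.amdgcn.image.sample.b.2d with a constant
  // zero bias becomes llvm.amdgcn.image.sample.2d. The bias is an overloaded
  // operand, so its entry in ArgTys is dropped along with the argument.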
191   if (const auto *BiasMappingInfo =
192           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
193     if (auto *ConstantBias =
194             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
195       if (ConstantBias->isZero()) {
196         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
197             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
198                                                      ImageDimIntr->Dim);
199         return modifyIntrinsicCall(
200             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
201               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
202               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
203             });
204       }
205     }
206   }
207 
208   // Optimize _offset away when 'offset' is zero
209   if (const auto *OffsetMappingInfo =
210           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
211     if (auto *ConstantOffset =
212             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
213       if (ConstantOffset->isZero()) {
214         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
215             AMDGPU::getImageDimIntrinsicByBaseOpcode(
216                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
217         return modifyIntrinsicCall(
218             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
219               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
220             });
221       }
222     }
223   }
224 
225   // Try to use D16
226   if (ST->hasD16Images()) {
227 
228     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
229         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
230 
231     if (BaseOpcode->HasD16) {
232 
233       // If the only use of the image intrinsic is an fptrunc to half, replace
234       // both the fptrunc and the image intrinsic with the equivalent image
235       // intrinsic with the D16 flag.
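      // For example (illustrative):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d...(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes a single image sample call returning <4 x half> (the D16 form).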
236       if (II.hasOneUse()) {
237         Instruction *User = II.user_back();
238 
239         if (User->getOpcode() == Instruction::FPTrunc &&
240             User->getType()->getScalarType()->isHalfTy()) {
241 
242           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
243                                      [&](auto &Args, auto &ArgTys) {
244                                        // Change return type of image intrinsic.
245                                        // Set it to return type of fptrunc.
246                                        ArgTys[0] = User->getType();
247                                      });
248         }
249       }
250     }
251   }
252 
253   // Try to use A16 or G16
254   if (!ST->hasA16() && !ST->hasG16())
255     return std::nullopt;
256 
257   // Address is interpreted as float if the instruction has a sampler or as
258   // unsigned int if there is no sampler.
259   bool HasSampler =
260       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
261   bool FloatCoord = false;
262   // True means derivatives can be converted to 16 bit but coordinates cannot.
263   bool OnlyDerivatives = false;
264 
265   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
266        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
267     Value *Coord = II.getOperand(OperandIndex);
268     // If the values are not derived from 16-bit values, we cannot optimize.
269     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
270       if (OperandIndex < ImageDimIntr->CoordStart ||
271           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
272         return std::nullopt;
273       }
274       // All gradients can be converted, so convert only them
275       OnlyDerivatives = true;
276       break;
277     }
278 
279     assert(OperandIndex == ImageDimIntr->GradientStart ||
280            FloatCoord == Coord->getType()->isFloatingPointTy());
281     FloatCoord = Coord->getType()->isFloatingPointTy();
282   }
283 
284   if (!OnlyDerivatives && !ST->hasA16())
285     OnlyDerivatives = true; // Only supports G16
286 
287   // Check if there is a bias parameter and if it can be converted to f16
288   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
289     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
290     assert(HasSampler &&
291            "Only image instructions with a sampler can have a bias");
292     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
293       OnlyDerivatives = true;
294   }
295 
296   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
297                                                ImageDimIntr->CoordStart))
298     return std::nullopt;
299 
300   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
301                                : Type::getInt16Ty(II.getContext());
302 
303   return modifyIntrinsicCall(
304       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
305         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
306         if (!OnlyDerivatives) {
307           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
308 
309           // Change the bias type
310           if (ImageDimIntr->NumBiasArgs != 0)
311             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
312         }
313 
314         unsigned EndIndex =
315             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
316         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
317              OperandIndex < EndIndex; OperandIndex++) {
318           Args[OperandIndex] =
319               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
320         }
321 
322         // Convert the bias
323         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
324           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
325           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
326         }
327       });
328 }
329 
330 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
331                                            const Value *Op0, const Value *Op1,
332                                            InstCombiner &IC) const {
333   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
334   // infinity, gives +0.0. If we can prove we don't have one of the special
335   // cases then we can use a normal multiply instead.
336   // TODO: Create and use isKnownFiniteNonZero instead of just matching
337   // constants here.
338   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
339       match(Op1, PatternMatch::m_FiniteNonZero())) {
340     // One operand is not zero or infinity or NaN.
341     return true;
342   }
343 
344   SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
345   if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
346       isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
347     // Neither operand is infinity or NaN.
348     return true;
349   }
350   return false;
351 }
352 
353 /// Match an fpext from half to float, or a constant we can convert.
354 static Value *matchFPExtFromF16(Value *Arg) {
355   Value *Src = nullptr;
356   ConstantFP *CFP = nullptr;
357   if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
358     if (Src->getType()->isHalfTy())
359       return Src;
360   } else if (match(Arg, m_ConstantFP(CFP))) {
361     bool LosesInfo;
362     APFloat Val(CFP->getValueAPF());
363     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
364     if (!LosesInfo)
365       return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
366   }
367   return nullptr;
368 }
369 
370 // Trim all zero components from the end of the vector \p UseV and return
371 // the resulting mask of demanded elements.
372 static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
373                                        Instruction *I) {
374   auto *VTy = cast<FixedVectorType>(UseV->getType());
375   unsigned VWidth = VTy->getNumElements();
376   APInt DemandedElts = APInt::getAllOnes(VWidth);
377 
378   for (int i = VWidth - 1; i > 0; --i) {
379     auto *Elt = findScalarElement(UseV, i);
380     if (!Elt)
381       break;
382 
383     if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
384       if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
385         break;
386     } else {
387       break;
388     }
389 
390     DemandedElts.clearBit(i);
391   }
392 
393   return DemandedElts;
394 }
395 
396 // Trim elements from the end of the vector \p V if they are equal to the
397 // first element of the vector, returning the mask of demanded elements.
398 static APInt defaultComponentBroadcast(Value *V) {
399   auto *VTy = cast<FixedVectorType>(V->getType());
400   unsigned VWidth = VTy->getNumElements();
401   APInt DemandedElts = APInt::getAllOnes(VWidth);
402   Value *FirstComponent = findScalarElement(V, 0);
403 
404   SmallVector<int> ShuffleMask;
405   if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
406     SVI->getShuffleMask(ShuffleMask);
407 
408   for (int I = VWidth - 1; I > 0; --I) {
409     if (ShuffleMask.empty()) {
410       auto *Elt = findScalarElement(V, I);
411       if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
412         break;
413     } else {
414       // Detect identical elements in the shufflevector result, even though
415       // findScalarElement cannot tell us what that element is.
416       if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
417         break;
418     }
419     DemandedElts.clearBit(I);
420   }
421 
422   return DemandedElts;
423 }
424 
425 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
426                                                     IntrinsicInst &II,
427                                                     APInt DemandedElts,
428                                                     int DMaskIdx = -1,
429                                                     bool IsLoad = true);
430 
431 /// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt) to rsq.
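/// For f32 this requires either the approximate-functions flag or a requested
/// !fpmath accuracy of at least 1.0 ulp; for f16 it is always allowed.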
432 static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
433   return (SqrtOp->getType()->isFloatTy() &&
434           (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
435          SqrtOp->getType()->isHalfTy();
436 }
437 
438 /// Return true if we can easily prove that use U is uniform.
439 static bool isTriviallyUniform(const Use &U) {
440   Value *V = U.get();
441   if (isa<Constant>(V))
442     return true;
443   if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
444     if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
445       return false;
446     // If II and U are in different blocks then there is a possibility of
447     // temporal divergence.
448     return II->getParent() == cast<Instruction>(U.getUser())->getParent();
449   }
450   return false;
451 }
452 
453 std::optional<Instruction *>
454 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
455   Intrinsic::ID IID = II.getIntrinsicID();
456   switch (IID) {
457   case Intrinsic::amdgcn_rcp: {
458     Value *Src = II.getArgOperand(0);
459 
460     // TODO: Move to ConstantFolding/InstSimplify?
461     if (isa<UndefValue>(Src)) {
462       Type *Ty = II.getType();
463       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
464       return IC.replaceInstUsesWith(II, QNaN);
465     }
466 
467     if (II.isStrictFP())
468       break;
469 
470     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
471       const APFloat &ArgVal = C->getValueAPF();
472       APFloat Val(ArgVal.getSemantics(), 1);
473       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
474 
475       // This is more precise than the instruction may give.
476       //
477       // TODO: The instruction always flushes denormal results (except for f16),
478       // should this also?
479       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
480     }
481 
482     FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
483     if (!FMF.allowContract())
484       break;
485     auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
486     if (!SrcCI)
487       break;
488 
489     auto IID = SrcCI->getIntrinsicID();
490     // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
491     //
492     // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
493     // relaxed.
494     if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
495       const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
496       FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
497       if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
498         break;
499 
500       if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
501         break;
502 
503       Function *NewDecl = Intrinsic::getOrInsertDeclaration(
504           SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
505 
506       InnerFMF |= FMF;
507       II.setFastMathFlags(InnerFMF);
508 
509       II.setCalledFunction(NewDecl);
510       return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
511     }
512 
513     break;
514   }
515   case Intrinsic::amdgcn_sqrt:
516   case Intrinsic::amdgcn_rsq: {
517     Value *Src = II.getArgOperand(0);
518 
519     // TODO: Move to ConstantFolding/InstSimplify?
520     if (isa<UndefValue>(Src)) {
521       Type *Ty = II.getType();
522       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
523       return IC.replaceInstUsesWith(II, QNaN);
524     }
525 
526     // f16 amdgcn.sqrt is identical to regular sqrt.
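    // For example (illustrative):
    //   call half @llvm.amdgcn.sqrt.f16(half %x)
    //     -> call half @llvm.sqrt.f16(half %x)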
527     if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
528       Function *NewDecl = Intrinsic::getOrInsertDeclaration(
529           II.getModule(), Intrinsic::sqrt, {II.getType()});
530       II.setCalledFunction(NewDecl);
531       return &II;
532     }
533 
534     break;
535   }
536   case Intrinsic::amdgcn_log:
537   case Intrinsic::amdgcn_exp2: {
538     const bool IsLog = IID == Intrinsic::amdgcn_log;
539     const bool IsExp = IID == Intrinsic::amdgcn_exp2;
540     Value *Src = II.getArgOperand(0);
541     Type *Ty = II.getType();
542 
543     if (isa<PoisonValue>(Src))
544       return IC.replaceInstUsesWith(II, Src);
545 
546     if (IC.getSimplifyQuery().isUndefValue(Src))
547       return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
548 
549     if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
550       if (C->isInfinity()) {
551         // exp2(+inf) -> +inf
552         // log2(+inf) -> +inf
553         if (!C->isNegative())
554           return IC.replaceInstUsesWith(II, C);
555 
556         // exp2(-inf) -> 0
557         if (IsExp && C->isNegative())
558           return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
559       }
560 
561       if (II.isStrictFP())
562         break;
563 
564       if (C->isNaN()) {
565         Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
566         return IC.replaceInstUsesWith(II, Quieted);
567       }
568 
569       // f32 instruction doesn't handle denormals, f16 does.
570       if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
571         Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
572                                       : ConstantFP::get(Ty, 1.0);
573         return IC.replaceInstUsesWith(II, FoldedValue);
574       }
575 
576       if (IsLog && C->isNegative())
577         return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
578 
579       // TODO: Full constant folding matching hardware behavior.
580     }
581 
582     break;
583   }
584   case Intrinsic::amdgcn_frexp_mant:
585   case Intrinsic::amdgcn_frexp_exp: {
586     Value *Src = II.getArgOperand(0);
587     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
588       int Exp;
589       APFloat Significand =
590           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
591 
592       if (IID == Intrinsic::amdgcn_frexp_mant) {
593         return IC.replaceInstUsesWith(
594             II, ConstantFP::get(II.getContext(), Significand));
595       }
596 
597       // Match instruction special case behavior.
598       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
599         Exp = 0;
600 
601       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
602     }
603 
604     if (isa<UndefValue>(Src)) {
605       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
606     }
607 
608     break;
609   }
610   case Intrinsic::amdgcn_class: {
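    // With a constant test mask this is equivalent to the generic
    // llvm.is.fpclass intrinsic, e.g. (illustrative):
    //   call i1 @llvm.amdgcn.class.f32(float %x, i32 3)
    //     -> call i1 @llvm.is.fpclass.f32(float %x, i32 3)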
611     Value *Src0 = II.getArgOperand(0);
612     Value *Src1 = II.getArgOperand(1);
613     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
614     if (CMask) {
615       II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
616           II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
617 
618       // Clamp any excess bits, as they're illegal for the generic intrinsic.
619       II.setArgOperand(1, ConstantInt::get(Src1->getType(),
620                                            CMask->getZExtValue() & fcAllFlags));
621       return &II;
622     }
623 
624     // Propagate poison.
625     if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
626       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
627 
628     // llvm.amdgcn.class(_, undef) -> false
629     if (IC.getSimplifyQuery().isUndefValue(Src1))
630       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
631 
632     // llvm.amdgcn.class(undef, mask) -> mask != 0
633     if (IC.getSimplifyQuery().isUndefValue(Src0)) {
634       Value *CmpMask = IC.Builder.CreateICmpNE(
635           Src1, ConstantInt::getNullValue(Src1->getType()));
636       return IC.replaceInstUsesWith(II, CmpMask);
637     }
638     break;
639   }
640   case Intrinsic::amdgcn_cvt_pkrtz: {
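    // cvt_pkrtz converts two floats to a packed <2 x half> with round towards
    // zero. If each input is a constant or an fpext from half, the conversion
    // folds away, e.g. (illustrative):
    //   %e = fpext half %x to float
    //   call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %e, float 1.0)
    //     -> a <2 x half> built from %x and half 1.0 via insertelement.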
641     auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
642       Type *HalfTy = Type::getHalfTy(Arg->getContext());
643 
644       if (isa<PoisonValue>(Arg))
645         return PoisonValue::get(HalfTy);
646       if (isa<UndefValue>(Arg))
647         return UndefValue::get(HalfTy);
648 
649       ConstantFP *CFP = nullptr;
650       if (match(Arg, m_ConstantFP(CFP))) {
651         bool LosesInfo;
652         APFloat Val(CFP->getValueAPF());
653         Val.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
654         return ConstantFP::get(HalfTy, Val);
655       }
656 
657       Value *Src = nullptr;
658       if (match(Arg, m_FPExt(m_Value(Src)))) {
659         if (Src->getType()->isHalfTy())
660           return Src;
661       }
662 
663       return nullptr;
664     };
665 
666     if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
667       if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
668         Value *V = PoisonValue::get(II.getType());
669         V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
670         V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
671         return IC.replaceInstUsesWith(II, V);
672       }
673     }
674 
675     break;
676   }
677   case Intrinsic::amdgcn_cvt_pknorm_i16:
678   case Intrinsic::amdgcn_cvt_pknorm_u16:
679   case Intrinsic::amdgcn_cvt_pk_i16:
680   case Intrinsic::amdgcn_cvt_pk_u16: {
681     Value *Src0 = II.getArgOperand(0);
682     Value *Src1 = II.getArgOperand(1);
683 
684     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
685       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
686     }
687 
688     break;
689   }
690   case Intrinsic::amdgcn_ubfe:
691   case Intrinsic::amdgcn_sbfe: {
692     // Decompose simple cases into standard shifts.
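    // For example (illustrative), for i32 with constant offset/width:
    //   llvm.amdgcn.ubfe.i32(%x, 8, 8) -> lshr (shl %x, 16), 24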
693     Value *Src = II.getArgOperand(0);
694     if (isa<UndefValue>(Src)) {
695       return IC.replaceInstUsesWith(II, Src);
696     }
697 
698     unsigned Width;
699     Type *Ty = II.getType();
700     unsigned IntSize = Ty->getIntegerBitWidth();
701 
702     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
703     if (CWidth) {
704       Width = CWidth->getZExtValue();
705       if ((Width & (IntSize - 1)) == 0) {
706         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
707       }
708 
709       // Hardware ignores high bits, so remove those.
710       if (Width >= IntSize) {
711         return IC.replaceOperand(
712             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
713       }
714     }
715 
716     unsigned Offset;
717     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
718     if (COffset) {
719       Offset = COffset->getZExtValue();
720       if (Offset >= IntSize) {
721         return IC.replaceOperand(
722             II, 1,
723             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
724       }
725     }
726 
727     bool Signed = IID == Intrinsic::amdgcn_sbfe;
728 
729     if (!CWidth || !COffset)
730       break;
731 
732     // The case of Width == 0 is handled above, which makes this transformation
733     // safe. If Width == 0, the ashr and lshr below would produce poison, since
734     // the shift amount would equal the bit width.
735     assert(Width != 0);
736 
737     // TODO: This allows folding to undef when the hardware has specific
738     // behavior?
739     if (Offset + Width < IntSize) {
740       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
741       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
742                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
743       RightShift->takeName(&II);
744       return IC.replaceInstUsesWith(II, RightShift);
745     }
746 
747     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
748                                : IC.Builder.CreateLShr(Src, Offset);
749 
750     RightShift->takeName(&II);
751     return IC.replaceInstUsesWith(II, RightShift);
752   }
753   case Intrinsic::amdgcn_exp:
754   case Intrinsic::amdgcn_exp_row:
755   case Intrinsic::amdgcn_exp_compr: {
756     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
757     unsigned EnBits = En->getZExtValue();
758     if (EnBits == 0xf)
759       break; // All inputs enabled.
760 
761     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
762     bool Changed = false;
763     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
764       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
765           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
766         Value *Src = II.getArgOperand(I + 2);
767         if (!isa<UndefValue>(Src)) {
768           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
769           Changed = true;
770         }
771       }
772     }
773 
774     if (Changed) {
775       return &II;
776     }
777 
778     break;
779   }
780   case Intrinsic::amdgcn_fmed3: {
781     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
782     // for the shader.
783 
784     Value *Src0 = II.getArgOperand(0);
785     Value *Src1 = II.getArgOperand(1);
786     Value *Src2 = II.getArgOperand(2);
787 
788     // Checking for NaN before canonicalization provides better fidelity when
789     // mapping other operations onto fmed3 since the order of operands is
790     // unchanged.
791     Value *V = nullptr;
792     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
793       V = IC.Builder.CreateMinNum(Src1, Src2);
794     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
795       V = IC.Builder.CreateMinNum(Src0, Src2);
796     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
797       V = IC.Builder.CreateMaxNum(Src0, Src1);
798     }
799 
800     if (V) {
801       if (auto *CI = dyn_cast<CallInst>(V)) {
802         CI->copyFastMathFlags(&II);
803         CI->takeName(&II);
804       }
805       return IC.replaceInstUsesWith(II, V);
806     }
807 
808     bool Swap = false;
809     // Canonicalize constants to RHS operands.
810     //
811     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
812     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
813       std::swap(Src0, Src1);
814       Swap = true;
815     }
816 
817     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
818       std::swap(Src1, Src2);
819       Swap = true;
820     }
821 
822     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
823       std::swap(Src0, Src1);
824       Swap = true;
825     }
826 
827     if (Swap) {
828       II.setArgOperand(0, Src0);
829       II.setArgOperand(1, Src1);
830       II.setArgOperand(2, Src2);
831       return &II;
832     }
833 
834     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
835       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
836         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
837           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
838                                        C2->getValueAPF());
839           return IC.replaceInstUsesWith(
840               II, ConstantFP::get(IC.Builder.getContext(), Result));
841         }
842       }
843     }
844 
845     if (!ST->hasMed3_16())
846       break;
847 
848     // Repeat floating-point width reduction done for minnum/maxnum.
849     // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
850     if (Value *X = matchFPExtFromF16(Src0)) {
851       if (Value *Y = matchFPExtFromF16(Src1)) {
852         if (Value *Z = matchFPExtFromF16(Src2)) {
853           Value *NewCall = IC.Builder.CreateIntrinsic(
854               IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
855           return new FPExtInst(NewCall, II.getType());
856         }
857       }
858     }
859 
860     break;
861   }
862   case Intrinsic::amdgcn_icmp:
863   case Intrinsic::amdgcn_fcmp: {
864     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
865     // Guard against invalid arguments.
866     int64_t CCVal = CC->getZExtValue();
867     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
868     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
869                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
870         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
871                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
872       break;
873 
874     Value *Src0 = II.getArgOperand(0);
875     Value *Src1 = II.getArgOperand(1);
876 
877     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
878       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
879         Constant *CCmp = ConstantFoldCompareInstOperands(
880             (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
881         if (CCmp && CCmp->isNullValue()) {
882           return IC.replaceInstUsesWith(
883               II, IC.Builder.CreateSExt(CCmp, II.getType()));
884         }
885 
886         // The result of V_ICMP/V_FCMP assembly instructions (which this
887         // intrinsic exposes) is one bit per thread, masked with the EXEC
888         // register (which contains the bitmask of live threads). So a
889         // comparison that always returns true is the same as a read of the
890         // EXEC register.
891         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
892         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
893         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
894         CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
895                                                        II.getType(), Args);
896         NewCall->addFnAttr(Attribute::Convergent);
897         NewCall->takeName(&II);
898         return IC.replaceInstUsesWith(II, NewCall);
899       }
900 
901       // Canonicalize constants to RHS.
902       CmpInst::Predicate SwapPred =
903           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
904       II.setArgOperand(0, Src1);
905       II.setArgOperand(1, Src0);
906       II.setArgOperand(
907           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
908       return &II;
909     }
910 
911     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
912       break;
913 
914     // Canonicalize compare eq with true value to compare != 0
915     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
916     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
917     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
918     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
919     Value *ExtSrc;
920     if (CCVal == CmpInst::ICMP_EQ &&
921         ((match(Src1, PatternMatch::m_One()) &&
922           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
923          (match(Src1, PatternMatch::m_AllOnes()) &&
924           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
925         ExtSrc->getType()->isIntegerTy(1)) {
926       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
927       IC.replaceOperand(II, 2,
928                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
929       return &II;
930     }
931 
932     CmpInst::Predicate SrcPred;
933     Value *SrcLHS;
934     Value *SrcRHS;
935 
936     // Fold compare eq/ne with 0 from a compare result as the predicate to the
937     // intrinsic. The typical use is a wave vote function in the library, which
938     // will be fed from a user code condition compared with 0. Fold in the
939     // redundant compare.
940 
941     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
942     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
943     //
944     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
945     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
946     if (match(Src1, PatternMatch::m_Zero()) &&
947         match(Src0, PatternMatch::m_ZExtOrSExt(
948                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
949                               PatternMatch::m_Value(SrcRHS))))) {
950       if (CCVal == CmpInst::ICMP_EQ)
951         SrcPred = CmpInst::getInversePredicate(SrcPred);
952 
953       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
954                                  ? Intrinsic::amdgcn_fcmp
955                                  : Intrinsic::amdgcn_icmp;
956 
957       Type *Ty = SrcLHS->getType();
958       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
959         // Promote to next legal integer type.
960         unsigned Width = CmpType->getBitWidth();
961         unsigned NewWidth = Width;
962 
963         // Don't do anything for i1 comparisons.
964         if (Width == 1)
965           break;
966 
967         if (Width <= 16)
968           NewWidth = 16;
969         else if (Width <= 32)
970           NewWidth = 32;
971         else if (Width <= 64)
972           NewWidth = 64;
973         else
974           break; // Can't handle this.
975 
976         if (Width != NewWidth) {
977           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
978           if (CmpInst::isSigned(SrcPred)) {
979             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
980             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
981           } else {
982             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
983             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
984           }
985         }
986       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
987         break;
988 
989       Value *Args[] = {SrcLHS, SrcRHS,
990                        ConstantInt::get(CC->getType(), SrcPred)};
991       CallInst *NewCall = IC.Builder.CreateIntrinsic(
992           NewIID, {II.getType(), SrcLHS->getType()}, Args);
993       NewCall->takeName(&II);
994       return IC.replaceInstUsesWith(II, NewCall);
995     }
996 
997     break;
998   }
999   case Intrinsic::amdgcn_mbcnt_hi: {
1000     // exec_hi is all 0, so this is just a copy.
1001     if (ST->isWave32())
1002       return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1003     break;
1004   }
1005   case Intrinsic::amdgcn_ballot: {
1006     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1007       if (Src->isZero()) {
1008         // amdgcn.ballot(i1 0) is zero.
1009         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1010       }
1011     }
1012     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1013       // %b64 = call i64 ballot.i64(...)
1014       // =>
1015       // %b32 = call i32 ballot.i32(...)
1016       // %b64 = zext i32 %b32 to i64
1017       Value *Call = IC.Builder.CreateZExt(
1018           IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1019                                      {IC.Builder.getInt32Ty()},
1020                                      {II.getArgOperand(0)}),
1021           II.getType());
1022       Call->takeName(&II);
1023       return IC.replaceInstUsesWith(II, Call);
1024     }
1025     break;
1026   }
1027   case Intrinsic::amdgcn_wqm_vote: {
1028     // wqm_vote is identity when the argument is constant.
1029     if (!isa<Constant>(II.getArgOperand(0)))
1030       break;
1031 
1032     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1033   }
1034   case Intrinsic::amdgcn_kill: {
1035     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1036     if (!C || !C->getZExtValue())
1037       break;
1038 
1039     // amdgcn.kill(i1 1) is a no-op
1040     return IC.eraseInstFromFunction(II);
1041   }
1042   case Intrinsic::amdgcn_update_dpp: {
1043     Value *Old = II.getArgOperand(0);
1044 
1045     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1046     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1047     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1048     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1049         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1050       break;
1051 
1052     // If bound_ctrl = 1, row mask = bank mask = 0xf, we can omit old value.
1053     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1054   }
1055   case Intrinsic::amdgcn_permlane16:
1056   case Intrinsic::amdgcn_permlane16_var:
1057   case Intrinsic::amdgcn_permlanex16:
1058   case Intrinsic::amdgcn_permlanex16_var: {
1059     // Discard vdst_in if it's not going to be read.
1060     Value *VDstIn = II.getArgOperand(0);
1061     if (isa<UndefValue>(VDstIn))
1062       break;
1063 
1064     // FetchInvalid operand idx.
1065     unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1066                           IID == Intrinsic::amdgcn_permlanex16)
1067                              ? 4  /* for permlane16 and permlanex16 */
1068                              : 3; /* for permlane16_var and permlanex16_var */
1069 
1070     // BoundCtrl operand idx.
1071     // For permlane16 and permlanex16 it should be 5
1072     // For Permlane16_var and permlanex16_var it should be 4
1073     unsigned int BcIdx = FiIdx + 1;
1074 
1075     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1076     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1077     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1078       break;
1079 
1080     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1081   }
1082   case Intrinsic::amdgcn_permlane64:
1083   case Intrinsic::amdgcn_readfirstlane:
1084   case Intrinsic::amdgcn_readlane: {
1085     // If the first argument is uniform these intrinsics return it unchanged.
1086     const Use &Src = II.getArgOperandUse(0);
1087     if (isTriviallyUniform(Src))
1088       return IC.replaceInstUsesWith(II, Src.get());
1089     break;
1090   }
1091   case Intrinsic::amdgcn_trig_preop: {
1092     // The intrinsic is declared with name mangling, but currently the
1093     // instruction only exists for f64
1094     if (!II.getType()->isDoubleTy())
1095       break;
1096 
1097     Value *Src = II.getArgOperand(0);
1098     Value *Segment = II.getArgOperand(1);
1099     if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1100       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1101 
1102     if (isa<UndefValue>(Src)) {
1103       auto *QNaN = ConstantFP::get(
1104           II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1105       return IC.replaceInstUsesWith(II, QNaN);
1106     }
1107 
1108     const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1109     if (!Csrc)
1110       break;
1111 
1112     if (II.isStrictFP())
1113       break;
1114 
1115     const APFloat &Fsrc = Csrc->getValueAPF();
1116     if (Fsrc.isNaN()) {
1117       auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1118       return IC.replaceInstUsesWith(II, Quieted);
1119     }
1120 
1121     const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1122     if (!Cseg)
1123       break;
1124 
1125     unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1126     unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1127     unsigned Shift = SegmentVal * 53;
1128     if (Exponent > 1077)
1129       Shift += Exponent - 1077;
1130 
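    // Constant fold by mirroring the hardware: select a 53-bit window of the
    // 2.0/PI bit pattern based on the segment (shifted further for sources
    // with a large exponent) and scale it into the right binade. This is the
    // building block for Payne-Hanek style argument reduction.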
1131     // 2.0/PI table.
1132     static const uint32_t TwoByPi[] = {
1133         0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1134         0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1135         0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1136         0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1137         0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1138         0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1139         0x56033046};
1140 
1141     // Return 0 for an out-of-bounds segment (hardware behavior).
1142     unsigned Idx = Shift >> 5;
1143     if (Idx + 2 >= std::size(TwoByPi)) {
1144       APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1145       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1146     }
1147 
1148     unsigned BShift = Shift & 0x1f;
1149     uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1150     uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1151     if (BShift)
1152       Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1153     Thi = Thi >> 11;
1154     APFloat Result = APFloat((double)Thi);
1155 
1156     int Scale = -53 - Shift;
1157     if (Exponent >= 1968)
1158       Scale += 128;
1159 
1160     Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1161     return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1162   }
1163   case Intrinsic::amdgcn_fmul_legacy: {
1164     Value *Op0 = II.getArgOperand(0);
1165     Value *Op1 = II.getArgOperand(1);
1166 
1167     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1168     // infinity, gives +0.0.
1169     // TODO: Move to InstSimplify?
1170     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1171         match(Op1, PatternMatch::m_AnyZeroFP()))
1172       return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1173 
1174     // If we can prove we don't have one of the special cases then we can use a
1175     // normal fmul instruction instead.
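    // For example (illustrative): llvm.amdgcn.fmul.legacy(float %x, float 2.0)
    // can become a plain fmul, since 2.0 is known to be finite and non-zero.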
1176     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1177       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1178       FMul->takeName(&II);
1179       return IC.replaceInstUsesWith(II, FMul);
1180     }
1181     break;
1182   }
1183   case Intrinsic::amdgcn_fma_legacy: {
1184     Value *Op0 = II.getArgOperand(0);
1185     Value *Op1 = II.getArgOperand(1);
1186     Value *Op2 = II.getArgOperand(2);
1187 
1188     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1189     // infinity, gives +0.0.
1190     // TODO: Move to InstSimplify?
1191     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1192         match(Op1, PatternMatch::m_AnyZeroFP())) {
1193       // It's tempting to just return Op2 here, but that would give the wrong
1194       // result if Op2 was -0.0.
1195       auto *Zero = ConstantFP::getZero(II.getType());
1196       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1197       FAdd->takeName(&II);
1198       return IC.replaceInstUsesWith(II, FAdd);
1199     }
1200 
1201     // If we can prove we don't have one of the special cases then we can use a
1202     // normal fma instead.
1203     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1204       II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1205           II.getModule(), Intrinsic::fma, II.getType()));
1206       return &II;
1207     }
1208     break;
1209   }
1210   case Intrinsic::amdgcn_is_shared:
1211   case Intrinsic::amdgcn_is_private: {
1212     if (isa<UndefValue>(II.getArgOperand(0)))
1213       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1214 
1215     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1216       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1217     break;
1218   }
1219   case Intrinsic::amdgcn_raw_buffer_store_format:
1220   case Intrinsic::amdgcn_struct_buffer_store_format:
1221   case Intrinsic::amdgcn_raw_tbuffer_store:
1222   case Intrinsic::amdgcn_struct_tbuffer_store:
1223   case Intrinsic::amdgcn_image_store_1d:
1224   case Intrinsic::amdgcn_image_store_1darray:
1225   case Intrinsic::amdgcn_image_store_2d:
1226   case Intrinsic::amdgcn_image_store_2darray:
1227   case Intrinsic::amdgcn_image_store_2darraymsaa:
1228   case Intrinsic::amdgcn_image_store_2dmsaa:
1229   case Intrinsic::amdgcn_image_store_3d:
1230   case Intrinsic::amdgcn_image_store_cube:
1231   case Intrinsic::amdgcn_image_store_mip_1d:
1232   case Intrinsic::amdgcn_image_store_mip_1darray:
1233   case Intrinsic::amdgcn_image_store_mip_2d:
1234   case Intrinsic::amdgcn_image_store_mip_2darray:
1235   case Intrinsic::amdgcn_image_store_mip_3d:
1236   case Intrinsic::amdgcn_image_store_mip_cube: {
1237     if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1238       break;
1239 
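    // For example (illustrative), on a subtarget with DefaultComponentZero: an
    // image store of <4 x float> <%x, %y, 0.0, 0.0> with dmask 0xf can be
    // narrowed to a <2 x float> store with dmask 0x3, since on such targets the
    // components the dmask leaves out are stored as zero anyway.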
1240     APInt DemandedElts;
1241     if (ST->hasDefaultComponentBroadcast())
1242       DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1243     else if (ST->hasDefaultComponentZero())
1244       DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1245     else
1246       break;
1247 
1248     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1249     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1250                                               false)) {
1251       return IC.eraseInstFromFunction(II);
1252     }
1253 
1254     break;
1255   }
1256   }
1257   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1258             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1259     return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1260   }
1261   return std::nullopt;
1262 }
1263 
1264 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1265 ///
1266 /// For image and buffer store intrinsics, simplification rewrites the stored
1267 /// vector operand (a definition), rather than the uses of the result as is
1268 /// done for image and buffer loads.
1269 /// Note: This only supports non-TFE/LWE image intrinsic calls; the TFE/LWE
1270 ///       variants have struct returns.
1271 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1272                                                     IntrinsicInst &II,
1273                                                     APInt DemandedElts,
1274                                                     int DMaskIdx, bool IsLoad) {
1275 
1276   auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1277                                              : II.getOperand(0)->getType());
1278   unsigned VWidth = IIVTy->getNumElements();
1279   if (VWidth == 1)
1280     return nullptr;
1281   Type *EltTy = IIVTy->getElementType();
1282 
1283   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1284   IC.Builder.SetInsertPoint(&II);
1285 
1286   // Assume the arguments are unchanged and later override them, if needed.
1287   SmallVector<Value *, 16> Args(II.args());
1288 
1289   if (DMaskIdx < 0) {
1290     // Buffer case.
1291 
1292     const unsigned ActiveBits = DemandedElts.getActiveBits();
1293     const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1294 
1295     // Start by assuming the whole prefix up to the highest demanded element
1296     // is needed. If the low components are unused (trailing zero bits in the
1297     // mask), clear their bits and fold them into the buffer offset instead.
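    // For example (illustrative): a raw buffer load of <4 x float> where only
    // elements 2 and 3 are used becomes a <2 x float> load with the byte
    // offset increased by 8.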
1298     DemandedElts = (1 << ActiveBits) - 1;
1299 
1300     if (UnusedComponentsAtFront > 0) {
1301       static const unsigned InvalidOffsetIdx = 0xf;
1302 
1303       unsigned OffsetIdx;
1304       switch (II.getIntrinsicID()) {
1305       case Intrinsic::amdgcn_raw_buffer_load:
1306       case Intrinsic::amdgcn_raw_ptr_buffer_load:
1307         OffsetIdx = 1;
1308         break;
1309       case Intrinsic::amdgcn_s_buffer_load:
1310         // If resulting type is vec3, there is no point in trimming the
1311         // load with updated offset, as the vec3 would most likely be widened to
1312         // vec4 anyway during lowering.
1313         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1314           OffsetIdx = InvalidOffsetIdx;
1315         else
1316           OffsetIdx = 1;
1317         break;
1318       case Intrinsic::amdgcn_struct_buffer_load:
1319       case Intrinsic::amdgcn_struct_ptr_buffer_load:
1320         OffsetIdx = 2;
1321         break;
1322       default:
1323         // TODO: handle tbuffer* intrinsics.
1324         OffsetIdx = InvalidOffsetIdx;
1325         break;
1326       }
1327 
1328       if (OffsetIdx != InvalidOffsetIdx) {
1329         // Clear demanded bits and update the offset.
1330         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1331         auto *Offset = Args[OffsetIdx];
1332         unsigned SingleComponentSizeInBits =
1333             IC.getDataLayout().getTypeSizeInBits(EltTy);
1334         unsigned OffsetAdd =
1335             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1336         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1337         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1338       }
1339     }
1340   } else {
1341     // Image case.
1342 
1343     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1344     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1345 
1346     // dmask 0 has special semantics, do not simplify.
1347     if (DMaskVal == 0)
1348       return nullptr;
1349 
1350     // Mask off values that are undefined because the dmask doesn't cover them
1351     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1352 
1353     unsigned NewDMaskVal = 0;
1354     unsigned OrigLdStIdx = 0;
1355     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1356       const unsigned Bit = 1 << SrcIdx;
1357       if (!!(DMaskVal & Bit)) {
1358         if (!!DemandedElts[OrigLdStIdx])
1359           NewDMaskVal |= Bit;
1360         OrigLdStIdx++;
1361       }
1362     }
1363 
1364     if (DMaskVal != NewDMaskVal)
1365       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1366   }
1367 
1368   unsigned NewNumElts = DemandedElts.popcount();
1369   if (!NewNumElts)
1370     return PoisonValue::get(IIVTy);
1371 
1372   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1373     if (DMaskIdx >= 0)
1374       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1375     return nullptr;
1376   }
1377 
1378   // Validate function argument and return types, extracting overloaded types
1379   // along the way.
1380   SmallVector<Type *, 6> OverloadTys;
1381   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1382     return nullptr;
1383 
1384   Type *NewTy =
1385       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1386   OverloadTys[0] = NewTy;
1387 
1388   if (!IsLoad) {
1389     SmallVector<int, 8> EltMask;
1390     for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1391       if (DemandedElts[OrigStoreIdx])
1392         EltMask.push_back(OrigStoreIdx);
1393 
1394     if (NewNumElts == 1)
1395       Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1396     else
1397       Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1398   }
1399 
1400   CallInst *NewCall =
1401       IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
1402   NewCall->takeName(&II);
1403   NewCall->copyMetadata(II);
1404 
1405   if (IsLoad) {
1406     if (NewNumElts == 1) {
1407       return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1408                                             DemandedElts.countr_zero());
1409     }
1410 
1411     SmallVector<int, 8> EltMask;
1412     unsigned NewLoadIdx = 0;
1413     for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1414       if (!!DemandedElts[OrigLoadIdx])
1415         EltMask.push_back(NewLoadIdx++);
1416       else
1417         EltMask.push_back(NewNumElts);
1418     }
1419 
1420     auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1421 
1422     return Shuffle;
1423   }
1424 
1425   return NewCall;
1426 }
1427 
1428 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1429     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1430     APInt &UndefElts2, APInt &UndefElts3,
1431     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1432         SimplifyAndSetOp) const {
1433   switch (II.getIntrinsicID()) {
1434   case Intrinsic::amdgcn_raw_buffer_load:
1435   case Intrinsic::amdgcn_raw_ptr_buffer_load:
1436   case Intrinsic::amdgcn_raw_buffer_load_format:
1437   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1438   case Intrinsic::amdgcn_raw_tbuffer_load:
1439   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1440   case Intrinsic::amdgcn_s_buffer_load:
1441   case Intrinsic::amdgcn_struct_buffer_load:
1442   case Intrinsic::amdgcn_struct_ptr_buffer_load:
1443   case Intrinsic::amdgcn_struct_buffer_load_format:
1444   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1445   case Intrinsic::amdgcn_struct_tbuffer_load:
1446   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1447     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1448   default: {
1449     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1450       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1451     }
1452     break;
1453   }
1454   }
1455   return std::nullopt;
1456 }
1457