//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific hooks of the InstCombine pass, as
// exposed through TargetTransformInfo. It uses the target's detailed
// information to fold and simplify calls to AMDGPU intrinsics, while letting
// the target-independent InstCombine transforms handle the rest.
//
//===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 #include <optional>
24 
25 using namespace llvm;
26 using namespace llvm::PatternMatch;
27 
28 #define DEBUG_TYPE "AMDGPUtti"
29 
30 namespace {
31 
32 struct AMDGPUImageDMaskIntrinsic {
33   unsigned Intr;
34 };
35 
36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37 #include "InstCombineTables.inc"
38 
39 } // end anonymous namespace
40 
41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42 //
43 // A single NaN input is folded to minnum, so we rely on that folding for
44 // handling NaNs.
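//
// For example, fmed3(1.0, 5.0, 3.0) constant-folds to 3.0: the three-way
// maximum is Src1, so the result is maxnum(Src0, Src2).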
45 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46                            const APFloat &Src2) {
47   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
48 
49   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
50   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51   if (Cmp0 == APFloat::cmpEqual)
52     return maxnum(Src1, Src2);
53 
54   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
55   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56   if (Cmp1 == APFloat::cmpEqual)
57     return maxnum(Src0, Src2);
58 
59   return maxnum(Src0, Src1);
60 }
61 
62 // Check if a value can be converted to a 16-bit value without losing
63 // precision.
64 // The value is expected to be either a float (IsFloat = true) or an unsigned
65 // integer (IsFloat = false).
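//
// For example, a float constant such as 2.0 that round-trips through half is
// safe, as is a value defined by 'fpext half %x to float' (or a zext of an
// i16), where %x is a hypothetical 16-bit source.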
66 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67   Type *VTy = V.getType();
68   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
69     // The value is already 16-bit, so we don't want to convert to 16-bit again!
70     return false;
71   }
72   if (IsFloat) {
73     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
74       // We need to check that if we cast the index down to a half, we do not
75       // lose precision.
76       APFloat FloatValue(ConstFloat->getValueAPF());
77       bool LosesInfo = true;
78       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
79                          &LosesInfo);
80       return !LosesInfo;
81     }
82   } else {
83     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
84       // We need to check that if we cast the index down to an i16, we do not
85       // lose precision.
86       APInt IntValue(ConstInt->getValue());
87       return IntValue.getActiveBits() <= 16;
88     }
89   }
90 
91   Value *CastSrc;
92   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
93                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
94   if (IsExt) {
95     Type *CastSrcTy = CastSrc->getType();
96     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
97       return true;
98   }
99 
100   return false;
101 }
102 
103 // Convert a value to 16-bit.
104 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105   Type *VTy = V.getType();
106   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
107     return cast<Instruction>(&V)->getOperand(0);
108   if (VTy->isIntegerTy())
109     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
110   if (VTy->isFloatingPointTy())
111     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
112 
113   llvm_unreachable("Should never be called!");
114 }
115 
/// Applies Func to OldIntr's argument list and overloaded types, creates a
/// call to the intrinsic \p NewIntr with the modified arguments, and replaces
/// \p InstToReplace with the newly created call.
119 static std::optional<Instruction *> modifyIntrinsicCall(
120     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121     InstCombiner &IC,
122     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123         Func) {
124   SmallVector<Type *, 4> ArgTys;
125   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
126     return std::nullopt;
127 
128   SmallVector<Value *, 8> Args(OldIntr.args());
129 
130   // Modify arguments and types
131   Func(Args, ArgTys);
132 
133   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
134 
135   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
136   NewCall->takeName(&OldIntr);
137   NewCall->copyMetadata(OldIntr);
138   if (isa<FPMathOperator>(NewCall))
139     NewCall->copyFastMathFlags(&OldIntr);
140 
141   // Erase and replace uses
142   if (!InstToReplace.getType()->isVoidTy())
143     IC.replaceInstUsesWith(InstToReplace, NewCall);
144 
145   bool RemoveOldIntr = &OldIntr != &InstToReplace;
146 
147   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
148   if (RemoveOldIntr)
149     IC.eraseInstFromFunction(OldIntr);
150 
151   return RetValue;
152 }
153 
154 static std::optional<Instruction *>
155 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157                              IntrinsicInst &II, InstCombiner &IC) {
158   // Optimize _L to _LZ when _L is zero
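  // As an illustrative example (operand lists elided), a call such as
  //   llvm.amdgcn.image.sample.l.2d(..., float 0.0, ...)
  // is rebuilt as the corresponding llvm.amdgcn.image.sample.lz.2d call with
  // the lod operand dropped.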
159   if (const auto *LZMappingInfo =
160           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
161     if (auto *ConstantLod =
162             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
163       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
166                                                      ImageDimIntr->Dim);
167         return modifyIntrinsicCall(
168             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
169               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170             });
171       }
172     }
173   }
174 
  // Optimize _mip away when 'lod' is zero
176   if (const auto *MIPMappingInfo =
177           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
178     if (auto *ConstantMip =
179             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
180       if (ConstantMip->isZero()) {
181         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
183                                                      ImageDimIntr->Dim);
184         return modifyIntrinsicCall(
185             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
186               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187             });
188       }
189     }
190   }
191 
192   // Optimize _bias away when 'bias' is zero
193   if (const auto *BiasMappingInfo =
194           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
195     if (auto *ConstantBias =
196             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
197       if (ConstantBias->isZero()) {
198         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
200                                                      ImageDimIntr->Dim);
201         return modifyIntrinsicCall(
202             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
203               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205             });
206       }
207     }
208   }
209 
210   // Optimize _offset away when 'offset' is zero
211   if (const auto *OffsetMappingInfo =
212           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
213     if (auto *ConstantOffset =
214             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
215       if (ConstantOffset->isZero()) {
216         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217             AMDGPU::getImageDimIntrinsicByBaseOpcode(
218                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
219         return modifyIntrinsicCall(
220             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
221               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222             });
223       }
224     }
225   }
226 
227   // Try to use D16
228   if (ST->hasD16Images()) {
229 
230     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
232 
233     if (BaseOpcode->HasD16) {
234 
      // If the only use of the image intrinsic is an fptrunc to half, replace
      // both the fptrunc and the image intrinsic with an image intrinsic that
      // has the D16 flag set.
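      //
      // A hypothetical pattern:
      //   %v = call <4 x float> @llvm.amdgcn.image.sample...(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // collapses into a single image sample that returns <4 x half>.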
238       if (II.hasOneUse()) {
239         Instruction *User = II.user_back();
240 
241         if (User->getOpcode() == Instruction::FPTrunc &&
242             User->getType()->getScalarType()->isHalfTy()) {
243 
244           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
245                                      [&](auto &Args, auto &ArgTys) {
246                                        // Change return type of image intrinsic.
247                                        // Set it to return type of fptrunc.
248                                        ArgTys[0] = User->getType();
249                                      });
250         }
251       }
252     }
253   }
254 
255   // Try to use A16 or G16
256   if (!ST->hasA16() && !ST->hasG16())
257     return std::nullopt;
258 
259   // Address is interpreted as float if the instruction has a sampler or as
260   // unsigned int if there is no sampler.
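  //
  // For example, when all derivative and coordinate operands (and any bias)
  // can be narrowed, the call is rebuilt with 16-bit address operands (A16);
  // when only the derivatives qualify, just the gradients are narrowed (G16).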
261   bool HasSampler =
262       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
263   bool FloatCoord = false;
  // True means the derivatives can be converted to 16 bit but the coordinates
  // cannot.
265   bool OnlyDerivatives = false;
266 
267   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269     Value *Coord = II.getOperand(OperandIndex);
270     // If the values are not derived from 16-bit values, we cannot optimize.
271     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
272       if (OperandIndex < ImageDimIntr->CoordStart ||
273           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274         return std::nullopt;
275       }
276       // All gradients can be converted, so convert only them
277       OnlyDerivatives = true;
278       break;
279     }
280 
281     assert(OperandIndex == ImageDimIntr->GradientStart ||
282            FloatCoord == Coord->getType()->isFloatingPointTy());
283     FloatCoord = Coord->getType()->isFloatingPointTy();
284   }
285 
286   if (!OnlyDerivatives && !ST->hasA16())
287     OnlyDerivatives = true; // Only supports G16
288 
289   // Check if there is a bias parameter and if it can be converted to f16
290   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
292     assert(HasSampler &&
293            "Only image instructions with a sampler can have a bias");
294     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
295       OnlyDerivatives = true;
296   }
297 
298   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299                                                ImageDimIntr->CoordStart))
300     return std::nullopt;
301 
302   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
303                                : Type::getInt16Ty(II.getContext());
304 
305   return modifyIntrinsicCall(
306       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
307         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308         if (!OnlyDerivatives) {
309           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310 
311           // Change the bias type
312           if (ImageDimIntr->NumBiasArgs != 0)
313             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
314         }
315 
316         unsigned EndIndex =
317             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319              OperandIndex < EndIndex; OperandIndex++) {
320           Args[OperandIndex] =
321               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
322         }
323 
324         // Convert the bias
325         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
327           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
328         }
329       });
330 }
331 
332 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333                                            const Value *Op0, const Value *Op1,
334                                            InstCombiner &IC) const {
335   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336   // infinity, gives +0.0. If we can prove we don't have one of the special
337   // cases then we can use a normal multiply instead.
338   // TODO: Create and use isKnownFiniteNonZero instead of just matching
339   // constants here.
340   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
341       match(Op1, PatternMatch::m_FiniteNonZero())) {
342     // One operand is not zero or infinity or NaN.
343     return true;
344   }
345 
346   SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
347   if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
348       isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
349     // Neither operand is infinity or NaN.
350     return true;
351   }
352   return false;
353 }
354 
355 /// Match an fpext from half to float, or a constant we can convert.
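///
/// For example, this returns %x for a hypothetical single-use
/// 'fpext half %x to float', and returns an equivalent half constant for a
/// float constant such as 1.5 that is exactly representable in half.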
356 static Value *matchFPExtFromF16(Value *Arg) {
357   Value *Src = nullptr;
358   ConstantFP *CFP = nullptr;
359   if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
360     if (Src->getType()->isHalfTy())
361       return Src;
362   } else if (match(Arg, m_ConstantFP(CFP))) {
363     bool LosesInfo;
364     APFloat Val(CFP->getValueAPF());
365     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
366     if (!LosesInfo)
367       return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
368   }
369   return nullptr;
370 }
371 
// Trim all zero components from the end of the vector \p UseV and return the
// resulting mask of demanded elements.
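//
// For example, for a store value of <4 x float> whose last two components are
// known to be zero (or undef), the returned mask is 0b0011, so only the first
// two components remain demanded.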
374 static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
375                                        Instruction *I) {
376   auto *VTy = cast<FixedVectorType>(UseV->getType());
377   unsigned VWidth = VTy->getNumElements();
378   APInt DemandedElts = APInt::getAllOnes(VWidth);
379 
380   for (int i = VWidth - 1; i > 0; --i) {
381     auto *Elt = findScalarElement(UseV, i);
382     if (!Elt)
383       break;
384 
385     if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
386       if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
387         break;
388     } else {
389       break;
390     }
391 
392     DemandedElts.clearBit(i);
393   }
394 
395   return DemandedElts;
396 }
397 
// Trim elements from the end of the vector \p V if they are equal to the
// first element of the vector.
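//
// For example, a <4 x float> splat of its first element (e.g. built by a
// shufflevector with an all-zero mask) yields the demanded-elements mask
// 0b0001.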
400 static APInt defaultComponentBroadcast(Value *V) {
401   auto *VTy = cast<FixedVectorType>(V->getType());
402   unsigned VWidth = VTy->getNumElements();
403   APInt DemandedElts = APInt::getAllOnes(VWidth);
404   Value *FirstComponent = findScalarElement(V, 0);
405 
406   SmallVector<int> ShuffleMask;
407   if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
408     SVI->getShuffleMask(ShuffleMask);
409 
410   for (int I = VWidth - 1; I > 0; --I) {
411     if (ShuffleMask.empty()) {
412       auto *Elt = findScalarElement(V, I);
413       if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
414         break;
415     } else {
416       // Detect identical elements in the shufflevector result, even though
417       // findScalarElement cannot tell us what that element is.
418       if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
419         break;
420     }
421     DemandedElts.clearBit(I);
422   }
423 
424   return DemandedElts;
425 }
426 
427 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
428                                                     IntrinsicInst &II,
429                                                     APInt DemandedElts,
430                                                     int DMaskIdx = -1,
431                                                     bool IsLoad = true);
432 
/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt) into
/// llvm.amdgcn.rsq.
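///
/// This holds for f16 sqrt unconditionally, and for f32 sqrt only when it is
/// marked afn or its !fpmath metadata allows an error of at least 1.0 ulp.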
434 static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
435   return (SqrtOp->getType()->isFloatTy() &&
436           (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
437          SqrtOp->getType()->isHalfTy();
438 }
439 
440 /// Return true if we can easily prove that use U is uniform.
441 static bool isTriviallyUniform(const Use &U) {
442   Value *V = U.get();
443   if (isa<Constant>(V))
444     return true;
445   if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
446     if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
447       return false;
448     // If II and U are in different blocks then there is a possibility of
449     // temporal divergence.
450     return II->getParent() == cast<Instruction>(U.getUser())->getParent();
451   }
452   return false;
453 }
454 
455 std::optional<Instruction *>
456 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
457   Intrinsic::ID IID = II.getIntrinsicID();
458   switch (IID) {
459   case Intrinsic::amdgcn_rcp: {
460     Value *Src = II.getArgOperand(0);
461 
462     // TODO: Move to ConstantFolding/InstSimplify?
463     if (isa<UndefValue>(Src)) {
464       Type *Ty = II.getType();
465       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
466       return IC.replaceInstUsesWith(II, QNaN);
467     }
468 
469     if (II.isStrictFP())
470       break;
471 
472     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
473       const APFloat &ArgVal = C->getValueAPF();
474       APFloat Val(ArgVal.getSemantics(), 1);
475       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
476 
477       // This is more precise than the instruction may give.
478       //
479       // TODO: The instruction always flushes denormal results (except for f16),
480       // should this also?
481       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
482     }
483 
484     FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
485     if (!FMF.allowContract())
486       break;
487     auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
488     if (!SrcCI)
489       break;
490 
491     auto IID = SrcCI->getIntrinsicID();
492     // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
493     //
494     // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
495     // relaxed.
496     if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
497       const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
498       FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
499       if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
500         break;
501 
502       if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
503         break;
504 
505       Function *NewDecl = Intrinsic::getDeclaration(
506           SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
507 
508       InnerFMF |= FMF;
509       II.setFastMathFlags(InnerFMF);
510 
511       II.setCalledFunction(NewDecl);
512       return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
513     }
514 
515     break;
516   }
517   case Intrinsic::amdgcn_sqrt:
518   case Intrinsic::amdgcn_rsq: {
519     Value *Src = II.getArgOperand(0);
520 
521     // TODO: Move to ConstantFolding/InstSimplify?
522     if (isa<UndefValue>(Src)) {
523       Type *Ty = II.getType();
524       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
525       return IC.replaceInstUsesWith(II, QNaN);
526     }
527 
528     // f16 amdgcn.sqrt is identical to regular sqrt.
529     if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
530       Function *NewDecl = Intrinsic::getDeclaration(
531           II.getModule(), Intrinsic::sqrt, {II.getType()});
532       II.setCalledFunction(NewDecl);
533       return &II;
534     }
535 
536     break;
537   }
538   case Intrinsic::amdgcn_log:
539   case Intrinsic::amdgcn_exp2: {
540     const bool IsLog = IID == Intrinsic::amdgcn_log;
541     const bool IsExp = IID == Intrinsic::amdgcn_exp2;
542     Value *Src = II.getArgOperand(0);
543     Type *Ty = II.getType();
544 
545     if (isa<PoisonValue>(Src))
546       return IC.replaceInstUsesWith(II, Src);
547 
548     if (IC.getSimplifyQuery().isUndefValue(Src))
549       return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
550 
551     if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
552       if (C->isInfinity()) {
553         // exp2(+inf) -> +inf
554         // log2(+inf) -> +inf
555         if (!C->isNegative())
556           return IC.replaceInstUsesWith(II, C);
557 
558         // exp2(-inf) -> 0
559         if (IsExp && C->isNegative())
560           return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
561       }
562 
563       if (II.isStrictFP())
564         break;
565 
566       if (C->isNaN()) {
567         Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
568         return IC.replaceInstUsesWith(II, Quieted);
569       }
570 
571       // f32 instruction doesn't handle denormals, f16 does.
572       if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
573         Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
574                                       : ConstantFP::get(Ty, 1.0);
575         return IC.replaceInstUsesWith(II, FoldedValue);
576       }
577 
578       if (IsLog && C->isNegative())
579         return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
580 
581       // TODO: Full constant folding matching hardware behavior.
582     }
583 
584     break;
585   }
586   case Intrinsic::amdgcn_frexp_mant:
587   case Intrinsic::amdgcn_frexp_exp: {
588     Value *Src = II.getArgOperand(0);
589     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
590       int Exp;
591       APFloat Significand =
592           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
593 
594       if (IID == Intrinsic::amdgcn_frexp_mant) {
595         return IC.replaceInstUsesWith(
596             II, ConstantFP::get(II.getContext(), Significand));
597       }
598 
599       // Match instruction special case behavior.
600       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
601         Exp = 0;
602 
603       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
604     }
605 
606     if (isa<UndefValue>(Src)) {
607       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
608     }
609 
610     break;
611   }
612   case Intrinsic::amdgcn_class: {
613     Value *Src0 = II.getArgOperand(0);
614     Value *Src1 = II.getArgOperand(1);
615     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
616     if (CMask) {
617       II.setCalledOperand(Intrinsic::getDeclaration(
618           II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
619 
620       // Clamp any excess bits, as they're illegal for the generic intrinsic.
621       II.setArgOperand(1, ConstantInt::get(Src1->getType(),
622                                            CMask->getZExtValue() & fcAllFlags));
623       return &II;
624     }
625 
626     // Propagate poison.
627     if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
628       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
629 
630     // llvm.amdgcn.class(_, undef) -> false
631     if (IC.getSimplifyQuery().isUndefValue(Src1))
632       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
633 
634     // llvm.amdgcn.class(undef, mask) -> mask != 0
635     if (IC.getSimplifyQuery().isUndefValue(Src0)) {
636       Value *CmpMask = IC.Builder.CreateICmpNE(
637           Src1, ConstantInt::getNullValue(Src1->getType()));
638       return IC.replaceInstUsesWith(II, CmpMask);
639     }
640     break;
641   }
642   case Intrinsic::amdgcn_cvt_pkrtz: {
643     Value *Src0 = II.getArgOperand(0);
644     Value *Src1 = II.getArgOperand(1);
645     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
646       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
647         const fltSemantics &HalfSem =
648             II.getType()->getScalarType()->getFltSemantics();
649         bool LosesInfo;
650         APFloat Val0 = C0->getValueAPF();
651         APFloat Val1 = C1->getValueAPF();
652         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
653         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
654 
655         Constant *Folded =
656             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
657                                  ConstantFP::get(II.getContext(), Val1)});
658         return IC.replaceInstUsesWith(II, Folded);
659       }
660     }
661 
662     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
663       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
664     }
665 
666     break;
667   }
668   case Intrinsic::amdgcn_cvt_pknorm_i16:
669   case Intrinsic::amdgcn_cvt_pknorm_u16:
670   case Intrinsic::amdgcn_cvt_pk_i16:
671   case Intrinsic::amdgcn_cvt_pk_u16: {
672     Value *Src0 = II.getArgOperand(0);
673     Value *Src1 = II.getArgOperand(1);
674 
675     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
676       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
677     }
678 
679     break;
680   }
681   case Intrinsic::amdgcn_ubfe:
682   case Intrinsic::amdgcn_sbfe: {
683     // Decompose simple cases into standard shifts.
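    // For example, a hypothetical i32 'ubfe(%x, 8, 8)' with constant operands
    // becomes 'lshr (shl %x, 16), 24', extracting bits 8..15.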
684     Value *Src = II.getArgOperand(0);
685     if (isa<UndefValue>(Src)) {
686       return IC.replaceInstUsesWith(II, Src);
687     }
688 
689     unsigned Width;
690     Type *Ty = II.getType();
691     unsigned IntSize = Ty->getIntegerBitWidth();
692 
693     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
694     if (CWidth) {
695       Width = CWidth->getZExtValue();
696       if ((Width & (IntSize - 1)) == 0) {
697         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
698       }
699 
700       // Hardware ignores high bits, so remove those.
701       if (Width >= IntSize) {
702         return IC.replaceOperand(
703             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
704       }
705     }
706 
707     unsigned Offset;
708     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
709     if (COffset) {
710       Offset = COffset->getZExtValue();
711       if (Offset >= IntSize) {
712         return IC.replaceOperand(
713             II, 1,
714             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
715       }
716     }
717 
718     bool Signed = IID == Intrinsic::amdgcn_sbfe;
719 
720     if (!CWidth || !COffset)
721       break;
722 
    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions would be poison
    // values, since the shift amount would be equal to the bit size.
726     assert(Width != 0);
727 
728     // TODO: This allows folding to undef when the hardware has specific
729     // behavior?
730     if (Offset + Width < IntSize) {
731       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
732       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
733                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
734       RightShift->takeName(&II);
735       return IC.replaceInstUsesWith(II, RightShift);
736     }
737 
738     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
739                                : IC.Builder.CreateLShr(Src, Offset);
740 
741     RightShift->takeName(&II);
742     return IC.replaceInstUsesWith(II, RightShift);
743   }
744   case Intrinsic::amdgcn_exp:
745   case Intrinsic::amdgcn_exp_row:
746   case Intrinsic::amdgcn_exp_compr: {
747     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
748     unsigned EnBits = En->getZExtValue();
749     if (EnBits == 0xf)
750       break; // All inputs enabled.
751 
752     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
753     bool Changed = false;
754     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
755       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
756           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
757         Value *Src = II.getArgOperand(I + 2);
758         if (!isa<UndefValue>(Src)) {
759           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
760           Changed = true;
761         }
762       }
763     }
764 
765     if (Changed) {
766       return &II;
767     }
768 
769     break;
770   }
771   case Intrinsic::amdgcn_fmed3: {
772     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
773     // for the shader.
774 
775     Value *Src0 = II.getArgOperand(0);
776     Value *Src1 = II.getArgOperand(1);
777     Value *Src2 = II.getArgOperand(2);
778 
779     // Checking for NaN before canonicalization provides better fidelity when
780     // mapping other operations onto fmed3 since the order of operands is
781     // unchanged.
782     Value *V = nullptr;
783     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
784       V = IC.Builder.CreateMinNum(Src1, Src2);
785     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
786       V = IC.Builder.CreateMinNum(Src0, Src2);
787     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
788       V = IC.Builder.CreateMaxNum(Src0, Src1);
789     }
790 
791     if (V) {
792       if (auto *CI = dyn_cast<CallInst>(V)) {
793         CI->copyFastMathFlags(&II);
794         CI->takeName(&II);
795       }
796       return IC.replaceInstUsesWith(II, V);
797     }
798 
799     bool Swap = false;
800     // Canonicalize constants to RHS operands.
801     //
802     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
803     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
804       std::swap(Src0, Src1);
805       Swap = true;
806     }
807 
808     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
809       std::swap(Src1, Src2);
810       Swap = true;
811     }
812 
813     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
814       std::swap(Src0, Src1);
815       Swap = true;
816     }
817 
818     if (Swap) {
819       II.setArgOperand(0, Src0);
820       II.setArgOperand(1, Src1);
821       II.setArgOperand(2, Src2);
822       return &II;
823     }
824 
825     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
826       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
827         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
828           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
829                                        C2->getValueAPF());
830           return IC.replaceInstUsesWith(
831               II, ConstantFP::get(IC.Builder.getContext(), Result));
832         }
833       }
834     }
835 
836     if (!ST->hasMed3_16())
837       break;
838 
839     // Repeat floating-point width reduction done for minnum/maxnum.
840     // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
841     if (Value *X = matchFPExtFromF16(Src0)) {
842       if (Value *Y = matchFPExtFromF16(Src1)) {
843         if (Value *Z = matchFPExtFromF16(Src2)) {
844           Value *NewCall = IC.Builder.CreateIntrinsic(
845               IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
846           return new FPExtInst(NewCall, II.getType());
847         }
848       }
849     }
850 
851     break;
852   }
853   case Intrinsic::amdgcn_icmp:
854   case Intrinsic::amdgcn_fcmp: {
855     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
856     // Guard against invalid arguments.
857     int64_t CCVal = CC->getZExtValue();
858     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
859     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
860                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
861         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
862                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
863       break;
864 
865     Value *Src0 = II.getArgOperand(0);
866     Value *Src1 = II.getArgOperand(1);
867 
868     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
869       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
870         Constant *CCmp = ConstantFoldCompareInstOperands(
871             (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
872         if (CCmp && CCmp->isNullValue()) {
873           return IC.replaceInstUsesWith(
874               II, IC.Builder.CreateSExt(CCmp, II.getType()));
875         }
876 
877         // The result of V_ICMP/V_FCMP assembly instructions (which this
878         // intrinsic exposes) is one bit per thread, masked with the EXEC
879         // register (which contains the bitmask of live threads). So a
880         // comparison that always returns true is the same as a read of the
881         // EXEC register.
882         Function *NewF = Intrinsic::getDeclaration(
883             II.getModule(), Intrinsic::read_register, II.getType());
884         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
885         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
886         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
887         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
888         NewCall->addFnAttr(Attribute::Convergent);
889         NewCall->takeName(&II);
890         return IC.replaceInstUsesWith(II, NewCall);
891       }
892 
893       // Canonicalize constants to RHS.
894       CmpInst::Predicate SwapPred =
895           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
896       II.setArgOperand(0, Src1);
897       II.setArgOperand(1, Src0);
898       II.setArgOperand(
899           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
900       return &II;
901     }
902 
903     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
904       break;
905 
906     // Canonicalize compare eq with true value to compare != 0
907     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
908     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
909     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
910     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
911     Value *ExtSrc;
912     if (CCVal == CmpInst::ICMP_EQ &&
913         ((match(Src1, PatternMatch::m_One()) &&
914           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
915          (match(Src1, PatternMatch::m_AllOnes()) &&
916           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
917         ExtSrc->getType()->isIntegerTy(1)) {
918       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
919       IC.replaceOperand(II, 2,
920                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
921       return &II;
922     }
923 
924     CmpInst::Predicate SrcPred;
925     Value *SrcLHS;
926     Value *SrcRHS;
927 
928     // Fold compare eq/ne with 0 from a compare result as the predicate to the
929     // intrinsic. The typical use is a wave vote function in the library, which
930     // will be fed from a user code condition compared with 0. Fold in the
931     // redundant compare.
932 
933     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
934     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
935     //
936     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
937     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
938     if (match(Src1, PatternMatch::m_Zero()) &&
939         match(Src0, PatternMatch::m_ZExtOrSExt(
940                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
941                               PatternMatch::m_Value(SrcRHS))))) {
942       if (CCVal == CmpInst::ICMP_EQ)
943         SrcPred = CmpInst::getInversePredicate(SrcPred);
944 
945       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
946                                  ? Intrinsic::amdgcn_fcmp
947                                  : Intrinsic::amdgcn_icmp;
948 
949       Type *Ty = SrcLHS->getType();
950       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
951         // Promote to next legal integer type.
952         unsigned Width = CmpType->getBitWidth();
953         unsigned NewWidth = Width;
954 
955         // Don't do anything for i1 comparisons.
956         if (Width == 1)
957           break;
958 
959         if (Width <= 16)
960           NewWidth = 16;
961         else if (Width <= 32)
962           NewWidth = 32;
963         else if (Width <= 64)
964           NewWidth = 64;
965         else
966           break; // Can't handle this.
967 
968         if (Width != NewWidth) {
969           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
970           if (CmpInst::isSigned(SrcPred)) {
971             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
972             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
973           } else {
974             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
975             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
976           }
977         }
978       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
979         break;
980 
981       Function *NewF = Intrinsic::getDeclaration(
982           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
983       Value *Args[] = {SrcLHS, SrcRHS,
984                        ConstantInt::get(CC->getType(), SrcPred)};
985       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
986       NewCall->takeName(&II);
987       return IC.replaceInstUsesWith(II, NewCall);
988     }
989 
990     break;
991   }
992   case Intrinsic::amdgcn_mbcnt_hi: {
993     // exec_hi is all 0, so this is just a copy.
994     if (ST->isWave32())
995       return IC.replaceInstUsesWith(II, II.getArgOperand(1));
996     break;
997   }
998   case Intrinsic::amdgcn_ballot: {
999     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1000       if (Src->isZero()) {
1001         // amdgcn.ballot(i1 0) is zero.
1002         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1003       }
1004     }
1005     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1006       // %b64 = call i64 ballot.i64(...)
1007       // =>
1008       // %b32 = call i32 ballot.i32(...)
1009       // %b64 = zext i32 %b32 to i64
1010       Value *Call = IC.Builder.CreateZExt(
1011           IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1012                                      {IC.Builder.getInt32Ty()},
1013                                      {II.getArgOperand(0)}),
1014           II.getType());
1015       Call->takeName(&II);
1016       return IC.replaceInstUsesWith(II, Call);
1017     }
1018     break;
1019   }
1020   case Intrinsic::amdgcn_wqm_vote: {
1021     // wqm_vote is identity when the argument is constant.
1022     if (!isa<Constant>(II.getArgOperand(0)))
1023       break;
1024 
1025     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1026   }
1027   case Intrinsic::amdgcn_kill: {
1028     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1029     if (!C || !C->getZExtValue())
1030       break;
1031 
1032     // amdgcn.kill(i1 1) is a no-op
1033     return IC.eraseInstFromFunction(II);
1034   }
1035   case Intrinsic::amdgcn_update_dpp: {
1036     Value *Old = II.getArgOperand(0);
1037 
1038     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1039     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1040     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1041     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1042         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1043       break;
1044 
    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
1046     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1047   }
1048   case Intrinsic::amdgcn_permlane16:
1049   case Intrinsic::amdgcn_permlane16_var:
1050   case Intrinsic::amdgcn_permlanex16:
1051   case Intrinsic::amdgcn_permlanex16_var: {
1052     // Discard vdst_in if it's not going to be read.
1053     Value *VDstIn = II.getArgOperand(0);
1054     if (isa<UndefValue>(VDstIn))
1055       break;
1056 
1057     // FetchInvalid operand idx.
1058     unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1059                           IID == Intrinsic::amdgcn_permlanex16)
1060                              ? 4  /* for permlane16 and permlanex16 */
1061                              : 3; /* for permlane16_var and permlanex16_var */
1062 
1063     // BoundCtrl operand idx.
1064     // For permlane16 and permlanex16 it should be 5
1065     // For Permlane16_var and permlanex16_var it should be 4
1066     unsigned int BcIdx = FiIdx + 1;
1067 
1068     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1069     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1070     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1071       break;
1072 
1073     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1074   }
1075   case Intrinsic::amdgcn_permlane64:
1076   case Intrinsic::amdgcn_readfirstlane:
1077   case Intrinsic::amdgcn_readlane: {
1078     // If the first argument is uniform these intrinsics return it unchanged.
1079     const Use &Src = II.getArgOperandUse(0);
1080     if (isTriviallyUniform(Src))
1081       return IC.replaceInstUsesWith(II, Src.get());
1082     break;
1083   }
1084   case Intrinsic::amdgcn_trig_preop: {
1085     // The intrinsic is declared with name mangling, but currently the
1086     // instruction only exists for f64
1087     if (!II.getType()->isDoubleTy())
1088       break;
1089 
1090     Value *Src = II.getArgOperand(0);
1091     Value *Segment = II.getArgOperand(1);
1092     if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1093       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1094 
1095     if (isa<UndefValue>(Src)) {
1096       auto *QNaN = ConstantFP::get(
1097           II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1098       return IC.replaceInstUsesWith(II, QNaN);
1099     }
1100 
1101     const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1102     if (!Csrc)
1103       break;
1104 
1105     if (II.isStrictFP())
1106       break;
1107 
1108     const APFloat &Fsrc = Csrc->getValueAPF();
1109     if (Fsrc.isNaN()) {
1110       auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1111       return IC.replaceInstUsesWith(II, Quieted);
1112     }
1113 
1114     const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1115     if (!Cseg)
1116       break;
1117 
1118     unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1119     unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1120     unsigned Shift = SegmentVal * 53;
1121     if (Exponent > 1077)
1122       Shift += Exponent - 1077;
1123 
1124     // 2.0/PI table.
1125     static const uint32_t TwoByPi[] = {
1126         0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1127         0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1128         0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1129         0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1130         0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1131         0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1132         0x56033046};
1133 
    // Return 0 for an out-of-bounds segment (hardware behavior).
1135     unsigned Idx = Shift >> 5;
1136     if (Idx + 2 >= std::size(TwoByPi)) {
1137       APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1138       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1139     }
1140 
1141     unsigned BShift = Shift & 0x1f;
1142     uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1143     uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1144     if (BShift)
1145       Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1146     Thi = Thi >> 11;
1147     APFloat Result = APFloat((double)Thi);
1148 
1149     int Scale = -53 - Shift;
1150     if (Exponent >= 1968)
1151       Scale += 128;
1152 
1153     Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1154     return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1155   }
1156   case Intrinsic::amdgcn_fmul_legacy: {
1157     Value *Op0 = II.getArgOperand(0);
1158     Value *Op1 = II.getArgOperand(1);
1159 
1160     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1161     // infinity, gives +0.0.
1162     // TODO: Move to InstSimplify?
1163     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1164         match(Op1, PatternMatch::m_AnyZeroFP()))
1165       return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1166 
1167     // If we can prove we don't have one of the special cases then we can use a
1168     // normal fmul instruction instead.
1169     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1170       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1171       FMul->takeName(&II);
1172       return IC.replaceInstUsesWith(II, FMul);
1173     }
1174     break;
1175   }
1176   case Intrinsic::amdgcn_fma_legacy: {
1177     Value *Op0 = II.getArgOperand(0);
1178     Value *Op1 = II.getArgOperand(1);
1179     Value *Op2 = II.getArgOperand(2);
1180 
1181     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1182     // infinity, gives +0.0.
1183     // TODO: Move to InstSimplify?
1184     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1185         match(Op1, PatternMatch::m_AnyZeroFP())) {
1186       // It's tempting to just return Op2 here, but that would give the wrong
1187       // result if Op2 was -0.0.
1188       auto *Zero = ConstantFP::getZero(II.getType());
1189       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1190       FAdd->takeName(&II);
1191       return IC.replaceInstUsesWith(II, FAdd);
1192     }
1193 
1194     // If we can prove we don't have one of the special cases then we can use a
1195     // normal fma instead.
1196     if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1197       II.setCalledOperand(Intrinsic::getDeclaration(
1198           II.getModule(), Intrinsic::fma, II.getType()));
1199       return &II;
1200     }
1201     break;
1202   }
1203   case Intrinsic::amdgcn_is_shared:
1204   case Intrinsic::amdgcn_is_private: {
1205     if (isa<UndefValue>(II.getArgOperand(0)))
1206       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1207 
1208     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1209       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1210     break;
1211   }
1212   case Intrinsic::amdgcn_raw_buffer_store_format:
1213   case Intrinsic::amdgcn_struct_buffer_store_format:
1214   case Intrinsic::amdgcn_raw_tbuffer_store:
1215   case Intrinsic::amdgcn_struct_tbuffer_store:
1216   case Intrinsic::amdgcn_image_store_1d:
1217   case Intrinsic::amdgcn_image_store_1darray:
1218   case Intrinsic::amdgcn_image_store_2d:
1219   case Intrinsic::amdgcn_image_store_2darray:
1220   case Intrinsic::amdgcn_image_store_2darraymsaa:
1221   case Intrinsic::amdgcn_image_store_2dmsaa:
1222   case Intrinsic::amdgcn_image_store_3d:
1223   case Intrinsic::amdgcn_image_store_cube:
1224   case Intrinsic::amdgcn_image_store_mip_1d:
1225   case Intrinsic::amdgcn_image_store_mip_1darray:
1226   case Intrinsic::amdgcn_image_store_mip_2d:
1227   case Intrinsic::amdgcn_image_store_mip_2darray:
1228   case Intrinsic::amdgcn_image_store_mip_3d:
1229   case Intrinsic::amdgcn_image_store_mip_cube: {
1230     if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1231       break;
1232 
1233     APInt DemandedElts;
1234     if (ST->hasDefaultComponentBroadcast())
1235       DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1236     else if (ST->hasDefaultComponentZero())
1237       DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1238     else
1239       break;
1240 
1241     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1242     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1243                                               false)) {
1244       return IC.eraseInstFromFunction(II);
1245     }
1246 
1247     break;
1248   }
1249   }
1250   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1251             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1252     return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1253   }
1254   return std::nullopt;
1255 }
1256 
1257 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1258 ///
/// For amdgcn image and buffer store intrinsics, simplification updates the
/// definition of the intrinsic's vector argument rather than the uses of the
/// result, as is done for image and buffer loads.
1262 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1263 ///       struct returns.
1264 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1265                                                     IntrinsicInst &II,
1266                                                     APInt DemandedElts,
1267                                                     int DMaskIdx, bool IsLoad) {
1268 
1269   auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1270                                              : II.getOperand(0)->getType());
1271   unsigned VWidth = IIVTy->getNumElements();
1272   if (VWidth == 1)
1273     return nullptr;
1274   Type *EltTy = IIVTy->getElementType();
1275 
1276   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1277   IC.Builder.SetInsertPoint(&II);
1278 
1279   // Assume the arguments are unchanged and later override them, if needed.
1280   SmallVector<Value *, 16> Args(II.args());
1281 
1282   if (DMaskIdx < 0) {
1283     // Buffer case.
1284 
1285     const unsigned ActiveBits = DemandedElts.getActiveBits();
1286     const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1287 
    // Start by assuming the whole prefix of elements up to the last demanded
    // one is demanded; if there are unused components at the front and the
    // offset operand can be adjusted, clear those low bits and bump the
    // offset accordingly.
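    //
    // For example, a raw buffer load of <4 x float> where only element 2 is
    // used is shrunk to a single f32 load with the byte offset bumped by 8.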
1291     DemandedElts = (1 << ActiveBits) - 1;
1292 
1293     if (UnusedComponentsAtFront > 0) {
1294       static const unsigned InvalidOffsetIdx = 0xf;
1295 
1296       unsigned OffsetIdx;
1297       switch (II.getIntrinsicID()) {
1298       case Intrinsic::amdgcn_raw_buffer_load:
1299       case Intrinsic::amdgcn_raw_ptr_buffer_load:
1300         OffsetIdx = 1;
1301         break;
1302       case Intrinsic::amdgcn_s_buffer_load:
1303         // If resulting type is vec3, there is no point in trimming the
1304         // load with updated offset, as the vec3 would most likely be widened to
1305         // vec4 anyway during lowering.
1306         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1307           OffsetIdx = InvalidOffsetIdx;
1308         else
1309           OffsetIdx = 1;
1310         break;
1311       case Intrinsic::amdgcn_struct_buffer_load:
1312       case Intrinsic::amdgcn_struct_ptr_buffer_load:
1313         OffsetIdx = 2;
1314         break;
1315       default:
1316         // TODO: handle tbuffer* intrinsics.
1317         OffsetIdx = InvalidOffsetIdx;
1318         break;
1319       }
1320 
1321       if (OffsetIdx != InvalidOffsetIdx) {
1322         // Clear demanded bits and update the offset.
1323         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1324         auto *Offset = Args[OffsetIdx];
1325         unsigned SingleComponentSizeInBits =
1326             IC.getDataLayout().getTypeSizeInBits(EltTy);
1327         unsigned OffsetAdd =
1328             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1329         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1330         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1331       }
1332     }
1333   } else {
1334     // Image case.
1335 
1336     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1337     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1338 
1339     // dmask 0 has special semantics, do not simplify.
1340     if (DMaskVal == 0)
1341       return nullptr;
1342 
1343     // Mask off values that are undefined because the dmask doesn't cover them
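    //
    // For example, an image load with dmask 0xf whose last two results are
    // never used is rewritten with dmask 0x3 and a <2 x float> return type
    // (assuming a float element type).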
1344     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1345 
1346     unsigned NewDMaskVal = 0;
1347     unsigned OrigLdStIdx = 0;
1348     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1349       const unsigned Bit = 1 << SrcIdx;
1350       if (!!(DMaskVal & Bit)) {
1351         if (!!DemandedElts[OrigLdStIdx])
1352           NewDMaskVal |= Bit;
1353         OrigLdStIdx++;
1354       }
1355     }
1356 
1357     if (DMaskVal != NewDMaskVal)
1358       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1359   }
1360 
1361   unsigned NewNumElts = DemandedElts.popcount();
1362   if (!NewNumElts)
1363     return PoisonValue::get(IIVTy);
1364 
1365   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1366     if (DMaskIdx >= 0)
1367       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1368     return nullptr;
1369   }
1370 
1371   // Validate function argument and return types, extracting overloaded types
1372   // along the way.
1373   SmallVector<Type *, 6> OverloadTys;
1374   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1375     return nullptr;
1376 
1377   Type *NewTy =
1378       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1379   OverloadTys[0] = NewTy;
1380 
1381   if (!IsLoad) {
1382     SmallVector<int, 8> EltMask;
1383     for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1384       if (DemandedElts[OrigStoreIdx])
1385         EltMask.push_back(OrigStoreIdx);
1386 
1387     if (NewNumElts == 1)
1388       Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1389     else
1390       Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1391   }
1392 
1393   Function *NewIntrin = Intrinsic::getDeclaration(
1394       II.getModule(), II.getIntrinsicID(), OverloadTys);
1395   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1396   NewCall->takeName(&II);
1397   NewCall->copyMetadata(II);
1398 
1399   if (IsLoad) {
1400     if (NewNumElts == 1) {
1401       return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1402                                             DemandedElts.countr_zero());
1403     }
1404 
1405     SmallVector<int, 8> EltMask;
1406     unsigned NewLoadIdx = 0;
1407     for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1408       if (!!DemandedElts[OrigLoadIdx])
1409         EltMask.push_back(NewLoadIdx++);
1410       else
1411         EltMask.push_back(NewNumElts);
1412     }
1413 
1414     auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1415 
1416     return Shuffle;
1417   }
1418 
1419   return NewCall;
1420 }
1421 
1422 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1423     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1424     APInt &UndefElts2, APInt &UndefElts3,
1425     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1426         SimplifyAndSetOp) const {
1427   switch (II.getIntrinsicID()) {
1428   case Intrinsic::amdgcn_raw_buffer_load:
1429   case Intrinsic::amdgcn_raw_ptr_buffer_load:
1430   case Intrinsic::amdgcn_raw_buffer_load_format:
1431   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1432   case Intrinsic::amdgcn_raw_tbuffer_load:
1433   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1434   case Intrinsic::amdgcn_s_buffer_load:
1435   case Intrinsic::amdgcn_struct_buffer_load:
1436   case Intrinsic::amdgcn_struct_ptr_buffer_load:
1437   case Intrinsic::amdgcn_struct_buffer_load_format:
1438   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1439   case Intrinsic::amdgcn_struct_tbuffer_load:
1440   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1441     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1442   default: {
1443     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1444       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1445     }
1446     break;
1447   }
1448   }
1449   return std::nullopt;
1450 }
1451