xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (revision d1762fc454c0d7ee0bcffe87e798f67b6c43c1d2)
1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements AMDGPU-specific InstCombine simplifications. It uses
11 // the target's detailed information to fold and rewrite calls to AMDGPU
12 // intrinsics, while the target-independent InstCombine transforms handle the
13 // rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/IR/IntrinsicsAMDGPU.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "AMDGPUtti"
26 
27 namespace {
28 
29 struct AMDGPUImageDMaskIntrinsic {
30   unsigned Intr;
31 };
32 
33 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
34 #include "InstCombineTables.inc"
35 
36 } // end anonymous namespace
37 
38 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
39 //
40 // A single NaN input is folded to minnum, so we rely on that folding for
41 // handling NaNs.
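//
// For example, fmed3(1.0, 5.0, 3.0): Max3 is 5.0, which compares equal to
// Src1, so the result is maxnum(1.0, 3.0) = 3.0, the median of the inputs.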
42 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
43                            const APFloat &Src2) {
44   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
45 
46   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
47   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
48   if (Cmp0 == APFloat::cmpEqual)
49     return maxnum(Src1, Src2);
50 
51   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
52   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
53   if (Cmp1 == APFloat::cmpEqual)
54     return maxnum(Src0, Src2);
55 
56   return maxnum(Src0, Src1);
57 }
58 
59 // Check if a value can be converted to a 16-bit value without losing
60 // precision.
61 // The value is expected to be either a float (IsFloat = true) or an unsigned
62 // integer (IsFloat = false).
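// This holds for an fpext from half / zext from i16, and for constants such
// as 2.0 or 255 that round-trip through the narrower type exactly.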
63 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
64   Type *VTy = V.getType();
65   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
66     // The value is already 16-bit, so we don't want to convert to 16-bit again!
67     return false;
68   }
69   if (IsFloat) {
70     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
71       // We need to check that if we cast the value down to a half, we do not
72       // lose precision.
73       APFloat FloatValue(ConstFloat->getValueAPF());
74       bool LosesInfo = true;
75       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
76                          &LosesInfo);
77       return !LosesInfo;
78     }
79   } else {
80     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
81       // We need to check that if we cast the value down to an i16, we do not
82       // lose precision.
83       APInt IntValue(ConstInt->getValue());
84       return IntValue.getActiveBits() <= 16;
85     }
86   }
87 
88   Value *CastSrc;
89   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
90                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
91   if (IsExt) {
92     Type *CastSrcTy = CastSrc->getType();
93     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
94       return true;
95   }
96 
97   return false;
98 }
99 
100 // Convert a value to 16-bit.
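// If the value is itself an extension from a 16-bit type, the cast source is
// returned directly; otherwise a trunc/fptrunc is emitted via the builder.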
101 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
102   Type *VTy = V.getType();
103   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
104     return cast<Instruction>(&V)->getOperand(0);
105   if (VTy->isIntegerTy())
106     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
107   if (VTy->isFloatingPointTy())
108     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
109 
110   llvm_unreachable("Should never be called!");
111 }
112 
113 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
114 /// modified arguments (based on OldIntr) and replaces InstToReplace with
115 /// this newly created intrinsic call.
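///
/// Callers only need to adjust Args and ArgTys in the callback; for example,
/// the _l -> _lz rewrite below simply erases the lod operand from Args.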
116 static Optional<Instruction *> modifyIntrinsicCall(
117     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
118     InstCombiner &IC,
119     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
120         Func) {
121   SmallVector<Type *, 4> ArgTys;
122   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
123     return None;
124 
125   SmallVector<Value *, 8> Args(OldIntr.args());
126 
127   // Modify arguments and types
128   Func(Args, ArgTys);
129 
130   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
131 
132   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
133   NewCall->takeName(&OldIntr);
134   NewCall->copyMetadata(OldIntr);
135   if (isa<FPMathOperator>(NewCall))
136     NewCall->copyFastMathFlags(&OldIntr);
137 
138   // Erase and replace uses
139   if (!InstToReplace.getType()->isVoidTy())
140     IC.replaceInstUsesWith(InstToReplace, NewCall);
141 
142   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
143   if (!OldIntr.isIdenticalTo(&InstToReplace))
144     IC.eraseInstFromFunction(OldIntr);
145 
146   return RetValue;
147 }
148 
149 static Optional<Instruction *>
150 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
151                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
152                              IntrinsicInst &II, InstCombiner &IC) {
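  // The first four cases below drop an operand that is known to be redundant:
  // a non-positive constant lod for _l, a zero mip level for _mip, a zero
  // bias for _b and a zero offset for _o. Each rewrites the call to the
  // simpler intrinsic variant via modifyIntrinsicCall.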
153   // Optimize _L to _LZ when _L is zero
154   if (const auto *LZMappingInfo =
155           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
156     if (auto *ConstantLod =
157             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
158       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
159         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
160             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
161                                                      ImageDimIntr->Dim);
162         return modifyIntrinsicCall(
163             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
164               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
165             });
166       }
167     }
168   }
169 
170   // Optimize _mip away, when 'lod' is zero
171   if (const auto *MIPMappingInfo =
172           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
173     if (auto *ConstantMip =
174             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
175       if (ConstantMip->isZero()) {
176         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
177             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
178                                                      ImageDimIntr->Dim);
179         return modifyIntrinsicCall(
180             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
181               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
182             });
183       }
184     }
185   }
186 
187   // Optimize _bias away when 'bias' is zero
188   if (const auto *BiasMappingInfo =
189           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
190     if (auto *ConstantBias =
191             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
192       if (ConstantBias->isZero()) {
193         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
194             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
195                                                      ImageDimIntr->Dim);
196         return modifyIntrinsicCall(
197             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
198               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
199               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
200             });
201       }
202     }
203   }
204 
205   // Optimize _offset away when 'offset' is zero
206   if (const auto *OffsetMappingInfo =
207           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
208     if (auto *ConstantOffset =
209             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
210       if (ConstantOffset->isZero()) {
211         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
212             AMDGPU::getImageDimIntrinsicByBaseOpcode(
213                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
214         return modifyIntrinsicCall(
215             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
216               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
217             });
218       }
219     }
220   }
221 
222   // Try to use D16
223   if (ST->hasD16Images()) {
224 
225     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
226         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
227 
228     if (BaseOpcode->HasD16) {
229 
230       // If the only use of the image intrinsic is an fptrunc to half, then
231       // both the fptrunc and the image intrinsic are replaced with an image
232       // intrinsic carrying the D16 flag.
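      //
      // Illustrative IR (value names are hypothetical):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes a single call returning <4 x half>.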
233       if (II.hasOneUse()) {
234         Instruction *User = II.user_back();
235 
236         if (User->getOpcode() == Instruction::FPTrunc &&
237             User->getType()->getScalarType()->isHalfTy()) {
238 
239           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
240                                      [&](auto &Args, auto &ArgTys) {
241                                        // Change return type of image intrinsic.
242                                        // Set it to return type of fptrunc.
243                                        ArgTys[0] = User->getType();
244                                      });
245         }
246       }
247     }
248   }
249 
250   // Try to use A16 or G16
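  // (A16 lets the address/coordinate operands be 16-bit; G16 only allows the
  // gradient operands to be 16-bit.)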
251   if (!ST->hasA16() && !ST->hasG16())
252     return None;
253 
254   // Address is interpreted as float if the instruction has a sampler or as
255   // unsigned int if there is no sampler.
256   bool HasSampler =
257       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
258   bool FloatCoord = false;
259   // true means derivatives can be converted to 16 bit, coordinates not
260   bool OnlyDerivatives = false;
261 
262   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
263        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
264     Value *Coord = II.getOperand(OperandIndex);
265     // If the values are not derived from 16-bit values, we cannot optimize.
266     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
267       if (OperandIndex < ImageDimIntr->CoordStart ||
268           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
269         return None;
270       }
271       // All gradients can be converted, so convert only them
272       OnlyDerivatives = true;
273       break;
274     }
275 
276     assert(OperandIndex == ImageDimIntr->GradientStart ||
277            FloatCoord == Coord->getType()->isFloatingPointTy());
278     FloatCoord = Coord->getType()->isFloatingPointTy();
279   }
280 
281   if (!OnlyDerivatives && !ST->hasA16())
282     OnlyDerivatives = true; // Only supports G16
283 
284   // Check if there is a bias parameter and if it can be converted to f16
285   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
286     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
287     assert(HasSampler &&
288            "Only image instructions with a sampler can have a bias");
289     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
290       OnlyDerivatives = true;
291   }
292 
293   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
294                                                ImageDimIntr->CoordStart))
295     return None;
296 
297   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
298                                : Type::getInt16Ty(II.getContext());
299 
300   return modifyIntrinsicCall(
301       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
302         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
303         if (!OnlyDerivatives) {
304           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
305 
306           // Change the bias type
307           if (ImageDimIntr->NumBiasArgs != 0)
308             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
309         }
310 
311         unsigned EndIndex =
312             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
313         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
314              OperandIndex < EndIndex; OperandIndex++) {
315           Args[OperandIndex] =
316               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
317         }
318 
319         // Convert the bias
320         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
321           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
322           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
323         }
324       });
325 }
326 
327 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
328                                            InstCombiner &IC) const {
329   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
330   // infinity, gives +0.0. If we can prove we don't have one of the special
331   // cases then we can use a normal multiply instead.
332   // TODO: Create and use isKnownFiniteNonZero instead of just matching
333   // constants here.
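  //
  // For example, fmul_legacy(x, 2.0) is simplified to a plain fmul, since the
  // constant 2.0 matches m_FiniteNonZero; fmul_legacy(x, y) with two unknown
  // operands is not.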
334   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
335       match(Op1, PatternMatch::m_FiniteNonZero())) {
336     // One operand is not zero or infinity or NaN.
337     return true;
338   }
339   auto *TLI = &IC.getTargetLibraryInfo();
340   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
341       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
342     // Neither operand is infinity or NaN.
343     return true;
344   }
345   return false;
346 }
347 
348 Optional<Instruction *>
349 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
350   Intrinsic::ID IID = II.getIntrinsicID();
351   switch (IID) {
352   case Intrinsic::amdgcn_rcp: {
353     Value *Src = II.getArgOperand(0);
354 
355     // TODO: Move to ConstantFolding/InstSimplify?
356     if (isa<UndefValue>(Src)) {
357       Type *Ty = II.getType();
358       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
359       return IC.replaceInstUsesWith(II, QNaN);
360     }
361 
362     if (II.isStrictFP())
363       break;
364 
365     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
366       const APFloat &ArgVal = C->getValueAPF();
367       APFloat Val(ArgVal.getSemantics(), 1);
368       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
369 
370       // This is more precise than the instruction may give.
371       //
372       // TODO: The instruction always flushes denormal results (except for f16),
373       // should this also?
374       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
375     }
376 
377     break;
378   }
379   case Intrinsic::amdgcn_rsq: {
380     Value *Src = II.getArgOperand(0);
381 
382     // TODO: Move to ConstantFolding/InstSimplify?
383     if (isa<UndefValue>(Src)) {
384       Type *Ty = II.getType();
385       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
386       return IC.replaceInstUsesWith(II, QNaN);
387     }
388 
389     break;
390   }
391   case Intrinsic::amdgcn_frexp_mant:
392   case Intrinsic::amdgcn_frexp_exp: {
393     Value *Src = II.getArgOperand(0);
394     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
395       int Exp;
396       APFloat Significand =
397           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
398 
399       if (IID == Intrinsic::amdgcn_frexp_mant) {
400         return IC.replaceInstUsesWith(
401             II, ConstantFP::get(II.getContext(), Significand));
402       }
403 
404       // Match instruction special case behavior.
405       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
406         Exp = 0;
407 
408       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
409     }
410 
411     if (isa<UndefValue>(Src)) {
412       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
413     }
414 
415     break;
416   }
417   case Intrinsic::amdgcn_class: {
418     enum {
419       S_NAN = 1 << 0,       // Signaling NaN
420       Q_NAN = 1 << 1,       // Quiet NaN
421       N_INFINITY = 1 << 2,  // Negative infinity
422       N_NORMAL = 1 << 3,    // Negative normal
423       N_SUBNORMAL = 1 << 4, // Negative subnormal
424       N_ZERO = 1 << 5,      // Negative zero
425       P_ZERO = 1 << 6,      // Positive zero
426       P_SUBNORMAL = 1 << 7, // Positive subnormal
427       P_NORMAL = 1 << 8,    // Positive normal
428       P_INFINITY = 1 << 9   // Positive infinity
429     };
430 
431     const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
432                               N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
433                               P_NORMAL | P_INFINITY;
434 
435     Value *Src0 = II.getArgOperand(0);
436     Value *Src1 = II.getArgOperand(1);
437     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
438     if (!CMask) {
439       if (isa<UndefValue>(Src0)) {
440         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
441       }
442 
443       if (isa<UndefValue>(Src1)) {
444         return IC.replaceInstUsesWith(II,
445                                       ConstantInt::get(II.getType(), false));
446       }
447       break;
448     }
449 
450     uint32_t Mask = CMask->getZExtValue();
451 
452     // If all tests are made, it doesn't matter what the value is.
453     if ((Mask & FullMask) == FullMask) {
454       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
455     }
456 
457     if ((Mask & FullMask) == 0) {
458       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
459     }
460 
461     if (Mask == (S_NAN | Q_NAN)) {
462       // Equivalent of isnan. Replace with standard fcmp.
463       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
464       FCmp->takeName(&II);
465       return IC.replaceInstUsesWith(II, FCmp);
466     }
467 
468     if (Mask == (N_ZERO | P_ZERO)) {
469       // Equivalent of == 0.
470       Value *FCmp =
471           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
472 
473       FCmp->takeName(&II);
474       return IC.replaceInstUsesWith(II, FCmp);
475     }
476 
477     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
478     if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
479         isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
480       return IC.replaceOperand(
481           II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
482     }
483 
484     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
485     if (!CVal) {
486       if (isa<UndefValue>(Src0)) {
487         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
488       }
489 
490       // Clamp mask to used bits
491       if ((Mask & FullMask) != Mask) {
492         CallInst *NewCall = IC.Builder.CreateCall(
493             II.getCalledFunction(),
494             {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
495 
496         NewCall->takeName(&II);
497         return IC.replaceInstUsesWith(II, NewCall);
498       }
499 
500       break;
501     }
502 
503     const APFloat &Val = CVal->getValueAPF();
504 
505     bool Result =
506         ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
507         ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
508         ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
509         ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
510         ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
511         ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
512         ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
513         ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
514         ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
515         ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
516 
517     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
518   }
519   case Intrinsic::amdgcn_cvt_pkrtz: {
520     Value *Src0 = II.getArgOperand(0);
521     Value *Src1 = II.getArgOperand(1);
522     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
523       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
524         const fltSemantics &HalfSem =
525             II.getType()->getScalarType()->getFltSemantics();
526         bool LosesInfo;
527         APFloat Val0 = C0->getValueAPF();
528         APFloat Val1 = C1->getValueAPF();
529         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
530         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
531 
532         Constant *Folded =
533             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
534                                  ConstantFP::get(II.getContext(), Val1)});
535         return IC.replaceInstUsesWith(II, Folded);
536       }
537     }
538 
539     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
540       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
541     }
542 
543     break;
544   }
545   case Intrinsic::amdgcn_cvt_pknorm_i16:
546   case Intrinsic::amdgcn_cvt_pknorm_u16:
547   case Intrinsic::amdgcn_cvt_pk_i16:
548   case Intrinsic::amdgcn_cvt_pk_u16: {
549     Value *Src0 = II.getArgOperand(0);
550     Value *Src1 = II.getArgOperand(1);
551 
552     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
553       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
554     }
555 
556     break;
557   }
558   case Intrinsic::amdgcn_ubfe:
559   case Intrinsic::amdgcn_sbfe: {
560     // Decompose simple cases into standard shifts.
561     Value *Src = II.getArgOperand(0);
562     if (isa<UndefValue>(Src)) {
563       return IC.replaceInstUsesWith(II, Src);
564     }
565 
566     unsigned Width;
567     Type *Ty = II.getType();
568     unsigned IntSize = Ty->getIntegerBitWidth();
569 
570     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
571     if (CWidth) {
572       Width = CWidth->getZExtValue();
573       if ((Width & (IntSize - 1)) == 0) {
574         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
575       }
576 
577       // Hardware ignores high bits, so remove those.
578       if (Width >= IntSize) {
579         return IC.replaceOperand(
580             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
581       }
582     }
583 
584     unsigned Offset;
585     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
586     if (COffset) {
587       Offset = COffset->getZExtValue();
588       if (Offset >= IntSize) {
589         return IC.replaceOperand(
590             II, 1,
591             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
592       }
593     }
594 
595     bool Signed = IID == Intrinsic::amdgcn_sbfe;
596 
597     if (!CWidth || !COffset)
598       break;
599 
600     // The case of Width == 0 is handled above, which makes this transformation
601     // safe.  If Width == 0, then the ashr and lshr instructions become poison
602     // values since the shift amount would be equal to the bit size.
603     assert(Width != 0);
604 
605     // TODO: This allows folding to undef when the hardware has specific
606     // behavior?
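    //
    // For example, ubfe(x, 8, 8) on i32 becomes lshr(shl(x, 16), 24): the
    // field is shifted to the top and then shifted back down (arithmetically
    // for sbfe, to sign-extend the field).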
607     if (Offset + Width < IntSize) {
608       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
609       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
610                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
611       RightShift->takeName(&II);
612       return IC.replaceInstUsesWith(II, RightShift);
613     }
614 
615     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
616                                : IC.Builder.CreateLShr(Src, Offset);
617 
618     RightShift->takeName(&II);
619     return IC.replaceInstUsesWith(II, RightShift);
620   }
621   case Intrinsic::amdgcn_exp:
622   case Intrinsic::amdgcn_exp_compr: {
623     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
624     unsigned EnBits = En->getZExtValue();
625     if (EnBits == 0xf)
626       break; // All inputs enabled.
627 
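    // Each enable bit gates one source operand (for exp_compr, a pair of
    // bits gates each of the two packed sources). A source whose enable bits
    // are clear is never read, so it can be replaced with undef.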
628     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
629     bool Changed = false;
630     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
631       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
632           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
633         Value *Src = II.getArgOperand(I + 2);
634         if (!isa<UndefValue>(Src)) {
635           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
636           Changed = true;
637         }
638       }
639     }
640 
641     if (Changed) {
642       return &II;
643     }
644 
645     break;
646   }
647   case Intrinsic::amdgcn_fmed3: {
648     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
649     // for the shader.
650 
651     Value *Src0 = II.getArgOperand(0);
652     Value *Src1 = II.getArgOperand(1);
653     Value *Src2 = II.getArgOperand(2);
654 
655     // Checking for NaN before canonicalization provides better fidelity when
656     // mapping other operations onto fmed3 since the order of operands is
657     // unchanged.
658     CallInst *NewCall = nullptr;
659     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
660       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
661     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
662       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
663     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
664       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
665     }
666 
667     if (NewCall) {
668       NewCall->copyFastMathFlags(&II);
669       NewCall->takeName(&II);
670       return IC.replaceInstUsesWith(II, NewCall);
671     }
672 
673     bool Swap = false;
674     // Canonicalize constants to RHS operands.
675     //
676     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
677     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
678       std::swap(Src0, Src1);
679       Swap = true;
680     }
681 
682     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
683       std::swap(Src1, Src2);
684       Swap = true;
685     }
686 
687     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
688       std::swap(Src0, Src1);
689       Swap = true;
690     }
691 
692     if (Swap) {
693       II.setArgOperand(0, Src0);
694       II.setArgOperand(1, Src1);
695       II.setArgOperand(2, Src2);
696       return &II;
697     }
698 
699     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
700       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
701         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
702           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
703                                        C2->getValueAPF());
704           return IC.replaceInstUsesWith(
705               II, ConstantFP::get(IC.Builder.getContext(), Result));
706         }
707       }
708     }
709 
710     break;
711   }
712   case Intrinsic::amdgcn_icmp:
713   case Intrinsic::amdgcn_fcmp: {
714     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
715     // Guard against invalid arguments.
716     int64_t CCVal = CC->getZExtValue();
717     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
718     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
719                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
720         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
721                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
722       break;
723 
724     Value *Src0 = II.getArgOperand(0);
725     Value *Src1 = II.getArgOperand(1);
726 
727     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
728       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
729         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
730         if (CCmp->isNullValue()) {
731           return IC.replaceInstUsesWith(
732               II, ConstantExpr::getSExt(CCmp, II.getType()));
733         }
734 
735         // The result of V_ICMP/V_FCMP assembly instructions (which this
736         // intrinsic exposes) is one bit per thread, masked with the EXEC
737         // register (which contains the bitmask of live threads). So a
738         // comparison that always returns true is the same as a read of the
739         // EXEC register.
740         Function *NewF = Intrinsic::getDeclaration(
741             II.getModule(), Intrinsic::read_register, II.getType());
742         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
743         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
744         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
745         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
746         NewCall->addFnAttr(Attribute::Convergent);
747         NewCall->takeName(&II);
748         return IC.replaceInstUsesWith(II, NewCall);
749       }
750 
751       // Canonicalize constants to RHS.
752       CmpInst::Predicate SwapPred =
753           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
754       II.setArgOperand(0, Src1);
755       II.setArgOperand(1, Src0);
756       II.setArgOperand(
757           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
758       return &II;
759     }
760 
761     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
762       break;
763 
764     // Canonicalize compare eq with true value to compare != 0
765     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
766     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
767     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
768     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
769     Value *ExtSrc;
770     if (CCVal == CmpInst::ICMP_EQ &&
771         ((match(Src1, PatternMatch::m_One()) &&
772           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
773          (match(Src1, PatternMatch::m_AllOnes()) &&
774           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
775         ExtSrc->getType()->isIntegerTy(1)) {
776       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
777       IC.replaceOperand(II, 2,
778                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
779       return &II;
780     }
781 
782     CmpInst::Predicate SrcPred;
783     Value *SrcLHS;
784     Value *SrcRHS;
785 
786     // Fold compare eq/ne with 0 from a compare result as the predicate to the
787     // intrinsic. The typical use is a wave vote function in the library, which
788     // will be fed from a user code condition compared with 0. Fold in the
789     // redundant compare.
790 
791     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
792     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
793     //
794     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
795     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
796     if (match(Src1, PatternMatch::m_Zero()) &&
797         match(Src0, PatternMatch::m_ZExtOrSExt(
798                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
799                               PatternMatch::m_Value(SrcRHS))))) {
800       if (CCVal == CmpInst::ICMP_EQ)
801         SrcPred = CmpInst::getInversePredicate(SrcPred);
802 
803       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
804                                  ? Intrinsic::amdgcn_fcmp
805                                  : Intrinsic::amdgcn_icmp;
806 
807       Type *Ty = SrcLHS->getType();
808       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
809         // Promote to next legal integer type.
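        // (The underlying V_CMP instructions only operate on 16/32/64-bit
        // values, so narrower compares are widened, sign- or zero-extending
        // the operands to match the predicate.)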
810         unsigned Width = CmpType->getBitWidth();
811         unsigned NewWidth = Width;
812 
813         // Don't do anything for i1 comparisons.
814         if (Width == 1)
815           break;
816 
817         if (Width <= 16)
818           NewWidth = 16;
819         else if (Width <= 32)
820           NewWidth = 32;
821         else if (Width <= 64)
822           NewWidth = 64;
823         else if (Width > 64)
824           break; // Can't handle this.
825 
826         if (Width != NewWidth) {
827           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
828           if (CmpInst::isSigned(SrcPred)) {
829             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
830             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
831           } else {
832             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
833             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
834           }
835         }
836       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
837         break;
838 
839       Function *NewF = Intrinsic::getDeclaration(
840           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
841       Value *Args[] = {SrcLHS, SrcRHS,
842                        ConstantInt::get(CC->getType(), SrcPred)};
843       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
844       NewCall->takeName(&II);
845       return IC.replaceInstUsesWith(II, NewCall);
846     }
847 
848     break;
849   }
850   case Intrinsic::amdgcn_ballot: {
851     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
852       if (Src->isZero()) {
853         // amdgcn.ballot(i1 0) is zero.
854         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
855       }
856 
857       if (Src->isOne()) {
858         // amdgcn.ballot(i1 1) is exec.
859         const char *RegName = "exec";
860         if (II.getType()->isIntegerTy(32))
861           RegName = "exec_lo";
862         else if (!II.getType()->isIntegerTy(64))
863           break;
864 
865         Function *NewF = Intrinsic::getDeclaration(
866             II.getModule(), Intrinsic::read_register, II.getType());
867         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
868         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
869         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
870         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
871         NewCall->addFnAttr(Attribute::Convergent);
872         NewCall->takeName(&II);
873         return IC.replaceInstUsesWith(II, NewCall);
874       }
875     }
876     break;
877   }
878   case Intrinsic::amdgcn_wqm_vote: {
879     // wqm_vote is identity when the argument is constant.
880     if (!isa<Constant>(II.getArgOperand(0)))
881       break;
882 
883     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
884   }
885   case Intrinsic::amdgcn_kill: {
886     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
887     if (!C || !C->getZExtValue())
888       break;
889 
890     // amdgcn.kill(i1 1) is a no-op
891     return IC.eraseInstFromFunction(II);
892   }
893   case Intrinsic::amdgcn_update_dpp: {
894     Value *Old = II.getArgOperand(0);
895 
896     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
897     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
898     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
899     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
900         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
901       break;
902 
903     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
904     // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
905   }
906   case Intrinsic::amdgcn_permlane16:
907   case Intrinsic::amdgcn_permlanex16: {
908     // Discard vdst_in if it's not going to be read.
909     Value *VDstIn = II.getArgOperand(0);
910     if (isa<UndefValue>(VDstIn))
911       break;
912 
913     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
914     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
915     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
916       break;
917 
918     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
919   }
920   case Intrinsic::amdgcn_readfirstlane:
921   case Intrinsic::amdgcn_readlane: {
922     // A constant value is trivially uniform.
923     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
924       return IC.replaceInstUsesWith(II, C);
925     }
926 
927     // The rest of these may not be safe if the exec may not be the same between
928     // the def and use.
929     Value *Src = II.getArgOperand(0);
930     Instruction *SrcInst = dyn_cast<Instruction>(Src);
931     if (SrcInst && SrcInst->getParent() != II.getParent())
932       break;
933 
934     // readfirstlane (readfirstlane x) -> readfirstlane x
935     // readlane (readfirstlane x), y -> readfirstlane x
936     if (match(Src,
937               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
938       return IC.replaceInstUsesWith(II, Src);
939     }
940 
941     if (IID == Intrinsic::amdgcn_readfirstlane) {
942       // readfirstlane (readlane x, y) -> readlane x, y
943       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
944         return IC.replaceInstUsesWith(II, Src);
945       }
946     } else {
947       // readlane (readlane x, y), y -> readlane x, y
948       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
949                          PatternMatch::m_Value(),
950                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
951         return IC.replaceInstUsesWith(II, Src);
952       }
953     }
954 
955     break;
956   }
957   case Intrinsic::amdgcn_ldexp: {
958     // FIXME: This doesn't introduce new instructions and belongs in
959     // InstructionSimplify.
960     Type *Ty = II.getType();
961     Value *Op0 = II.getArgOperand(0);
962     Value *Op1 = II.getArgOperand(1);
963 
964     // Folding undef to qnan is safe regardless of the FP mode.
965     if (isa<UndefValue>(Op0)) {
966       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
967       return IC.replaceInstUsesWith(II, QNaN);
968     }
969 
970     const APFloat *C = nullptr;
971     match(Op0, PatternMatch::m_APFloat(C));
972 
973     // FIXME: Should flush denorms depending on FP mode, but that's ignored
974     // everywhere else.
975     //
976     // These cases should be safe, even with strictfp.
977     // ldexp(0.0, x) -> 0.0
978     // ldexp(-0.0, x) -> -0.0
979     // ldexp(inf, x) -> inf
980     // ldexp(-inf, x) -> -inf
981     if (C && (C->isZero() || C->isInfinity())) {
982       return IC.replaceInstUsesWith(II, Op0);
983     }
984 
985     // With strictfp, be more careful about possibly needing to flush denormals
986     // or not, and snan behavior depends on ieee_mode.
987     if (II.isStrictFP())
988       break;
989 
990     if (C && C->isNaN()) {
991       // FIXME: We just need to make the nan quiet here, but that's unavailable
992       // on APFloat, only IEEEfloat
993       auto *Quieted =
994           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
995       return IC.replaceInstUsesWith(II, Quieted);
996     }
997 
998     // ldexp(x, 0) -> x
999     // ldexp(x, undef) -> x
1000     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
1001       return IC.replaceInstUsesWith(II, Op0);
1002     }
1003 
1004     break;
1005   }
1006   case Intrinsic::amdgcn_fmul_legacy: {
1007     Value *Op0 = II.getArgOperand(0);
1008     Value *Op1 = II.getArgOperand(1);
1009 
1010     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1011     // infinity, gives +0.0.
1012     // TODO: Move to InstSimplify?
1013     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1014         match(Op1, PatternMatch::m_AnyZeroFP()))
1015       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
1016 
1017     // If we can prove we don't have one of the special cases then we can use a
1018     // normal fmul instruction instead.
1019     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1020       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1021       FMul->takeName(&II);
1022       return IC.replaceInstUsesWith(II, FMul);
1023     }
1024     break;
1025   }
1026   case Intrinsic::amdgcn_fma_legacy: {
1027     Value *Op0 = II.getArgOperand(0);
1028     Value *Op1 = II.getArgOperand(1);
1029     Value *Op2 = II.getArgOperand(2);
1030 
1031     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1032     // infinity, gives +0.0.
1033     // TODO: Move to InstSimplify?
1034     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1035         match(Op1, PatternMatch::m_AnyZeroFP())) {
1036       // It's tempting to just return Op2 here, but that would give the wrong
1037       // result if Op2 was -0.0.
1038       auto *Zero = ConstantFP::getNullValue(II.getType());
1039       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1040       FAdd->takeName(&II);
1041       return IC.replaceInstUsesWith(II, FAdd);
1042     }
1043 
1044     // If we can prove we don't have one of the special cases then we can use a
1045     // normal fma instead.
1046     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1047       II.setCalledOperand(Intrinsic::getDeclaration(
1048           II.getModule(), Intrinsic::fma, II.getType()));
1049       return &II;
1050     }
1051     break;
1052   }
1053   case Intrinsic::amdgcn_is_shared:
1054   case Intrinsic::amdgcn_is_private: {
1055     if (isa<UndefValue>(II.getArgOperand(0)))
1056       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1057 
1058     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1059       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1060     break;
1061   }
1062   default: {
1063     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1064             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1065       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1066     }
1067   }
1068   }
1069   return None;
1070 }
1071 
1072 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1073 ///
1074 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1075 ///       struct returns.
1076 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1077                                                     IntrinsicInst &II,
1078                                                     APInt DemandedElts,
1079                                                     int DMaskIdx = -1) {
1080 
1081   auto *IIVTy = cast<FixedVectorType>(II.getType());
1082   unsigned VWidth = IIVTy->getNumElements();
1083   if (VWidth == 1)
1084     return nullptr;
1085 
1086   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1087   IC.Builder.SetInsertPoint(&II);
1088 
1089   // Assume the arguments are unchanged and later override them, if needed.
1090   SmallVector<Value *, 16> Args(II.args());
1091 
1092   if (DMaskIdx < 0) {
1093     // Buffer case.
1094 
1095     const unsigned ActiveBits = DemandedElts.getActiveBits();
1096     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
1097 
1098     // Start assuming the prefix of elements is demanded, but possibly clear
1099     // some other bits if there are trailing zeros (unused components at front)
1100     // and update offset.
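    //
    // For example, if only element 2 of a <4 x float> raw.buffer.load is
    // demanded, the load is narrowed to a single element and 8 bytes (the two
    // skipped f32 components) are added to the offset operand.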
1101     DemandedElts = (1 << ActiveBits) - 1;
1102 
1103     if (UnusedComponentsAtFront > 0) {
1104       static const unsigned InvalidOffsetIdx = 0xf;
1105 
1106       unsigned OffsetIdx;
1107       switch (II.getIntrinsicID()) {
1108       case Intrinsic::amdgcn_raw_buffer_load:
1109         OffsetIdx = 1;
1110         break;
1111       case Intrinsic::amdgcn_s_buffer_load:
1112         // If resulting type is vec3, there is no point in trimming the
1113         // load with updated offset, as the vec3 would most likely be widened to
1114         // vec4 anyway during lowering.
1115         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1116           OffsetIdx = InvalidOffsetIdx;
1117         else
1118           OffsetIdx = 1;
1119         break;
1120       case Intrinsic::amdgcn_struct_buffer_load:
1121         OffsetIdx = 2;
1122         break;
1123       default:
1124         // TODO: handle tbuffer* intrinsics.
1125         OffsetIdx = InvalidOffsetIdx;
1126         break;
1127       }
1128 
1129       if (OffsetIdx != InvalidOffsetIdx) {
1130         // Clear demanded bits and update the offset.
1131         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1132         auto *Offset = II.getArgOperand(OffsetIdx);
1133         unsigned SingleComponentSizeInBits =
1134             IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
1135         unsigned OffsetAdd =
1136             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1137         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1138         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1139       }
1140     }
1141   } else {
1142     // Image case.
1143 
1144     ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
1145     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1146 
1147     // Mask off values that are undefined because the dmask doesn't cover them
1148     DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
1149 
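    // Rebuild the dmask so it only requests channels whose corresponding
    // result element is demanded. For example, dmask 0b1011 with only the
    // second returned element demanded becomes dmask 0b0010.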
1150     unsigned NewDMaskVal = 0;
1151     unsigned OrigLoadIdx = 0;
1152     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1153       const unsigned Bit = 1 << SrcIdx;
1154       if (!!(DMaskVal & Bit)) {
1155         if (!!DemandedElts[OrigLoadIdx])
1156           NewDMaskVal |= Bit;
1157         OrigLoadIdx++;
1158       }
1159     }
1160 
1161     if (DMaskVal != NewDMaskVal)
1162       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1163   }
1164 
1165   unsigned NewNumElts = DemandedElts.countPopulation();
1166   if (!NewNumElts)
1167     return UndefValue::get(II.getType());
1168 
1169   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1170     if (DMaskIdx >= 0)
1171       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1172     return nullptr;
1173   }
1174 
1175   // Validate function argument and return types, extracting overloaded types
1176   // along the way.
1177   SmallVector<Type *, 6> OverloadTys;
1178   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1179     return nullptr;
1180 
1181   Module *M = II.getParent()->getParent()->getParent();
1182   Type *EltTy = IIVTy->getElementType();
1183   Type *NewTy =
1184       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1185 
1186   OverloadTys[0] = NewTy;
1187   Function *NewIntrin =
1188       Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
1189 
1190   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1191   NewCall->takeName(&II);
1192   NewCall->copyMetadata(II);
1193 
1194   if (NewNumElts == 1) {
1195     return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
1196                                           NewCall,
1197                                           DemandedElts.countTrailingZeros());
1198   }
1199 
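  // Build a shuffle that places each demanded element of the narrowed load at
  // its original position; undemanded positions use index NewNumElts, which
  // selects an undefined lane.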
1200   SmallVector<int, 8> EltMask;
1201   unsigned NewLoadIdx = 0;
1202   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1203     if (!!DemandedElts[OrigLoadIdx])
1204       EltMask.push_back(NewLoadIdx++);
1205     else
1206       EltMask.push_back(NewNumElts);
1207   }
1208 
1209   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1210 
1211   return Shuffle;
1212 }
1213 
1214 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1215     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1216     APInt &UndefElts2, APInt &UndefElts3,
1217     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1218         SimplifyAndSetOp) const {
1219   switch (II.getIntrinsicID()) {
1220   case Intrinsic::amdgcn_buffer_load:
1221   case Intrinsic::amdgcn_buffer_load_format:
1222   case Intrinsic::amdgcn_raw_buffer_load:
1223   case Intrinsic::amdgcn_raw_buffer_load_format:
1224   case Intrinsic::amdgcn_raw_tbuffer_load:
1225   case Intrinsic::amdgcn_s_buffer_load:
1226   case Intrinsic::amdgcn_struct_buffer_load:
1227   case Intrinsic::amdgcn_struct_buffer_load_format:
1228   case Intrinsic::amdgcn_struct_tbuffer_load:
1229   case Intrinsic::amdgcn_tbuffer_load:
1230     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1231   default: {
1232     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
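      // Image intrinsics take the dmask as operand 0.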
1233       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1234     }
1235     break;
1236   }
1237   }
1238   return None;
1239 }
1240