xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (revision a58541f14d2d3e7d65f2e9b341bf2321b312c615)
1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements AMDGPU-specific InstCombine transformations: it folds
11 // and simplifies calls to AMDGPU intrinsics using detailed target and
12 // subtarget information, while leaving generic combines to the
13 // target-independent InstCombine infrastructure.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 
24 using namespace llvm;
25 
26 #define DEBUG_TYPE "AMDGPUtti"
27 
28 namespace {
29 
30 struct AMDGPUImageDMaskIntrinsic {
31   unsigned Intr;
32 };
33 
34 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
35 #include "InstCombineTables.inc"
36 
37 } // end anonymous namespace
38 
39 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
40 //
41 // A single NaN input is folded to minnum, so we rely on that folding for
42 // handling NaNs.
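//
// For example, fmed3(1.0, 4.0, 2.0): the maximum of the three inputs is
// 4.0 == Src1, so the median is maxnum(Src0, Src2) = 2.0.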
43 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
44                            const APFloat &Src2) {
45   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
46 
47   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
48   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
49   if (Cmp0 == APFloat::cmpEqual)
50     return maxnum(Src1, Src2);
51 
52   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
53   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
54   if (Cmp1 == APFloat::cmpEqual)
55     return maxnum(Src0, Src2);
56 
57   return maxnum(Src0, Src1);
58 }
59 
60 // Check if a value can be converted to a 16-bit value without losing
61 // precision.
62 // The value is expected to be either a float (IsFloat = true) or an unsigned
63 // integer (IsFloat = false).
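//
// For example, a float produced by 'fpext half %x to float', an i32 produced
// by 'zext i16 %x to i32', or a small enough constant can be narrowed back to
// 16 bits without loss.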
64 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
65   Type *VTy = V.getType();
66   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
67     // The value is already 16-bit, so we don't want to convert to 16-bit again!
68     return false;
69   }
70   if (IsFloat) {
71     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
72       // We need to check that casting the value down to a half does not
73       // lose precision.
74       APFloat FloatValue(ConstFloat->getValueAPF());
75       bool LosesInfo = true;
76       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
77                          &LosesInfo);
78       return !LosesInfo;
79     }
80   } else {
81     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
82       // We need to check that casting the value down to an i16 does not
83       // lose precision.
84       APInt IntValue(ConstInt->getValue());
85       return IntValue.getActiveBits() <= 16;
86     }
87   }
88 
89   Value *CastSrc;
90   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
91                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
92   if (IsExt) {
93     Type *CastSrcTy = CastSrc->getType();
94     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
95       return true;
96   }
97 
98   return false;
99 }
100 
101 // Convert a value to 16-bit.
102 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
103   Type *VTy = V.getType();
104   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
105     return cast<Instruction>(&V)->getOperand(0);
106   if (VTy->isIntegerTy())
107     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
108   if (VTy->isFloatingPointTy())
109     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
110 
111   llvm_unreachable("Should never be called!");
112 }
113 
114 /// Applies Func to OldIntr's arguments and overloaded types, creates a call
115 /// to the intrinsic NewIntr with the modified arguments (based on OldIntr),
116 /// and replaces InstToReplace with the newly created call.
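///
/// For example, the _L -> _LZ rewrite below passes a lambda that simply erases
/// the lod argument from Args before the new call is created.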
117 static Optional<Instruction *> modifyIntrinsicCall(
118     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
119     InstCombiner &IC,
120     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
121         Func) {
122   SmallVector<Type *, 4> ArgTys;
123   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
124     return None;
125 
126   SmallVector<Value *, 8> Args(OldIntr.args());
127 
128   // Modify arguments and types
129   Func(Args, ArgTys);
130 
131   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
132 
133   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
134   NewCall->takeName(&OldIntr);
135   NewCall->copyMetadata(OldIntr);
136   if (isa<FPMathOperator>(NewCall))
137     NewCall->copyFastMathFlags(&OldIntr);
138 
139   // Erase and replace uses
140   if (!InstToReplace.getType()->isVoidTy())
141     IC.replaceInstUsesWith(InstToReplace, NewCall);
142 
143   bool RemoveOldIntr = &OldIntr != &InstToReplace;
144 
145   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
146   if (RemoveOldIntr)
147     IC.eraseInstFromFunction(OldIntr);
148 
149   return RetValue;
150 }
151 
152 static Optional<Instruction *>
153 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
154                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
155                              IntrinsicInst &II, InstCombiner &IC) {
156   // Optimize _L to _LZ when the lod argument is a constant that is zero or negative
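  //
  // For example (illustrative, overload suffixes omitted), a call such as
  //   llvm.amdgcn.image.sample.l.2d(..., lod = 0.0, ...)
  // becomes the corresponding llvm.amdgcn.image.sample.lz.2d call with the lod
  // operand dropped.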
157   if (const auto *LZMappingInfo =
158           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
159     if (auto *ConstantLod =
160             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
161       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
162         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
163             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
164                                                      ImageDimIntr->Dim);
165         return modifyIntrinsicCall(
166             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
167               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
168             });
169       }
170     }
171   }
172 
173   // Optimize _mip away when the constant mip level ('lod') is zero
174   if (const auto *MIPMappingInfo =
175           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
176     if (auto *ConstantMip =
177             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
178       if (ConstantMip->isZero()) {
179         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
180             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
181                                                      ImageDimIntr->Dim);
182         return modifyIntrinsicCall(
183             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
184               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
185             });
186       }
187     }
188   }
189 
190   // Optimize _bias away when 'bias' is zero
191   if (const auto *BiasMappingInfo =
192           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
193     if (auto *ConstantBias =
194             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
195       if (ConstantBias->isZero()) {
196         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
197             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
198                                                      ImageDimIntr->Dim);
199         return modifyIntrinsicCall(
200             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
201               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
202               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
203             });
204       }
205     }
206   }
207 
208   // Optimize _offset away when 'offset' is zero
209   if (const auto *OffsetMappingInfo =
210           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
211     if (auto *ConstantOffset =
212             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
213       if (ConstantOffset->isZero()) {
214         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
215             AMDGPU::getImageDimIntrinsicByBaseOpcode(
216                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
217         return modifyIntrinsicCall(
218             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
219               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
220             });
221       }
222     }
223   }
224 
225   // Try to use D16
226   if (ST->hasD16Images()) {
227 
228     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
229         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
230 
231     if (BaseOpcode->HasD16) {
232 
233       // If the only use of the image intrinsic is an fptrunc to half, then
234       // both the fptrunc and the image intrinsic are replaced with a D16
235       // variant of the image intrinsic that returns half directly.
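      //
      // Roughly (overload suffixes omitted):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes a single call that returns <4 x half> directly.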
236       if (II.hasOneUse()) {
237         Instruction *User = II.user_back();
238 
239         if (User->getOpcode() == Instruction::FPTrunc &&
240             User->getType()->getScalarType()->isHalfTy()) {
241 
242           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
243                                      [&](auto &Args, auto &ArgTys) {
244                                        // Change return type of image intrinsic.
245                                        // Set it to return type of fptrunc.
246                                        ArgTys[0] = User->getType();
247                                      });
248         }
249       }
250     }
251   }
252 
253   // Try to use A16 or G16
254   if (!ST->hasA16() && !ST->hasG16())
255     return None;
256 
257   // Address is interpreted as float if the instruction has a sampler or as
258   // unsigned int if there is no sampler.
259   bool HasSampler =
260       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
261   bool FloatCoord = false;
262   // True means derivatives can be converted to 16 bit, but coordinates cannot
263   bool OnlyDerivatives = false;
264 
265   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
266        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
267     Value *Coord = II.getOperand(OperandIndex);
268     // If the values are not derived from 16-bit values, we cannot optimize.
269     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
270       if (OperandIndex < ImageDimIntr->CoordStart ||
271           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
272         return None;
273       }
274       // All gradients can be converted, so convert only them
275       OnlyDerivatives = true;
276       break;
277     }
278 
279     assert(OperandIndex == ImageDimIntr->GradientStart ||
280            FloatCoord == Coord->getType()->isFloatingPointTy());
281     FloatCoord = Coord->getType()->isFloatingPointTy();
282   }
283 
284   if (!OnlyDerivatives && !ST->hasA16())
285     OnlyDerivatives = true; // Only supports G16
286 
287   // Check if there is a bias parameter and if it can be converted to f16
288   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
289     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
290     assert(HasSampler &&
291            "Only image instructions with a sampler can have a bias");
292     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
293       OnlyDerivatives = true;
294   }
295 
296   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
297                                                ImageDimIntr->CoordStart))
298     return None;
299 
300   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
301                                : Type::getInt16Ty(II.getContext());
302 
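  // Rewrite the intrinsic so the gradient (and, unless OnlyDerivatives, the
  // coordinate and bias) operands use 16-bit types, converting each affected
  // operand in place.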
303   return modifyIntrinsicCall(
304       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
305         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
306         if (!OnlyDerivatives) {
307           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
308 
309           // Change the bias type
310           if (ImageDimIntr->NumBiasArgs != 0)
311             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
312         }
313 
314         unsigned EndIndex =
315             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
316         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
317              OperandIndex < EndIndex; OperandIndex++) {
318           Args[OperandIndex] =
319               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
320         }
321 
322         // Convert the bias
323         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
324           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
325           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
326         }
327       });
328 }
329 
330 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
331                                            InstCombiner &IC) const {
332   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
333   // infinity, gives +0.0. If we can prove we don't have one of the special
334   // cases then we can use a normal multiply instead.
335   // TODO: Create and use isKnownFiniteNonZero instead of just matching
336   // constants here.
337   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
338       match(Op1, PatternMatch::m_FiniteNonZero())) {
339     // One operand is not zero or infinity or NaN.
340     return true;
341   }
342   auto *TLI = &IC.getTargetLibraryInfo();
343   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
344       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
345     // Neither operand is infinity or NaN.
346     return true;
347   }
348   return false;
349 }
350 
351 Optional<Instruction *>
352 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
353   Intrinsic::ID IID = II.getIntrinsicID();
354   switch (IID) {
355   case Intrinsic::amdgcn_rcp: {
356     Value *Src = II.getArgOperand(0);
357 
358     // TODO: Move to ConstantFolding/InstSimplify?
359     if (isa<UndefValue>(Src)) {
360       Type *Ty = II.getType();
361       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
362       return IC.replaceInstUsesWith(II, QNaN);
363     }
364 
365     if (II.isStrictFP())
366       break;
367 
368     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
369       const APFloat &ArgVal = C->getValueAPF();
370       APFloat Val(ArgVal.getSemantics(), 1);
371       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
372 
373       // This may be more precise than the result the instruction would give.
374       //
375       // TODO: The instruction always flushes denormal results (except for f16),
376       // should this also?
377       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
378     }
379 
380     break;
381   }
382   case Intrinsic::amdgcn_sqrt:
383   case Intrinsic::amdgcn_rsq: {
384     Value *Src = II.getArgOperand(0);
385 
386     // TODO: Move to ConstantFolding/InstSimplify?
387     if (isa<UndefValue>(Src)) {
388       Type *Ty = II.getType();
389       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
390       return IC.replaceInstUsesWith(II, QNaN);
391     }
392 
393     break;
394   }
395   case Intrinsic::amdgcn_frexp_mant:
396   case Intrinsic::amdgcn_frexp_exp: {
397     Value *Src = II.getArgOperand(0);
398     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
399       int Exp;
400       APFloat Significand =
401           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
402 
403       if (IID == Intrinsic::amdgcn_frexp_mant) {
404         return IC.replaceInstUsesWith(
405             II, ConstantFP::get(II.getContext(), Significand));
406       }
407 
408       // Match instruction special case behavior.
409       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
410         Exp = 0;
411 
412       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
413     }
414 
415     if (isa<UndefValue>(Src)) {
416       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
417     }
418 
419     break;
420   }
421   case Intrinsic::amdgcn_class: {
422     Value *Src0 = II.getArgOperand(0);
423     Value *Src1 = II.getArgOperand(1);
424     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
425     if (!CMask) {
426       if (isa<UndefValue>(Src0)) {
427         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
428       }
429 
430       if (isa<UndefValue>(Src1)) {
431         return IC.replaceInstUsesWith(II,
432                                       ConstantInt::get(II.getType(), false));
433       }
434       break;
435     }
436 
437     uint32_t Mask = CMask->getZExtValue();
438 
439     // If every class is tested, the result is true regardless of the value.
440     if ((Mask & fcAllFlags) == fcAllFlags) {
441       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
442     }
443 
444     if ((Mask & fcAllFlags) == 0) {
445       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
446     }
447 
448     if (Mask == fcNan && !II.isStrictFP()) {
449       // Equivalent of isnan. Replace with standard fcmp.
450       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
451       FCmp->takeName(&II);
452       return IC.replaceInstUsesWith(II, FCmp);
453     }
454 
455     if (Mask == fcZero && !II.isStrictFP()) {
456       // Equivalent of == 0.
457       Value *FCmp =
458           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
459 
460       FCmp->takeName(&II);
461       return IC.replaceInstUsesWith(II, FCmp);
462     }
463 
464     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
465     if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
466       return IC.replaceOperand(
467           II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
468     }
469 
470     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
471     if (!CVal) {
472       if (isa<UndefValue>(Src0)) {
473         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
474       }
475 
476       // Clamp mask to used bits
477       if ((Mask & fcAllFlags) != Mask) {
478         CallInst *NewCall = IC.Builder.CreateCall(
479             II.getCalledFunction(),
480             {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});
481 
482         NewCall->takeName(&II);
483         return IC.replaceInstUsesWith(II, NewCall);
484       }
485 
486       break;
487     }
488 
489     const APFloat &Val = CVal->getValueAPF();
490 
491     bool Result =
492         ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
493         ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
494         ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
495         ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
496         ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
497         ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
498         ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
499         ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
500         ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
501         ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());
502 
503     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
504   }
505   case Intrinsic::amdgcn_cvt_pkrtz: {
506     Value *Src0 = II.getArgOperand(0);
507     Value *Src1 = II.getArgOperand(1);
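    // With two constant operands, fold to the packed <2 x half> constant,
    // converting each value with round-toward-zero as the conversion itself
    // does.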
508     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
509       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
510         const fltSemantics &HalfSem =
511             II.getType()->getScalarType()->getFltSemantics();
512         bool LosesInfo;
513         APFloat Val0 = C0->getValueAPF();
514         APFloat Val1 = C1->getValueAPF();
515         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
516         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
517 
518         Constant *Folded =
519             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
520                                  ConstantFP::get(II.getContext(), Val1)});
521         return IC.replaceInstUsesWith(II, Folded);
522       }
523     }
524 
525     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
526       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
527     }
528 
529     break;
530   }
531   case Intrinsic::amdgcn_cvt_pknorm_i16:
532   case Intrinsic::amdgcn_cvt_pknorm_u16:
533   case Intrinsic::amdgcn_cvt_pk_i16:
534   case Intrinsic::amdgcn_cvt_pk_u16: {
535     Value *Src0 = II.getArgOperand(0);
536     Value *Src1 = II.getArgOperand(1);
537 
538     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
539       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
540     }
541 
542     break;
543   }
544   case Intrinsic::amdgcn_ubfe:
545   case Intrinsic::amdgcn_sbfe: {
546     // Decompose simple cases into standard shifts.
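    // For example, for i32: ubfe(x, 8, 8) -> lshr (shl x, 16), 24 and
    // sbfe(x, 8, 8) -> ashr (shl x, 16), 24.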
547     Value *Src = II.getArgOperand(0);
548     if (isa<UndefValue>(Src)) {
549       return IC.replaceInstUsesWith(II, Src);
550     }
551 
552     unsigned Width;
553     Type *Ty = II.getType();
554     unsigned IntSize = Ty->getIntegerBitWidth();
555 
556     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
557     if (CWidth) {
558       Width = CWidth->getZExtValue();
559       if ((Width & (IntSize - 1)) == 0) {
560         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
561       }
562 
563       // Hardware ignores high bits, so remove those.
564       if (Width >= IntSize) {
565         return IC.replaceOperand(
566             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
567       }
568     }
569 
570     unsigned Offset;
571     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
572     if (COffset) {
573       Offset = COffset->getZExtValue();
574       if (Offset >= IntSize) {
575         return IC.replaceOperand(
576             II, 1,
577             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
578       }
579     }
580 
581     bool Signed = IID == Intrinsic::amdgcn_sbfe;
582 
583     if (!CWidth || !COffset)
584       break;
585 
586     // The case of Width == 0 is handled above, which makes this transformation
587     // safe.  If Width == 0, the ashr and lshr instructions below would produce
588     // poison, since the shift amount would equal the bit size.
589     assert(Width != 0);
590 
591     // TODO: This allows folding to undef when the hardware has specific
592     // behavior?
593     if (Offset + Width < IntSize) {
594       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
595       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
596                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
597       RightShift->takeName(&II);
598       return IC.replaceInstUsesWith(II, RightShift);
599     }
600 
601     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
602                                : IC.Builder.CreateLShr(Src, Offset);
603 
604     RightShift->takeName(&II);
605     return IC.replaceInstUsesWith(II, RightShift);
606   }
607   case Intrinsic::amdgcn_exp:
608   case Intrinsic::amdgcn_exp_row:
609   case Intrinsic::amdgcn_exp_compr: {
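    // The 'en' operand selects which source components are actually exported;
    // sources for disabled components can be replaced with undef so they can
    // be dead-code eliminated.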
610     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
611     unsigned EnBits = En->getZExtValue();
612     if (EnBits == 0xf)
613       break; // All inputs enabled.
614 
615     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
616     bool Changed = false;
617     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
618       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
619           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
620         Value *Src = II.getArgOperand(I + 2);
621         if (!isa<UndefValue>(Src)) {
622           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
623           Changed = true;
624         }
625       }
626     }
627 
628     if (Changed) {
629       return &II;
630     }
631 
632     break;
633   }
634   case Intrinsic::amdgcn_fmed3: {
635     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
636     // for the shader.
637 
638     Value *Src0 = II.getArgOperand(0);
639     Value *Src1 = II.getArgOperand(1);
640     Value *Src2 = II.getArgOperand(2);
641 
642     // Checking for NaN before canonicalization provides better fidelity when
643     // mapping other operations onto fmed3 since the order of operands is
644     // unchanged.
645     CallInst *NewCall = nullptr;
646     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
647       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
648     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
649       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
650     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
651       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
652     }
653 
654     if (NewCall) {
655       NewCall->copyFastMathFlags(&II);
656       NewCall->takeName(&II);
657       return IC.replaceInstUsesWith(II, NewCall);
658     }
659 
660     bool Swap = false;
661     // Canonicalize constants to RHS operands.
662     //
663     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
664     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
665       std::swap(Src0, Src1);
666       Swap = true;
667     }
668 
669     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
670       std::swap(Src1, Src2);
671       Swap = true;
672     }
673 
674     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
675       std::swap(Src0, Src1);
676       Swap = true;
677     }
678 
679     if (Swap) {
680       II.setArgOperand(0, Src0);
681       II.setArgOperand(1, Src1);
682       II.setArgOperand(2, Src2);
683       return &II;
684     }
685 
686     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
687       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
688         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
689           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
690                                        C2->getValueAPF());
691           return IC.replaceInstUsesWith(
692               II, ConstantFP::get(IC.Builder.getContext(), Result));
693         }
694       }
695     }
696 
697     break;
698   }
699   case Intrinsic::amdgcn_icmp:
700   case Intrinsic::amdgcn_fcmp: {
701     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
702     // Guard against invalid arguments.
703     int64_t CCVal = CC->getZExtValue();
704     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
705     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
706                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
707         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
708                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
709       break;
710 
711     Value *Src0 = II.getArgOperand(0);
712     Value *Src1 = II.getArgOperand(1);
713 
714     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
715       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
716         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
717         if (CCmp->isNullValue()) {
718           return IC.replaceInstUsesWith(
719               II, ConstantExpr::getSExt(CCmp, II.getType()));
720         }
721 
722         // The result of V_ICMP/V_FCMP assembly instructions (which this
723         // intrinsic exposes) is one bit per thread, masked with the EXEC
724         // register (which contains the bitmask of live threads). So a
725         // comparison that always returns true is the same as a read of the
726         // EXEC register.
727         Function *NewF = Intrinsic::getDeclaration(
728             II.getModule(), Intrinsic::read_register, II.getType());
729         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
730         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
731         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
732         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
733         NewCall->addFnAttr(Attribute::Convergent);
734         NewCall->takeName(&II);
735         return IC.replaceInstUsesWith(II, NewCall);
736       }
737 
738       // Canonicalize constants to RHS.
739       CmpInst::Predicate SwapPred =
740           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
741       II.setArgOperand(0, Src1);
742       II.setArgOperand(1, Src0);
743       II.setArgOperand(
744           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
745       return &II;
746     }
747 
748     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
749       break;
750 
751     // Canonicalize compare eq with true value to compare != 0
752     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
753     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
754     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
755     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
756     Value *ExtSrc;
757     if (CCVal == CmpInst::ICMP_EQ &&
758         ((match(Src1, PatternMatch::m_One()) &&
759           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
760          (match(Src1, PatternMatch::m_AllOnes()) &&
761           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
762         ExtSrc->getType()->isIntegerTy(1)) {
763       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
764       IC.replaceOperand(II, 2,
765                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
766       return &II;
767     }
768 
769     CmpInst::Predicate SrcPred;
770     Value *SrcLHS;
771     Value *SrcRHS;
772 
773     // Fold compare eq/ne with 0 from a compare result as the predicate to the
774     // intrinsic. The typical use is a wave vote function in the library, which
775     // will be fed from a user code condition compared with 0. Fold in the
776     // redundant compare.
777 
778     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
779     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
780     //
781     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
782     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
783     if (match(Src1, PatternMatch::m_Zero()) &&
784         match(Src0, PatternMatch::m_ZExtOrSExt(
785                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
786                               PatternMatch::m_Value(SrcRHS))))) {
787       if (CCVal == CmpInst::ICMP_EQ)
788         SrcPred = CmpInst::getInversePredicate(SrcPred);
789 
790       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
791                                  ? Intrinsic::amdgcn_fcmp
792                                  : Intrinsic::amdgcn_icmp;
793 
794       Type *Ty = SrcLHS->getType();
795       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
796         // Promote to next legal integer type.
797         unsigned Width = CmpType->getBitWidth();
798         unsigned NewWidth = Width;
799 
800         // Don't do anything for i1 comparisons.
801         if (Width == 1)
802           break;
803 
804         if (Width <= 16)
805           NewWidth = 16;
806         else if (Width <= 32)
807           NewWidth = 32;
808         else if (Width <= 64)
809           NewWidth = 64;
810         else if (Width > 64)
811           break; // Can't handle this.
812 
813         if (Width != NewWidth) {
814           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
815           if (CmpInst::isSigned(SrcPred)) {
816             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
817             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
818           } else {
819             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
820             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
821           }
822         }
823       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
824         break;
825 
826       Function *NewF = Intrinsic::getDeclaration(
827           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
828       Value *Args[] = {SrcLHS, SrcRHS,
829                        ConstantInt::get(CC->getType(), SrcPred)};
830       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
831       NewCall->takeName(&II);
832       return IC.replaceInstUsesWith(II, NewCall);
833     }
834 
835     break;
836   }
837   case Intrinsic::amdgcn_ballot: {
838     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
839       if (Src->isZero()) {
840         // amdgcn.ballot(i1 0) is zero.
841         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
842       }
843 
844       if (Src->isOne()) {
845         // amdgcn.ballot(i1 1) is exec.
846         const char *RegName = "exec";
847         if (II.getType()->isIntegerTy(32))
848           RegName = "exec_lo";
849         else if (!II.getType()->isIntegerTy(64))
850           break;
851 
852         Function *NewF = Intrinsic::getDeclaration(
853             II.getModule(), Intrinsic::read_register, II.getType());
854         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
855         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
856         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
857         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
858         NewCall->addFnAttr(Attribute::Convergent);
859         NewCall->takeName(&II);
860         return IC.replaceInstUsesWith(II, NewCall);
861       }
862     }
863     break;
864   }
865   case Intrinsic::amdgcn_wqm_vote: {
866     // wqm_vote is identity when the argument is constant.
867     if (!isa<Constant>(II.getArgOperand(0)))
868       break;
869 
870     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
871   }
872   case Intrinsic::amdgcn_kill: {
873     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
874     if (!C || !C->getZExtValue())
875       break;
876 
877     // amdgcn.kill(i1 1) is a no-op
878     return IC.eraseInstFromFunction(II);
879   }
880   case Intrinsic::amdgcn_update_dpp: {
881     Value *Old = II.getArgOperand(0);
882 
883     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
884     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
885     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
886     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
887         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
888       break;
889 
890     // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
891     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
892   }
893   case Intrinsic::amdgcn_permlane16:
894   case Intrinsic::amdgcn_permlanex16: {
895     // Discard vdst_in if it's not going to be read.
896     Value *VDstIn = II.getArgOperand(0);
897     if (isa<UndefValue>(VDstIn))
898       break;
899 
900     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
901     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
902     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
903       break;
904 
905     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
906   }
907   case Intrinsic::amdgcn_permlane64:
908     // A constant value is trivially uniform.
909     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
910       return IC.replaceInstUsesWith(II, C);
911     }
912     break;
913   case Intrinsic::amdgcn_readfirstlane:
914   case Intrinsic::amdgcn_readlane: {
915     // A constant value is trivially uniform.
916     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
917       return IC.replaceInstUsesWith(II, C);
918     }
919 
920     // The remaining folds may not be safe if EXEC is not guaranteed to be the
921     // same between the def and the use.
922     Value *Src = II.getArgOperand(0);
923     Instruction *SrcInst = dyn_cast<Instruction>(Src);
924     if (SrcInst && SrcInst->getParent() != II.getParent())
925       break;
926 
927     // readfirstlane (readfirstlane x) -> readfirstlane x
928     // readlane (readfirstlane x), y -> readfirstlane x
929     if (match(Src,
930               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
931       return IC.replaceInstUsesWith(II, Src);
932     }
933 
934     if (IID == Intrinsic::amdgcn_readfirstlane) {
935       // readfirstlane (readlane x, y) -> readlane x, y
936       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
937         return IC.replaceInstUsesWith(II, Src);
938       }
939     } else {
940       // readlane (readlane x, y), y -> readlane x, y
941       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
942                          PatternMatch::m_Value(),
943                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
944         return IC.replaceInstUsesWith(II, Src);
945       }
946     }
947 
948     break;
949   }
950   case Intrinsic::amdgcn_ldexp: {
951     // FIXME: This doesn't introduce new instructions and belongs in
952     // InstructionSimplify.
953     Type *Ty = II.getType();
954     Value *Op0 = II.getArgOperand(0);
955     Value *Op1 = II.getArgOperand(1);
956 
957     // Folding undef to qnan is safe regardless of the FP mode.
958     if (isa<UndefValue>(Op0)) {
959       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
960       return IC.replaceInstUsesWith(II, QNaN);
961     }
962 
963     const APFloat *C = nullptr;
964     match(Op0, PatternMatch::m_APFloat(C));
965 
966     // FIXME: Should flush denorms depending on FP mode, but that's ignored
967     // everywhere else.
968     //
969     // These cases should be safe, even with strictfp.
970     // ldexp(0.0, x) -> 0.0
971     // ldexp(-0.0, x) -> -0.0
972     // ldexp(inf, x) -> inf
973     // ldexp(-inf, x) -> -inf
974     if (C && (C->isZero() || C->isInfinity())) {
975       return IC.replaceInstUsesWith(II, Op0);
976     }
977 
978     // With strictfp, be more careful about possibly needing to flush denormals
979     // or not, and snan behavior depends on ieee_mode.
980     if (II.isStrictFP())
981       break;
982 
983     if (C && C->isNaN()) {
984       // FIXME: We just need to make the NaN quiet here, but that operation is
985       // unavailable on APFloat; it only exists on IEEEFloat.
986       auto *Quieted =
987           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
988       return IC.replaceInstUsesWith(II, Quieted);
989     }
990 
991     // ldexp(x, 0) -> x
992     // ldexp(x, undef) -> x
993     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
994       return IC.replaceInstUsesWith(II, Op0);
995     }
996 
997     break;
998   }
999   case Intrinsic::amdgcn_fmul_legacy: {
1000     Value *Op0 = II.getArgOperand(0);
1001     Value *Op1 = II.getArgOperand(1);
1002 
1003     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1004     // infinity, gives +0.0.
1005     // TODO: Move to InstSimplify?
1006     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1007         match(Op1, PatternMatch::m_AnyZeroFP()))
1008       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
1009 
1010     // If we can prove we don't have one of the special cases then we can use a
1011     // normal fmul instruction instead.
1012     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1013       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1014       FMul->takeName(&II);
1015       return IC.replaceInstUsesWith(II, FMul);
1016     }
1017     break;
1018   }
1019   case Intrinsic::amdgcn_fma_legacy: {
1020     Value *Op0 = II.getArgOperand(0);
1021     Value *Op1 = II.getArgOperand(1);
1022     Value *Op2 = II.getArgOperand(2);
1023 
1024     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1025     // infinity, gives +0.0.
1026     // TODO: Move to InstSimplify?
1027     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1028         match(Op1, PatternMatch::m_AnyZeroFP())) {
1029       // It's tempting to just return Op2 here, but that would give the wrong
1030       // result if Op2 was -0.0.
1031       auto *Zero = ConstantFP::getNullValue(II.getType());
1032       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1033       FAdd->takeName(&II);
1034       return IC.replaceInstUsesWith(II, FAdd);
1035     }
1036 
1037     // If we can prove we don't have one of the special cases then we can use a
1038     // normal fma instead.
1039     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1040       II.setCalledOperand(Intrinsic::getDeclaration(
1041           II.getModule(), Intrinsic::fma, II.getType()));
1042       return &II;
1043     }
1044     break;
1045   }
1046   case Intrinsic::amdgcn_is_shared:
1047   case Intrinsic::amdgcn_is_private: {
1048     if (isa<UndefValue>(II.getArgOperand(0)))
1049       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1050 
1051     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1052       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1053     break;
1054   }
1055   default: {
1056     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1057             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1058       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1059     }
1060   }
1061   }
1062   return None;
1063 }
1064 
1065 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1066 ///
1067 /// Note: This only supports non-TFE/LWE image intrinsic calls; the TFE/LWE
1068 ///       variants have struct returns and are not handled here.
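///
/// For example (roughly), if only the first two elements of a <4 x float>
/// llvm.amdgcn.raw.buffer.load are demanded, the call is rewritten to return
/// <2 x float> and the result is shuffled back to the original width, leaving
/// the dropped lanes undefined.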
1069 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1070                                                     IntrinsicInst &II,
1071                                                     APInt DemandedElts,
1072                                                     int DMaskIdx = -1) {
1073 
1074   auto *IIVTy = cast<FixedVectorType>(II.getType());
1075   unsigned VWidth = IIVTy->getNumElements();
1076   if (VWidth == 1)
1077     return nullptr;
1078 
1079   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1080   IC.Builder.SetInsertPoint(&II);
1081 
1082   // Assume the arguments are unchanged and later override them, if needed.
1083   SmallVector<Value *, 16> Args(II.args());
1084 
1085   if (DMaskIdx < 0) {
1086     // Buffer case.
1087 
1088     const unsigned ActiveBits = DemandedElts.getActiveBits();
1089     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
1090 
1091     // Start by assuming every element up to the highest demanded one is needed,
1092     // then clear the bits for any unused components at the front (trailing
1093     // zeros in the demanded mask) and fold them into the load offset instead.
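    //
    // For example, for a <4 x float> load with DemandedElts = 0b1100, the two
    // unused components at the front are folded into the offset: the load
    // becomes <2 x float> and the byte offset is increased by 2 * sizeof(float) = 8.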
1094     DemandedElts = (1 << ActiveBits) - 1;
1095 
1096     if (UnusedComponentsAtFront > 0) {
1097       static const unsigned InvalidOffsetIdx = 0xf;
1098 
1099       unsigned OffsetIdx;
1100       switch (II.getIntrinsicID()) {
1101       case Intrinsic::amdgcn_raw_buffer_load:
1102         OffsetIdx = 1;
1103         break;
1104       case Intrinsic::amdgcn_s_buffer_load:
1105         // If the resulting type would be vec3, there is no point in trimming
1106         // the load and updating the offset, as the vec3 would most likely be
1107         // widened back to vec4 during lowering.
1108         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1109           OffsetIdx = InvalidOffsetIdx;
1110         else
1111           OffsetIdx = 1;
1112         break;
1113       case Intrinsic::amdgcn_struct_buffer_load:
1114         OffsetIdx = 2;
1115         break;
1116       default:
1117         // TODO: handle tbuffer* intrinsics.
1118         OffsetIdx = InvalidOffsetIdx;
1119         break;
1120       }
1121 
1122       if (OffsetIdx != InvalidOffsetIdx) {
1123         // Clear demanded bits and update the offset.
1124         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1125         auto *Offset = II.getArgOperand(OffsetIdx);
1126         unsigned SingleComponentSizeInBits =
1127             IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
1128         unsigned OffsetAdd =
1129             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1130         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1131         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1132       }
1133     }
1134   } else {
1135     // Image case.
1136 
1137     ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
1138     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1139 
1140     // Mask off values that are undefined because the dmask doesn't cover them
1141     DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
1142 
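    // For example, with dmask = 0b1011 the load returns three components; if
    // only the result element that came from dmask bit 3 is demanded, the new
    // dmask is 0b1000.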
1143     unsigned NewDMaskVal = 0;
1144     unsigned OrigLoadIdx = 0;
1145     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1146       const unsigned Bit = 1 << SrcIdx;
1147       if (!!(DMaskVal & Bit)) {
1148         if (!!DemandedElts[OrigLoadIdx])
1149           NewDMaskVal |= Bit;
1150         OrigLoadIdx++;
1151       }
1152     }
1153 
1154     if (DMaskVal != NewDMaskVal)
1155       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1156   }
1157 
1158   unsigned NewNumElts = DemandedElts.countPopulation();
1159   if (!NewNumElts)
1160     return UndefValue::get(II.getType());
1161 
1162   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1163     if (DMaskIdx >= 0)
1164       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1165     return nullptr;
1166   }
1167 
1168   // Validate function argument and return types, extracting overloaded types
1169   // along the way.
1170   SmallVector<Type *, 6> OverloadTys;
1171   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1172     return nullptr;
1173 
1174   Module *M = II.getParent()->getParent()->getParent();
1175   Type *EltTy = IIVTy->getElementType();
1176   Type *NewTy =
1177       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1178 
1179   OverloadTys[0] = NewTy;
1180   Function *NewIntrin =
1181       Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
1182 
1183   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1184   NewCall->takeName(&II);
1185   NewCall->copyMetadata(II);
1186 
1187   if (NewNumElts == 1) {
1188     return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
1189                                           NewCall,
1190                                           DemandedElts.countTrailingZeros());
1191   }
1192 
1193   SmallVector<int, 8> EltMask;
1194   unsigned NewLoadIdx = 0;
1195   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1196     if (!!DemandedElts[OrigLoadIdx])
1197       EltMask.push_back(NewLoadIdx++);
1198     else
1199       EltMask.push_back(NewNumElts);
1200   }
1201 
1202   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1203 
1204   return Shuffle;
1205 }
1206 
1207 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1208     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1209     APInt &UndefElts2, APInt &UndefElts3,
1210     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1211         SimplifyAndSetOp) const {
1212   switch (II.getIntrinsicID()) {
1213   case Intrinsic::amdgcn_buffer_load:
1214   case Intrinsic::amdgcn_buffer_load_format:
1215   case Intrinsic::amdgcn_raw_buffer_load:
1216   case Intrinsic::amdgcn_raw_buffer_load_format:
1217   case Intrinsic::amdgcn_raw_tbuffer_load:
1218   case Intrinsic::amdgcn_s_buffer_load:
1219   case Intrinsic::amdgcn_struct_buffer_load:
1220   case Intrinsic::amdgcn_struct_buffer_load_format:
1221   case Intrinsic::amdgcn_struct_tbuffer_load:
1222   case Intrinsic::amdgcn_tbuffer_load:
1223     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1224   default: {
1225     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1226       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1227     }
1228     break;
1229   }
1230   }
1231   return None;
1232 }
1233