xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (revision 603d18033c510c99ad84f26b6603db1ca68a500f)
1 //===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/IR/IntrinsicsAMDGPU.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "AMDGPUtti"
26 
27 namespace {
28 
29 struct AMDGPUImageDMaskIntrinsic {
30   unsigned Intr;
31 };
32 
33 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
34 #include "InstCombineTables.inc"
35 
36 } // end anonymous namespace
37 
38 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
39 //
40 // A single NaN input is folded to minnum, so we rely on that folding for
41 // handling NaNs.
42 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
43                            const APFloat &Src2) {
44   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
45 
46   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
47   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
48   if (Cmp0 == APFloat::cmpEqual)
49     return maxnum(Src1, Src2);
50 
51   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
52   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
53   if (Cmp1 == APFloat::cmpEqual)
54     return maxnum(Src0, Src2);
55 
56   return maxnum(Src0, Src1);
57 }
58 
59 // Check if a value can be converted to a 16-bit value without losing
60 // precision.
61 static bool canSafelyConvertTo16Bit(Value &V) {
62   Type *VTy = V.getType();
63   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
64     // The value is already 16-bit, so we don't want to convert to 16-bit again!
65     return false;
66   }
67   if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
68     // We need to check that if we cast the index down to a half, we do not lose
69     // precision.
70     APFloat FloatValue(ConstFloat->getValueAPF());
71     bool LosesInfo = true;
72     FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
73     return !LosesInfo;
74   }
75   Value *CastSrc;
76   if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
77       match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
78       match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
79     Type *CastSrcTy = CastSrc->getType();
80     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
81       return true;
82   }
83 
84   return false;
85 }
86 
87 // Convert a value to 16-bit.
88 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
89   Type *VTy = V.getType();
90   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
91     return cast<Instruction>(&V)->getOperand(0);
92   if (VTy->isIntegerTy())
93     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
94   if (VTy->isFloatingPointTy())
95     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
96 
97   llvm_unreachable("Should never be called!");
98 }
99 
100 /// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
101 /// the modified arguments.
102 static Optional<Instruction *> modifyIntrinsicCall(
103     IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
104     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
105         Func) {
106   SmallVector<Type *, 4> ArgTys;
107   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
108     return None;
109 
110   SmallVector<Value *, 8> Args(II.args());
111 
112   // Modify arguments and types
113   Func(Args, ArgTys);
114 
115   Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);
116 
117   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
118   NewCall->takeName(&II);
119   NewCall->copyMetadata(II);
120   if (isa<FPMathOperator>(NewCall))
121     NewCall->copyFastMathFlags(&II);
122 
123   // Erase and replace uses
124   if (!II.getType()->isVoidTy())
125     IC.replaceInstUsesWith(II, NewCall);
126   return IC.eraseInstFromFunction(II);
127 }
128 
129 static Optional<Instruction *>
130 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
131                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
132                              IntrinsicInst &II, InstCombiner &IC) {
133   // Optimize _L to _LZ when _L is zero
134   if (const auto *LZMappingInfo =
135           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
136     if (auto *ConstantLod =
137             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
138       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
139         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
140             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
141                                                      ImageDimIntr->Dim);
142         return modifyIntrinsicCall(
143             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
144               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
145             });
146       }
147     }
148   }
149 
150   // Optimize _mip away, when 'lod' is zero
151   if (const auto *MIPMappingInfo =
152           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
153     if (auto *ConstantMip =
154             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
155       if (ConstantMip->isZero()) {
156         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
157             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
158                                                      ImageDimIntr->Dim);
159         return modifyIntrinsicCall(
160             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
161               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
162             });
163       }
164     }
165   }
166 
167   // Optimize _bias away when 'bias' is zero
168   if (const auto *BiasMappingInfo =
169           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
170     if (auto *ConstantBias =
171             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
172       if (ConstantBias->isZero()) {
173         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
174             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
175                                                      ImageDimIntr->Dim);
176         return modifyIntrinsicCall(
177             II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
178               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
179               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
180             });
181       }
182     }
183   }
184 
185   // Try to use A16 or G16
186   if (!ST->hasA16() && !ST->hasG16())
187     return None;
188 
189   bool FloatCoord = false;
190   // true means derivatives can be converted to 16 bit, coordinates not
191   bool OnlyDerivatives = false;
192 
193   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
194        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
195     Value *Coord = II.getOperand(OperandIndex);
196     // If the values are not derived from 16-bit values, we cannot optimize.
197     if (!canSafelyConvertTo16Bit(*Coord)) {
198       if (OperandIndex < ImageDimIntr->CoordStart ||
199           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
200         return None;
201       }
202       // All gradients can be converted, so convert only them
203       OnlyDerivatives = true;
204       break;
205     }
206 
207     assert(OperandIndex == ImageDimIntr->GradientStart ||
208            FloatCoord == Coord->getType()->isFloatingPointTy());
209     FloatCoord = Coord->getType()->isFloatingPointTy();
210   }
211 
212   if (!OnlyDerivatives && !ST->hasA16())
213     OnlyDerivatives = true; // Only supports G16
214 
215   // Check if there is a bias parameter and if it can be converted to f16
216   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
217     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
218     if (!canSafelyConvertTo16Bit(*Bias))
219       OnlyDerivatives = true;
220   }
221 
222   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
223                                                ImageDimIntr->CoordStart))
224     return None;
225 
226   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
227                                : Type::getInt16Ty(II.getContext());
228 
229   return modifyIntrinsicCall(
230       II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
231         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
232         if (!OnlyDerivatives) {
233           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
234 
235           // Change the bias type
236           if (ImageDimIntr->NumBiasArgs != 0)
237             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
238         }
239 
240         unsigned EndIndex =
241             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
242         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
243              OperandIndex < EndIndex; OperandIndex++) {
244           Args[OperandIndex] =
245               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
246         }
247 
248         // Convert the bias
249         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
250           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
251           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
252         }
253       });
254 }
255 
256 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
257                                            InstCombiner &IC) const {
258   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
259   // infinity, gives +0.0. If we can prove we don't have one of the special
260   // cases then we can use a normal multiply instead.
261   // TODO: Create and use isKnownFiniteNonZero instead of just matching
262   // constants here.
263   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
264       match(Op1, PatternMatch::m_FiniteNonZero())) {
265     // One operand is not zero or infinity or NaN.
266     return true;
267   }
268   auto *TLI = &IC.getTargetLibraryInfo();
269   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
270       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
271     // Neither operand is infinity or NaN.
272     return true;
273   }
274   return false;
275 }
276 
277 Optional<Instruction *>
278 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
279   Intrinsic::ID IID = II.getIntrinsicID();
280   switch (IID) {
281   case Intrinsic::amdgcn_rcp: {
282     Value *Src = II.getArgOperand(0);
283 
284     // TODO: Move to ConstantFolding/InstSimplify?
285     if (isa<UndefValue>(Src)) {
286       Type *Ty = II.getType();
287       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
288       return IC.replaceInstUsesWith(II, QNaN);
289     }
290 
291     if (II.isStrictFP())
292       break;
293 
294     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
295       const APFloat &ArgVal = C->getValueAPF();
296       APFloat Val(ArgVal.getSemantics(), 1);
297       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
298 
299       // This is more precise than the instruction may give.
300       //
301       // TODO: The instruction always flushes denormal results (except for f16),
302       // should this also?
303       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
304     }
305 
306     break;
307   }
308   case Intrinsic::amdgcn_rsq: {
309     Value *Src = II.getArgOperand(0);
310 
311     // TODO: Move to ConstantFolding/InstSimplify?
312     if (isa<UndefValue>(Src)) {
313       Type *Ty = II.getType();
314       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
315       return IC.replaceInstUsesWith(II, QNaN);
316     }
317 
318     break;
319   }
320   case Intrinsic::amdgcn_frexp_mant:
321   case Intrinsic::amdgcn_frexp_exp: {
322     Value *Src = II.getArgOperand(0);
323     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
324       int Exp;
325       APFloat Significand =
326           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
327 
328       if (IID == Intrinsic::amdgcn_frexp_mant) {
329         return IC.replaceInstUsesWith(
330             II, ConstantFP::get(II.getContext(), Significand));
331       }
332 
333       // Match instruction special case behavior.
334       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
335         Exp = 0;
336 
337       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
338     }
339 
340     if (isa<UndefValue>(Src)) {
341       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
342     }
343 
344     break;
345   }
346   case Intrinsic::amdgcn_class: {
347     enum {
348       S_NAN = 1 << 0,       // Signaling NaN
349       Q_NAN = 1 << 1,       // Quiet NaN
350       N_INFINITY = 1 << 2,  // Negative infinity
351       N_NORMAL = 1 << 3,    // Negative normal
352       N_SUBNORMAL = 1 << 4, // Negative subnormal
353       N_ZERO = 1 << 5,      // Negative zero
354       P_ZERO = 1 << 6,      // Positive zero
355       P_SUBNORMAL = 1 << 7, // Positive subnormal
356       P_NORMAL = 1 << 8,    // Positive normal
357       P_INFINITY = 1 << 9   // Positive infinity
358     };
359 
360     const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
361                               N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
362                               P_NORMAL | P_INFINITY;
363 
364     Value *Src0 = II.getArgOperand(0);
365     Value *Src1 = II.getArgOperand(1);
366     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
367     if (!CMask) {
368       if (isa<UndefValue>(Src0)) {
369         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
370       }
371 
372       if (isa<UndefValue>(Src1)) {
373         return IC.replaceInstUsesWith(II,
374                                       ConstantInt::get(II.getType(), false));
375       }
376       break;
377     }
378 
379     uint32_t Mask = CMask->getZExtValue();
380 
381     // If all tests are made, it doesn't matter what the value is.
382     if ((Mask & FullMask) == FullMask) {
383       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
384     }
385 
386     if ((Mask & FullMask) == 0) {
387       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
388     }
389 
390     if (Mask == (S_NAN | Q_NAN)) {
391       // Equivalent of isnan. Replace with standard fcmp.
392       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
393       FCmp->takeName(&II);
394       return IC.replaceInstUsesWith(II, FCmp);
395     }
396 
397     if (Mask == (N_ZERO | P_ZERO)) {
398       // Equivalent of == 0.
399       Value *FCmp =
400           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
401 
402       FCmp->takeName(&II);
403       return IC.replaceInstUsesWith(II, FCmp);
404     }
405 
406     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
407     if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
408         isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
409       return IC.replaceOperand(
410           II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
411     }
412 
413     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
414     if (!CVal) {
415       if (isa<UndefValue>(Src0)) {
416         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
417       }
418 
419       // Clamp mask to used bits
420       if ((Mask & FullMask) != Mask) {
421         CallInst *NewCall = IC.Builder.CreateCall(
422             II.getCalledFunction(),
423             {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
424 
425         NewCall->takeName(&II);
426         return IC.replaceInstUsesWith(II, NewCall);
427       }
428 
429       break;
430     }
431 
432     const APFloat &Val = CVal->getValueAPF();
433 
434     bool Result =
435         ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
436         ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
437         ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
438         ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
439         ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
440         ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
441         ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
442         ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
443         ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
444         ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
445 
446     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
447   }
448   case Intrinsic::amdgcn_cvt_pkrtz: {
449     Value *Src0 = II.getArgOperand(0);
450     Value *Src1 = II.getArgOperand(1);
451     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
452       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
453         const fltSemantics &HalfSem =
454             II.getType()->getScalarType()->getFltSemantics();
455         bool LosesInfo;
456         APFloat Val0 = C0->getValueAPF();
457         APFloat Val1 = C1->getValueAPF();
458         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
459         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
460 
461         Constant *Folded =
462             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
463                                  ConstantFP::get(II.getContext(), Val1)});
464         return IC.replaceInstUsesWith(II, Folded);
465       }
466     }
467 
468     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
469       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
470     }
471 
472     break;
473   }
474   case Intrinsic::amdgcn_cvt_pknorm_i16:
475   case Intrinsic::amdgcn_cvt_pknorm_u16:
476   case Intrinsic::amdgcn_cvt_pk_i16:
477   case Intrinsic::amdgcn_cvt_pk_u16: {
478     Value *Src0 = II.getArgOperand(0);
479     Value *Src1 = II.getArgOperand(1);
480 
481     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
482       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
483     }
484 
485     break;
486   }
487   case Intrinsic::amdgcn_ubfe:
488   case Intrinsic::amdgcn_sbfe: {
489     // Decompose simple cases into standard shifts.
490     Value *Src = II.getArgOperand(0);
491     if (isa<UndefValue>(Src)) {
492       return IC.replaceInstUsesWith(II, Src);
493     }
494 
495     unsigned Width;
496     Type *Ty = II.getType();
497     unsigned IntSize = Ty->getIntegerBitWidth();
498 
499     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
500     if (CWidth) {
501       Width = CWidth->getZExtValue();
502       if ((Width & (IntSize - 1)) == 0) {
503         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
504       }
505 
506       // Hardware ignores high bits, so remove those.
507       if (Width >= IntSize) {
508         return IC.replaceOperand(
509             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
510       }
511     }
512 
513     unsigned Offset;
514     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
515     if (COffset) {
516       Offset = COffset->getZExtValue();
517       if (Offset >= IntSize) {
518         return IC.replaceOperand(
519             II, 1,
520             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
521       }
522     }
523 
524     bool Signed = IID == Intrinsic::amdgcn_sbfe;
525 
526     if (!CWidth || !COffset)
527       break;
528 
529     // The case of Width == 0 is handled above, which makes this transformation
530     // safe.  If Width == 0, then the ashr and lshr instructions become poison
531     // value since the shift amount would be equal to the bit size.
532     assert(Width != 0);
533 
534     // TODO: This allows folding to undef when the hardware has specific
535     // behavior?
536     if (Offset + Width < IntSize) {
537       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
538       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
539                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
540       RightShift->takeName(&II);
541       return IC.replaceInstUsesWith(II, RightShift);
542     }
543 
544     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
545                                : IC.Builder.CreateLShr(Src, Offset);
546 
547     RightShift->takeName(&II);
548     return IC.replaceInstUsesWith(II, RightShift);
549   }
550   case Intrinsic::amdgcn_exp:
551   case Intrinsic::amdgcn_exp_compr: {
552     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
553     unsigned EnBits = En->getZExtValue();
554     if (EnBits == 0xf)
555       break; // All inputs enabled.
556 
557     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
558     bool Changed = false;
559     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
560       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
561           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
562         Value *Src = II.getArgOperand(I + 2);
563         if (!isa<UndefValue>(Src)) {
564           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
565           Changed = true;
566         }
567       }
568     }
569 
570     if (Changed) {
571       return &II;
572     }
573 
574     break;
575   }
576   case Intrinsic::amdgcn_fmed3: {
577     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
578     // for the shader.
579 
580     Value *Src0 = II.getArgOperand(0);
581     Value *Src1 = II.getArgOperand(1);
582     Value *Src2 = II.getArgOperand(2);
583 
584     // Checking for NaN before canonicalization provides better fidelity when
585     // mapping other operations onto fmed3 since the order of operands is
586     // unchanged.
587     CallInst *NewCall = nullptr;
588     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
589       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
590     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
591       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
592     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
593       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
594     }
595 
596     if (NewCall) {
597       NewCall->copyFastMathFlags(&II);
598       NewCall->takeName(&II);
599       return IC.replaceInstUsesWith(II, NewCall);
600     }
601 
602     bool Swap = false;
603     // Canonicalize constants to RHS operands.
604     //
605     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
606     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
607       std::swap(Src0, Src1);
608       Swap = true;
609     }
610 
611     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
612       std::swap(Src1, Src2);
613       Swap = true;
614     }
615 
616     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
617       std::swap(Src0, Src1);
618       Swap = true;
619     }
620 
621     if (Swap) {
622       II.setArgOperand(0, Src0);
623       II.setArgOperand(1, Src1);
624       II.setArgOperand(2, Src2);
625       return &II;
626     }
627 
628     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
629       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
630         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
631           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
632                                        C2->getValueAPF());
633           return IC.replaceInstUsesWith(
634               II, ConstantFP::get(IC.Builder.getContext(), Result));
635         }
636       }
637     }
638 
639     break;
640   }
641   case Intrinsic::amdgcn_icmp:
642   case Intrinsic::amdgcn_fcmp: {
643     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
644     // Guard against invalid arguments.
645     int64_t CCVal = CC->getZExtValue();
646     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
647     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
648                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
649         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
650                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
651       break;
652 
653     Value *Src0 = II.getArgOperand(0);
654     Value *Src1 = II.getArgOperand(1);
655 
656     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
657       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
658         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
659         if (CCmp->isNullValue()) {
660           return IC.replaceInstUsesWith(
661               II, ConstantExpr::getSExt(CCmp, II.getType()));
662         }
663 
664         // The result of V_ICMP/V_FCMP assembly instructions (which this
665         // intrinsic exposes) is one bit per thread, masked with the EXEC
666         // register (which contains the bitmask of live threads). So a
667         // comparison that always returns true is the same as a read of the
668         // EXEC register.
669         Function *NewF = Intrinsic::getDeclaration(
670             II.getModule(), Intrinsic::read_register, II.getType());
671         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
672         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
673         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
674         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
675         NewCall->addFnAttr(Attribute::Convergent);
676         NewCall->takeName(&II);
677         return IC.replaceInstUsesWith(II, NewCall);
678       }
679 
680       // Canonicalize constants to RHS.
681       CmpInst::Predicate SwapPred =
682           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
683       II.setArgOperand(0, Src1);
684       II.setArgOperand(1, Src0);
685       II.setArgOperand(
686           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
687       return &II;
688     }
689 
690     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
691       break;
692 
693     // Canonicalize compare eq with true value to compare != 0
694     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
695     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
696     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
697     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
698     Value *ExtSrc;
699     if (CCVal == CmpInst::ICMP_EQ &&
700         ((match(Src1, PatternMatch::m_One()) &&
701           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
702          (match(Src1, PatternMatch::m_AllOnes()) &&
703           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
704         ExtSrc->getType()->isIntegerTy(1)) {
705       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
706       IC.replaceOperand(II, 2,
707                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
708       return &II;
709     }
710 
711     CmpInst::Predicate SrcPred;
712     Value *SrcLHS;
713     Value *SrcRHS;
714 
715     // Fold compare eq/ne with 0 from a compare result as the predicate to the
716     // intrinsic. The typical use is a wave vote function in the library, which
717     // will be fed from a user code condition compared with 0. Fold in the
718     // redundant compare.
719 
720     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
721     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
722     //
723     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
724     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
725     if (match(Src1, PatternMatch::m_Zero()) &&
726         match(Src0, PatternMatch::m_ZExtOrSExt(
727                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
728                               PatternMatch::m_Value(SrcRHS))))) {
729       if (CCVal == CmpInst::ICMP_EQ)
730         SrcPred = CmpInst::getInversePredicate(SrcPred);
731 
732       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
733                                  ? Intrinsic::amdgcn_fcmp
734                                  : Intrinsic::amdgcn_icmp;
735 
736       Type *Ty = SrcLHS->getType();
737       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
738         // Promote to next legal integer type.
739         unsigned Width = CmpType->getBitWidth();
740         unsigned NewWidth = Width;
741 
742         // Don't do anything for i1 comparisons.
743         if (Width == 1)
744           break;
745 
746         if (Width <= 16)
747           NewWidth = 16;
748         else if (Width <= 32)
749           NewWidth = 32;
750         else if (Width <= 64)
751           NewWidth = 64;
752         else if (Width > 64)
753           break; // Can't handle this.
754 
755         if (Width != NewWidth) {
756           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
757           if (CmpInst::isSigned(SrcPred)) {
758             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
759             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
760           } else {
761             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
762             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
763           }
764         }
765       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
766         break;
767 
768       Function *NewF = Intrinsic::getDeclaration(
769           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
770       Value *Args[] = {SrcLHS, SrcRHS,
771                        ConstantInt::get(CC->getType(), SrcPred)};
772       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
773       NewCall->takeName(&II);
774       return IC.replaceInstUsesWith(II, NewCall);
775     }
776 
777     break;
778   }
779   case Intrinsic::amdgcn_ballot: {
780     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
781       if (Src->isZero()) {
782         // amdgcn.ballot(i1 0) is zero.
783         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
784       }
785 
786       if (Src->isOne()) {
787         // amdgcn.ballot(i1 1) is exec.
788         const char *RegName = "exec";
789         if (II.getType()->isIntegerTy(32))
790           RegName = "exec_lo";
791         else if (!II.getType()->isIntegerTy(64))
792           break;
793 
794         Function *NewF = Intrinsic::getDeclaration(
795             II.getModule(), Intrinsic::read_register, II.getType());
796         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
797         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
798         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
799         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
800         NewCall->addFnAttr(Attribute::Convergent);
801         NewCall->takeName(&II);
802         return IC.replaceInstUsesWith(II, NewCall);
803       }
804     }
805     break;
806   }
807   case Intrinsic::amdgcn_wqm_vote: {
808     // wqm_vote is identity when the argument is constant.
809     if (!isa<Constant>(II.getArgOperand(0)))
810       break;
811 
812     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
813   }
814   case Intrinsic::amdgcn_kill: {
815     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
816     if (!C || !C->getZExtValue())
817       break;
818 
819     // amdgcn.kill(i1 1) is a no-op
820     return IC.eraseInstFromFunction(II);
821   }
822   case Intrinsic::amdgcn_update_dpp: {
823     Value *Old = II.getArgOperand(0);
824 
825     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
826     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
827     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
828     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
829         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
830       break;
831 
832     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
833     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
834   }
835   case Intrinsic::amdgcn_permlane16:
836   case Intrinsic::amdgcn_permlanex16: {
837     // Discard vdst_in if it's not going to be read.
838     Value *VDstIn = II.getArgOperand(0);
839     if (isa<UndefValue>(VDstIn))
840       break;
841 
842     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
843     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
844     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
845       break;
846 
847     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
848   }
849   case Intrinsic::amdgcn_readfirstlane:
850   case Intrinsic::amdgcn_readlane: {
851     // A constant value is trivially uniform.
852     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
853       return IC.replaceInstUsesWith(II, C);
854     }
855 
856     // The rest of these may not be safe if the exec may not be the same between
857     // the def and use.
858     Value *Src = II.getArgOperand(0);
859     Instruction *SrcInst = dyn_cast<Instruction>(Src);
860     if (SrcInst && SrcInst->getParent() != II.getParent())
861       break;
862 
863     // readfirstlane (readfirstlane x) -> readfirstlane x
864     // readlane (readfirstlane x), y -> readfirstlane x
865     if (match(Src,
866               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
867       return IC.replaceInstUsesWith(II, Src);
868     }
869 
870     if (IID == Intrinsic::amdgcn_readfirstlane) {
871       // readfirstlane (readlane x, y) -> readlane x, y
872       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
873         return IC.replaceInstUsesWith(II, Src);
874       }
875     } else {
876       // readlane (readlane x, y), y -> readlane x, y
877       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
878                          PatternMatch::m_Value(),
879                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
880         return IC.replaceInstUsesWith(II, Src);
881       }
882     }
883 
884     break;
885   }
886   case Intrinsic::amdgcn_ldexp: {
887     // FIXME: This doesn't introduce new instructions and belongs in
888     // InstructionSimplify.
889     Type *Ty = II.getType();
890     Value *Op0 = II.getArgOperand(0);
891     Value *Op1 = II.getArgOperand(1);
892 
893     // Folding undef to qnan is safe regardless of the FP mode.
894     if (isa<UndefValue>(Op0)) {
895       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
896       return IC.replaceInstUsesWith(II, QNaN);
897     }
898 
899     const APFloat *C = nullptr;
900     match(Op0, PatternMatch::m_APFloat(C));
901 
902     // FIXME: Should flush denorms depending on FP mode, but that's ignored
903     // everywhere else.
904     //
905     // These cases should be safe, even with strictfp.
906     // ldexp(0.0, x) -> 0.0
907     // ldexp(-0.0, x) -> -0.0
908     // ldexp(inf, x) -> inf
909     // ldexp(-inf, x) -> -inf
910     if (C && (C->isZero() || C->isInfinity())) {
911       return IC.replaceInstUsesWith(II, Op0);
912     }
913 
914     // With strictfp, be more careful about possibly needing to flush denormals
915     // or not, and snan behavior depends on ieee_mode.
916     if (II.isStrictFP())
917       break;
918 
919     if (C && C->isNaN()) {
920       // FIXME: We just need to make the nan quiet here, but that's unavailable
921       // on APFloat, only IEEEfloat
922       auto *Quieted =
923           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
924       return IC.replaceInstUsesWith(II, Quieted);
925     }
926 
927     // ldexp(x, 0) -> x
928     // ldexp(x, undef) -> x
929     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
930       return IC.replaceInstUsesWith(II, Op0);
931     }
932 
933     break;
934   }
935   case Intrinsic::amdgcn_fmul_legacy: {
936     Value *Op0 = II.getArgOperand(0);
937     Value *Op1 = II.getArgOperand(1);
938 
939     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
940     // infinity, gives +0.0.
941     // TODO: Move to InstSimplify?
942     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
943         match(Op1, PatternMatch::m_AnyZeroFP()))
944       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
945 
946     // If we can prove we don't have one of the special cases then we can use a
947     // normal fmul instruction instead.
948     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
949       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
950       FMul->takeName(&II);
951       return IC.replaceInstUsesWith(II, FMul);
952     }
953     break;
954   }
955   case Intrinsic::amdgcn_fma_legacy: {
956     Value *Op0 = II.getArgOperand(0);
957     Value *Op1 = II.getArgOperand(1);
958     Value *Op2 = II.getArgOperand(2);
959 
960     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
961     // infinity, gives +0.0.
962     // TODO: Move to InstSimplify?
963     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
964         match(Op1, PatternMatch::m_AnyZeroFP())) {
965       // It's tempting to just return Op2 here, but that would give the wrong
966       // result if Op2 was -0.0.
967       auto *Zero = ConstantFP::getNullValue(II.getType());
968       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
969       FAdd->takeName(&II);
970       return IC.replaceInstUsesWith(II, FAdd);
971     }
972 
973     // If we can prove we don't have one of the special cases then we can use a
974     // normal fma instead.
975     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
976       II.setCalledOperand(Intrinsic::getDeclaration(
977           II.getModule(), Intrinsic::fma, II.getType()));
978       return &II;
979     }
980     break;
981   }
982   case Intrinsic::amdgcn_is_shared:
983   case Intrinsic::amdgcn_is_private: {
984     if (isa<UndefValue>(II.getArgOperand(0)))
985       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
986 
987     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
988       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
989     break;
990   }
991   default: {
992     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
993             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
994       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
995     }
996   }
997   }
998   return None;
999 }
1000 
1001 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1002 ///
1003 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1004 ///       struct returns.
1005 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1006                                                     IntrinsicInst &II,
1007                                                     APInt DemandedElts,
1008                                                     int DMaskIdx = -1) {
1009 
1010   auto *IIVTy = cast<FixedVectorType>(II.getType());
1011   unsigned VWidth = IIVTy->getNumElements();
1012   if (VWidth == 1)
1013     return nullptr;
1014 
1015   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1016   IC.Builder.SetInsertPoint(&II);
1017 
1018   // Assume the arguments are unchanged and later override them, if needed.
1019   SmallVector<Value *, 16> Args(II.args());
1020 
1021   if (DMaskIdx < 0) {
1022     // Buffer case.
1023 
1024     const unsigned ActiveBits = DemandedElts.getActiveBits();
1025     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
1026 
1027     // Start assuming the prefix of elements is demanded, but possibly clear
1028     // some other bits if there are trailing zeros (unused components at front)
1029     // and update offset.
1030     DemandedElts = (1 << ActiveBits) - 1;
1031 
1032     if (UnusedComponentsAtFront > 0) {
1033       static const unsigned InvalidOffsetIdx = 0xf;
1034 
1035       unsigned OffsetIdx;
1036       switch (II.getIntrinsicID()) {
1037       case Intrinsic::amdgcn_raw_buffer_load:
1038         OffsetIdx = 1;
1039         break;
1040       case Intrinsic::amdgcn_s_buffer_load:
1041         // If resulting type is vec3, there is no point in trimming the
1042         // load with updated offset, as the vec3 would most likely be widened to
1043         // vec4 anyway during lowering.
1044         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1045           OffsetIdx = InvalidOffsetIdx;
1046         else
1047           OffsetIdx = 1;
1048         break;
1049       case Intrinsic::amdgcn_struct_buffer_load:
1050         OffsetIdx = 2;
1051         break;
1052       default:
1053         // TODO: handle tbuffer* intrinsics.
1054         OffsetIdx = InvalidOffsetIdx;
1055         break;
1056       }
1057 
1058       if (OffsetIdx != InvalidOffsetIdx) {
1059         // Clear demanded bits and update the offset.
1060         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1061         auto *Offset = II.getArgOperand(OffsetIdx);
1062         unsigned SingleComponentSizeInBits =
1063             IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
1064         unsigned OffsetAdd =
1065             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1066         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1067         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1068       }
1069     }
1070   } else {
1071     // Image case.
1072 
1073     ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
1074     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1075 
1076     // Mask off values that are undefined because the dmask doesn't cover them
1077     DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
1078 
1079     unsigned NewDMaskVal = 0;
1080     unsigned OrigLoadIdx = 0;
1081     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1082       const unsigned Bit = 1 << SrcIdx;
1083       if (!!(DMaskVal & Bit)) {
1084         if (!!DemandedElts[OrigLoadIdx])
1085           NewDMaskVal |= Bit;
1086         OrigLoadIdx++;
1087       }
1088     }
1089 
1090     if (DMaskVal != NewDMaskVal)
1091       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1092   }
1093 
1094   unsigned NewNumElts = DemandedElts.countPopulation();
1095   if (!NewNumElts)
1096     return UndefValue::get(II.getType());
1097 
1098   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1099     if (DMaskIdx >= 0)
1100       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1101     return nullptr;
1102   }
1103 
1104   // Validate function argument and return types, extracting overloaded types
1105   // along the way.
1106   SmallVector<Type *, 6> OverloadTys;
1107   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1108     return nullptr;
1109 
1110   Module *M = II.getParent()->getParent()->getParent();
1111   Type *EltTy = IIVTy->getElementType();
1112   Type *NewTy =
1113       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1114 
1115   OverloadTys[0] = NewTy;
1116   Function *NewIntrin =
1117       Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
1118 
1119   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1120   NewCall->takeName(&II);
1121   NewCall->copyMetadata(II);
1122 
1123   if (NewNumElts == 1) {
1124     return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
1125                                           NewCall,
1126                                           DemandedElts.countTrailingZeros());
1127   }
1128 
1129   SmallVector<int, 8> EltMask;
1130   unsigned NewLoadIdx = 0;
1131   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1132     if (!!DemandedElts[OrigLoadIdx])
1133       EltMask.push_back(NewLoadIdx++);
1134     else
1135       EltMask.push_back(NewNumElts);
1136   }
1137 
1138   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1139 
1140   return Shuffle;
1141 }
1142 
1143 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1144     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1145     APInt &UndefElts2, APInt &UndefElts3,
1146     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1147         SimplifyAndSetOp) const {
1148   switch (II.getIntrinsicID()) {
1149   case Intrinsic::amdgcn_buffer_load:
1150   case Intrinsic::amdgcn_buffer_load_format:
1151   case Intrinsic::amdgcn_raw_buffer_load:
1152   case Intrinsic::amdgcn_raw_buffer_load_format:
1153   case Intrinsic::amdgcn_raw_tbuffer_load:
1154   case Intrinsic::amdgcn_s_buffer_load:
1155   case Intrinsic::amdgcn_struct_buffer_load:
1156   case Intrinsic::amdgcn_struct_buffer_load_format:
1157   case Intrinsic::amdgcn_struct_tbuffer_load:
1158   case Intrinsic::amdgcn_tbuffer_load:
1159     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1160   default: {
1161     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1162       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1163     }
1164     break;
1165   }
1166   }
1167   return None;
1168 }
1169