//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}
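
// Worked example of the fold above (for illustration only): fmed3(1.0, 3.0,
// 2.0) computes Max3 = maxnum(maxnum(1.0, 3.0), 2.0) = 3.0, which compares
// equal to Src1, so the result is maxnum(Src0, Src2) = maxnum(1.0, 2.0) = 2.0,
// i.e. the median of the three finite inputs.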

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}
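
// Illustrative behaviour of the helper above (assumed example inputs, for
// documentation only): a ConstantFP holding 2.5 converts to half exactly and
// yields true; a ConstantInt holding 70000 needs 17 active bits and yields
// false; an fpext/zext whose source type is half/i16 yields true, so the
// extension can simply be peeled off by convertTo16Bit() below.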

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
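      // For instance (illustration only), a <4 x float> image sample whose
      // sole user is a fptrunc to <4 x half> is rewritten below so that the
      // intrinsic itself returns <4 x half> and the fptrunc goes away.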
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;
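
  // E.g. (illustration only) if every gradient and coordinate operand below is
  // an fpext/zext from half/i16, or a constant that fits in 16 bits, the whole
  // address is rewritten in 16-bit form; if only the gradients qualify and the
  // target has G16, just the gradients are narrowed.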

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & fcAllFlags) == fcAllFlags) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & fcAllFlags) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == fcNan && !II.isStrictFP()) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == fcZero && !II.isStrictFP()) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & fcAllFlags) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
        ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
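    // For example, with i32 operands ubfe(x, 4, 8) becomes
    // lshr(shl(x, 20), 24); sbfe uses ashr for the final shift so the
    // extracted bitfield is sign extended.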
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
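    // For example (illustration only), a raw buffer load of <4 x float> where
    // only elements 2 and 3 are demanded ends up with DemandedElts = 0b1100
    // and the byte offset bumped by 2 * sizeof(float) = 8, so the rewritten
    // intrinsic loads just <2 x float>.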
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }
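
    // For example (illustration only), with dmask = 0b1011 the load returns
    // channels 0, 1 and 3 as vector elements 0, 1 and 2; if only element 1 is
    // demanded, the loop above shrinks the dmask to 0b0010.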

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}