//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the InstCombine hooks of the AMDGPU TargetTransformInfo
// implementation. It uses the target's detailed information to simplify AMDGPU
// intrinsics, while letting the target-independent InstCombine transforms
// handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
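//
// For example, fmed3(1.0, 3.0, 2.0) folds to the median value 2.0.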
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
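// This holds for fpext/sext/zext from a 16-bit source and for floating-point
// constants that are exactly representable as half.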
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that casting the constant down to half does not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
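// Values are expected to have passed canSafelyConvertTo16Bit: extensions are
// simply stripped back to their 16-bit source, other values are truncated.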
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

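// Try to use the A16 and/or G16 features to shrink the coordinate and
// derivative operands of an image intrinsic to 16 bits when all of the values
// feeding them are known to fit.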
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // If true, only the derivatives (not the coordinates) can be converted to
  // 16 bits.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
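    // e.g. for i32: ubfe(x, 4, 8) -> lshr (shl x, 20), 24
    //               sbfe(x, 4, 8) -> ashr (shl x, 20), 24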
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would become poison
    // values since the shift amount would equal the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
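    // Replace the data operand for any channel (or channel pair for the
    // compressed form) that is disabled in the enable mask with undef so it
    // can be dead-code eliminated.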
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; calls with TFE
/// or LWE enabled have struct returns and are not handled here.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
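    // For example, if only elements 2 and 3 of a <4 x float> buffer load are
    // demanded, the byte offset is advanced by 8 and the load is later shrunk
    // to <2 x float>.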
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.
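    // The dmask operand selects which of the (up to four) result components
    // the instruction actually writes; it can be narrowed to just the
    // components that are demanded.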

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

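  // If every element of the original result is still demanded, keep the call;
  // at most the dmask argument computed above needs to be updated.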
  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

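  // A single demanded element becomes a scalar load; re-insert it into a
  // vector at its original position to keep the users' type unchanged.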
  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

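  // Otherwise scatter the narrowed result back to the original element
  // positions with a shuffle; elements that are not demanded are left
  // undefined.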
  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}