//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the value down to a half, we do not
    // lose precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
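// The caller is expected to have checked canSafelyConvertTo16Bit first; any
// other input hits the llvm_unreachable below. For example (illustrative), a
// coordinate defined as
//   %c = fpext half %x to float
// is rewritten to just %x, and a float constant representable in half is
// narrowed to half with CreateFPCast.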
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
/// the modified arguments.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(II.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);

  // Erase and replace uses
  if (!II.getType()->isVoidTy())
    IC.replaceInstUsesWith(II, NewCall);
  return IC.eraseInstFromFunction(II);
}

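// Simplify image intrinsics: drop LOD/mip/bias operands that are known to be
// zero, and (on subtargets with A16/G16) narrow coordinate and derivative
// operands to 16 bits. For example (illustrative):
//   llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
//     -> llvm.amdgcn.image.sample.lz.2d(...)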
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // True means only the derivatives can be converted to 16 bit, not the
  // coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    if (!canSafelyConvertTo16Bit(*Bias))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

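// Fold AMDGPU target intrinsics during InstCombine. Returns the replacement
// (or the modified instruction itself) if a combine applied, None otherwise.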
Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
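  // llvm.amdgcn.class(x, mask) tests x against the IEEE class bits selected by
  // mask. Fold trivial masks, recognize the isnan and "== 0" patterns as plain
  // fcmps, and constant-fold when the value itself is a constant.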
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
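  // ubfe/sbfe extract Width bits starting at Offset, zero- or sign-extended.
  // With constant operands these become plain shifts, e.g. (illustrative):
  //   llvm.amdgcn.ubfe.i32(%x, 8, 8) -> lshr (shl %x, 16), 24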
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions below would produce
    // poison since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
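  // v_med3 returns the middle of its three operands. A NaN or undef operand
  // reduces it to a two-input min/max, e.g. fmed3(x, NaN, y) -> minnum(x, y),
  // and three constant operands are folded outright.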
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
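  // llvm.amdgcn.icmp/fcmp produce a wave-wide mask of the per-lane comparison
  // result. Besides constant folding, the main combine is folding a zext/sext
  // of an ordinary compare into the intrinsic's predicate operand.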
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
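  // ballot(i1 %b) returns the bitmask of lanes where %b is true. With a
  // constant argument it is either 0 or a read of the exec mask.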
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if exec is not the same between the
    // def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
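///
/// For example (illustrative), a <4 x float> raw buffer load whose users only
/// read elements 2 and 3 becomes a <2 x float> load with the byte offset
/// advanced by 8, followed by a shufflevector back to the original width.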
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

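// Dispatch demanded-vector-elements simplification for the AMDGPU load
// intrinsics to simplifyAMDGCNMemoryIntrinsicDemanded; image intrinsics pass
// the index of their dmask operand (0).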
Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}