//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU specific parts of InstCombine. It provides
// the TargetTransformInfo hooks that fold and simplify calls to AMDGPU
// intrinsics, while letting the target independent InstCombine logic handle
// everything else.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
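// This assumes canSafelyConvertTo16Bit returned true for V: an fpext/sext/zext
// is peeled back to its original 16-bit source, while any other integer or
// floating-point value is narrowed with a cast (e.g. an i32 constant 42
// becomes an i16 constant 42).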
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates an intrinsic call with
/// the modified arguments and types (based on OldIntr), and replaces
/// InstToReplace with the newly created intrinsic call.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (!OldIntr.isIdenticalTo(&InstToReplace))
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of the image intrinsic is a fptrunc to half, then both
      // the fptrunc and the image intrinsic are replaced by an image intrinsic
      // with the D16 flag set.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // True means only the derivatives can be converted to 16 bit, not the
  // coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
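    // A gradient operand (anything before CoordStart) that cannot be narrowed
    // blocks the whole transform; for a coordinate we can still fall back to
    // converting only the gradients (G16) below.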
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
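    // rcp(undef) folds to a quiet NaN; a constant operand is folded below to
    // its exact reciprocal (e.g. rcp(2.0) -> 0.5), which may be more precise
    // than what the hardware instruction produces.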
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If every class bit is tested, the result is true regardless of the
    // value.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
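      // An ordered compare against +0.0 is true for both +0.0 and -0.0 and
      // false for NaN, so it matches exactly the N_ZERO | P_ZERO class test.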
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
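    // e.g. amdgcn.ubfe(i32 %x, 8, 8) extracts bits 15:8 and becomes
    // lshr (shl %x, 16), 24; the sbfe variant uses ashr to sign extend.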
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would become poison
    // since the shift amount would equal the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
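    // With a single NaN or undef operand the median reduces to a two-input
    // min/max of the remaining operands: minnum when Src0 or Src1 is the NaN,
    // maxnum when Src2 is.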
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
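      // Swapping the operands requires the swapped predicate, e.g.
      // amdgcn.icmp(7, %x, slt) becomes amdgcn.icmp(%x, 7, sgt).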
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
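        // A constant-false condition means no lane can contribute a set bit,
        // so the whole ballot mask folds to 0.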
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these folds may not be safe if the exec mask is not the same
    // between the def and the use.
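    // So only look through lane intrinsics that are defined in the same basic
    // block as this call.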
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
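    // e.g. fmul_legacy(%x, 2.0) qualifies because one operand is a finite
    // non-zero constant, so it is rewritten as a plain fmul below.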
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If the resulting type is vec3, there is no point in trimming the
        // load with an updated offset, as the vec3 would most likely be
        // widened to vec4 anyway during lowering.
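        // (Four active components with one unused component at the front
        // would leave exactly such a three-component load.)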
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
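  // Shrink the overloaded return type to the number of elements that are
  // actually demanded; a single demanded element becomes a scalar load.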
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}