//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

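// Narrowing helper for the A16/G16 image folds below. Callers are expected to
// have checked canSafelyConvertTo16Bit first, so stripping the extension or
// creating the narrowing cast here does not lose information.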
// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

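  // Unlike the _L and _mip folds above, removing the bias argument also has
  // to drop its entry from the overloaded type list (BiasTyArg), since the
  // bias type is part of the intrinsic's signature mangling.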
  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of the image intrinsic is a fptrunc (with conversion
      // to half), then both the fptrunc and the image intrinsic are replaced
      // with an image intrinsic carrying the D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

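  // Walk the gradient and coordinate operands: if all of them are (or extend)
  // 16-bit values, the whole address can be switched to 16 bit; if only the
  // gradients qualify, fall back to converting just the derivatives (G16).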
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
  if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
      isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

/// Match an fpext from half to float, or a constant we can convert.
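/// This feeds the fmed3 width-reduction fold below: an fmed3 whose operands
/// all come from (or losslessly fit in) half can be performed in half and
/// extended afterwards.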
static Value *matchFPExtFromF16(Value *Arg) {
  Value *Src = nullptr;
  ConstantFP *CFP = nullptr;
  if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
    if (Src->getType()->isHalfTy())
      return Src;
  } else if (match(Arg, m_ConstantFP(CFP))) {
    bool LosesInfo;
    APFloat Val(CFP->getValueAPF());
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (!LosesInfo)
      return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
  }
  return nullptr;
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with known elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
                                       Instruction *I) {
  auto *VTy = cast<FixedVectorType>(UseV->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);

  for (int i = VWidth - 1; i > 0; --i) {
    auto *Elt = findScalarElement(UseV, i);
    if (!Elt)
      break;

    if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
      if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
        break;
    } else {
      break;
    }

    DemandedElts.clearBit(i);
  }

  return DemandedElts;
}

// Trim elements from the end of the vector \p V, if they are
// equal to the first element of the vector.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  Value *FirstComponent = findScalarElement(V, 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
    SVI->getShuffleMask(ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      auto *Elt = findScalarElement(V, I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(I);
  }

  return DemandedElts;
}

static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);

/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
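/// For f32 this requires the sqrt to carry afn or an !fpmath accuracy of at
/// least 1.0 ulp; the f16 form may always be contracted.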
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
  return (SqrtOp->getType()->isFloatTy() &&
          (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
         SqrtOp->getType()->isHalfTy();
}

/// Return true if we can easily prove that use U is uniform.
static bool isTriviallyUniform(const Use &U) {
  Value *V = U.get();
  if (isa<Constant>(V))
    return true;
  if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
    if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
      return false;
    // If II and U are in different blocks then there is a possibility of
    // temporal divergence.
    return II->getParent() == cast<Instruction>(U.getUser())->getParent();
  }
  return false;
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for
      // f16), should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
    if (!FMF.allowContract())
      break;
    auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
    if (!SrcCI)
      break;

    auto IID = SrcCI->getIntrinsicID();
    // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if
    // contractable
    //
    // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
    // relaxed.
    if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
      const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
      FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
      if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
        break;

      if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
        break;

      Function *NewDecl = Intrinsic::getDeclaration(
          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});

      InnerFMF |= FMF;
      II.setFastMathFlags(InnerFMF);

      II.setCalledFunction(NewDecl);
      return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // f16 amdgcn.sqrt is identical to regular sqrt.
    if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
      Function *NewDecl = Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::sqrt, {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2: {
    const bool IsLog = IID == Intrinsic::amdgcn_log;
    const bool IsExp = IID == Intrinsic::amdgcn_exp2;
    Value *Src = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    if (IC.getSimplifyQuery().isUndefValue(Src))
      return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

    if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      if (C->isInfinity()) {
        // exp2(+inf) -> +inf
        // log2(+inf) -> +inf
        if (!C->isNegative())
          return IC.replaceInstUsesWith(II, C);

        // exp2(-inf) -> 0
        if (IsExp && C->isNegative())
          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
      }

      if (II.isStrictFP())
        break;

      if (C->isNaN()) {
        Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
        return IC.replaceInstUsesWith(II, Quieted);
      }

      // f32 instruction doesn't handle denormals, f16 does.
      if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
        Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
                                      : ConstantFP::get(Ty, 1.0);
        return IC.replaceInstUsesWith(II, FoldedValue);
      }

      if (IsLog && C->isNegative())
        return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

      // TODO: Full constant folding matching hardware behavior.
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (CMask) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::is_fpclass, Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(1, ConstantInt::get(Src1->getType(),
                                           CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(Src1))
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          Src1, ConstantInt::getNullValue(Src1->getType()));
      return IC.replaceInstUsesWith(II, CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
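    // e.g. for i32, ubfe(x, 4, 8) becomes lshr(shl(x, 20), 24), and
    // sbfe(x, 4, 8) becomes ashr(shl(x, 20), 24).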
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    Value *V = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      V = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      V = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      V = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (V) {
      if (auto *CI = dyn_cast<CallInst>(V)) {
        CI->copyFastMathFlags(&II);
        CI->takeName(&II);
      }
      return IC.replaceInstUsesWith(II, V);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    if (!ST->hasMed3_16())
      break;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (Value *X = matchFPExtFromF16(Src0)) {
      if (Value *Y = matchFPExtFromF16(Src1)) {
        if (Value *Z = matchFPExtFromF16(Src2)) {
          Value *NewCall = IC.Builder.CreateIntrinsic(
              IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
          return new FPExtInst(NewCall, II.getType());
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, IC.Builder.CreateSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_mbcnt_hi: {
    // exec_hi is all 0, so this is just a copy.
    if (ST->isWave32())
      return IC.replaceInstUsesWith(II, II.getArgOperand(1));
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }
    }
    if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
      // %b64 = call i64 ballot.i64(...)
      // =>
      // %b32 = call i32 ballot.i32(...)
      // %b64 = zext i32 %b32 to i64
      Value *Call = IC.Builder.CreateZExt(
          IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
                                     {IC.Builder.getInt32Ty()},
                                     {II.getArgOperand(0)}),
          II.getType());
      Call->takeName(&II);
      return IC.replaceInstUsesWith(II, Call);
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4 /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5
    // For permlane16_var and permlanex16_var it should be 4
    unsigned int BcIdx = FiIdx + 1;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
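  // When the source operand is trivially uniform (a constant, or the result
  // of an always-uniform intrinsic in the same block), these cross-lane
  // intrinsics simply forward their input.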
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // If the first argument is uniform these intrinsics return it unchanged.
    const Use &Src = II.getArgOperandUse(0);
    if (isTriviallyUniform(Src))
      return IC.replaceInstUsesWith(II, Src.get());
    break;
  }
  case Intrinsic::amdgcn_trig_preop: {
    // The intrinsic is declared with name mangling, but currently the
    // instruction only exists for f64
    if (!II.getType()->isDoubleTy())
      break;

    Value *Src = II.getArgOperand(0);
    Value *Segment = II.getArgOperand(1);
    if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    if (isa<UndefValue>(Src)) {
      auto *QNaN = ConstantFP::get(
          II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
    if (!Csrc)
      break;

    if (II.isStrictFP())
      break;

    const APFloat &Fsrc = Csrc->getValueAPF();
    if (Fsrc.isNaN()) {
      auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
      return IC.replaceInstUsesWith(II, Quieted);
    }

    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
    if (!Cseg)
      break;

    unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
    unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
    unsigned Shift = SegmentVal * 53;
    if (Exponent > 1077)
      Shift += Exponent - 1077;

    // 2.0/PI table.
    static const uint32_t TwoByPi[] = {
        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
        0x56033046};

    // Return 0 for outbound segment (hardware behavior).
    unsigned Idx = Shift >> 5;
    if (Idx + 2 >= std::size(TwoByPi)) {
      APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
    }

    unsigned BShift = Shift & 0x1f;
    uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
    uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
    if (BShift)
      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
    Thi = Thi >> 11;
    APFloat Result = APFloat((double)Thi);

    int Scale = -53 - Shift;
    if (Exponent >= 1968)
      Scale += 128;

    Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
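    // e.g. fmul_legacy(x, 2.0) becomes fmul x, 2.0, since 2.0 is known to be
    // finite and non-zero.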
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getZero(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
      break;

    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
    else
      break;

    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              false)) {
      return IC.eraseInstFromFunction(II);
    }

    break;
  }
  }
  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image
/// intrinsics.
///
/// The result of simplifying amdgcn image and buffer store intrinsics is to
/// update the definition of the intrinsic's vector argument, not the uses of
/// the result, as is done for image and buffer loads.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
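/// For example, a <4 x float> buffer load whose last two elements are never
/// used is shrunk to a <2 x float> load, with the missing lanes rebuilt as
/// poison by a shufflevector.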
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
                                             : II.getOperand(0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened
        // to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

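    // Rebuild the dmask so it covers only the demanded result elements, e.g.
    // a dmask of 0b0101 with just the first enabled component demanded
    // shrinks to 0b0001.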
    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.popcount();
  if (!NewNumElts)
    return PoisonValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
  }

  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (IsLoad) {
    if (NewNumElts == 1) {
      return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
                                            DemandedElts.countr_zero());
    }

    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(NewLoadIdx++);
      else
        EltMask.push_back(NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

    return Shuffle;
  }

  return NewCall;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
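  // Image intrinsics in the dmask table carry their dmask as operand 0, so
  // they go through the same helper with DMaskIdx = 0.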
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}