//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific InstCombine simplifications,
// reached through the target's TargetTransformInfo instCombineIntrinsic
// hooks. It uses detailed target information to fold AMDGPU intrinsics,
// while the target-independent InstCombine handles everything else.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
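// Assumes the caller has already checked canSafelyConvertTo16Bit: extensions
// are stripped back to their 16-bit source, and anything else is narrowed
// with a plain integer or FP cast.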
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of the image intrinsic is a fptrunc to half, replace
      // both the fptrunc and the image intrinsic with an image intrinsic
      // carrying the D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
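    // If the failing operand is a gradient, or the instruction has no
    // separate gradient operands (GradientStart == CoordStart), there is
    // nothing left that could still be shrunk, so give up entirely.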
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
  if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
      isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

/// Match an fpext from half to float, or a constant we can convert.
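/// Returns the half-typed source of a single-use fpext, or an equivalent f16
/// constant when the conversion is exact; returns nullptr otherwise.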
static Value *matchFPExtFromF16(Value *Arg) {
  Value *Src = nullptr;
  ConstantFP *CFP = nullptr;
  if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
    if (Src->getType()->isHalfTy())
      return Src;
  } else if (match(Arg, m_ConstantFP(CFP))) {
    bool LosesInfo;
    APFloat Val(CFP->getValueAPF());
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (!LosesInfo)
      return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
  }
  return nullptr;
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with known elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
                                       Instruction *I) {
  auto *VTy = cast<FixedVectorType>(UseV->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);

  for (int i = VWidth - 1; i > 0; --i) {
    auto *Elt = findScalarElement(UseV, i);
    if (!Elt)
      break;

    if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
      if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
        break;
    } else {
      break;
    }

    DemandedElts.clearBit(i);
  }

  return DemandedElts;
}

// Trim elements from the end of the vector \p V if they are equal to the
// first element of the vector.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  Value *FirstComponent = findScalarElement(V, 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
    SVI->getShuffleMask(ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      auto *Elt = findScalarElement(V, I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(I);
  }

  return DemandedElts;
}

static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);

/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
  return (SqrtOp->getType()->isFloatTy() &&
          (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
         SqrtOp->getType()->isHalfTy();
}

/// Return true if we can easily prove that use U is uniform.
static bool isTriviallyUniform(const Use &U) {
  Value *V = U.get();
  if (isa<Constant>(V))
    return true;
  if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
    if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
      return false;
    // If II and U are in different blocks then there is a possibility of
    // temporal divergence.
    return II->getParent() == cast<Instruction>(U.getUser())->getParent();
  }
  return false;
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
    if (!FMF.allowContract())
      break;
    auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
    if (!SrcCI)
      break;

    auto IID = SrcCI->getIntrinsicID();
    // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if
    // contractable
    //
    // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
    // relaxed.
    if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
      const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
      FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
      if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
        break;

      if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
        break;

      Function *NewDecl = Intrinsic::getOrInsertDeclaration(
          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});

      InnerFMF |= FMF;
      II.setFastMathFlags(InnerFMF);

      II.setCalledFunction(NewDecl);
      return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // f16 amdgcn.sqrt is identical to regular sqrt.
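    // Canonicalize it to llvm.sqrt so the generic folds can take over.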
    if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
      Function *NewDecl = Intrinsic::getOrInsertDeclaration(
          II.getModule(), Intrinsic::sqrt, {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2: {
    const bool IsLog = IID == Intrinsic::amdgcn_log;
    const bool IsExp = IID == Intrinsic::amdgcn_exp2;
    Value *Src = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    if (IC.getSimplifyQuery().isUndefValue(Src))
      return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

    if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      if (C->isInfinity()) {
        // exp2(+inf) -> +inf
        // log2(+inf) -> +inf
        if (!C->isNegative())
          return IC.replaceInstUsesWith(II, C);

        // exp2(-inf) -> 0
        if (IsExp && C->isNegative())
          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
      }

      if (II.isStrictFP())
        break;

      if (C->isNaN()) {
        Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
        return IC.replaceInstUsesWith(II, Quieted);
      }

      // f32 instruction doesn't handle denormals, f16 does.
      if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
        Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
                                      : ConstantFP::get(Ty, 1.0);
        return IC.replaceInstUsesWith(II, FoldedValue);
      }

      if (IsLog && C->isNegative())
        return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

      // TODO: Full constant folding matching hardware behavior.
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (CMask) {
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          II.getModule(), Intrinsic::is_fpclass, Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(1, ConstantInt::get(Src1->getType(),
                                           CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(Src1))
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          Src1, ConstantInt::getNullValue(Src1->getType()));
      return IC.replaceInstUsesWith(II, CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
      Type *HalfTy = Type::getHalfTy(Arg->getContext());

      if (isa<PoisonValue>(Arg))
        return PoisonValue::get(HalfTy);
      if (isa<UndefValue>(Arg))
        return UndefValue::get(HalfTy);

      ConstantFP *CFP = nullptr;
      if (match(Arg, m_ConstantFP(CFP))) {
        bool LosesInfo;
        APFloat Val(CFP->getValueAPF());
        Val.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
        return ConstantFP::get(HalfTy, Val);
      }

      Value *Src = nullptr;
      if (match(Arg, m_FPExt(m_Value(Src)))) {
        if (Src->getType()->isHalfTy())
          return Src;
      }

      return nullptr;
    };

    if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
      if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
        Value *V = PoisonValue::get(II.getType());
        V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
        V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
        return IC.replaceInstUsesWith(II, V);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would become poison
    // since the shift amount would equal the bit size.
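    // The decomposition below turns e.g. i32 ubfe(x, 8, 8) into
    // (x << 16) >> 24 (lshr), and ubfe(x, 24, 8) into x >> 24 since that
    // field already ends at the top bit.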
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    Value *V = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      V = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      V = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      V = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (V) {
      if (auto *CI = dyn_cast<CallInst>(V)) {
        CI->copyFastMathFlags(&II);
        CI->takeName(&II);
      }
      return IC.replaceInstUsesWith(II, V);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    if (!ST->hasMed3_16())
      break;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (Value *X = matchFPExtFromF16(Src0)) {
      if (Value *Y = matchFPExtFromF16(Src1)) {
        if (Value *Z = matchFPExtFromF16(Src2)) {
          Value *NewCall = IC.Builder.CreateIntrinsic(
              IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
          return new FPExtInst(NewCall, II.getType());
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, IC.Builder.CreateSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
                                                       II.getType(), Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateIntrinsic(
          NewIID, {II.getType(), SrcLHS->getType()}, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_mbcnt_hi: {
    // exec_hi is all 0, so this is just a copy.
    if (ST->isWave32())
      return IC.replaceInstUsesWith(II, II.getArgOperand(1));
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }
    }
    if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
      // %b64 = call i64 ballot.i64(...)
      // =>
      // %b32 = call i32 ballot.i32(...)
      // %b64 = zext i32 %b32 to i64
      Value *Call = IC.Builder.CreateZExt(
          IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
                                     {IC.Builder.getInt32Ty()},
                                     {II.getArgOperand(0)}),
          II.getType());
      Call->takeName(&II);
      return IC.replaceInstUsesWith(II, Call);
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
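    // (Every lane of a quad sees the same constant, so the vote result is the
    // constant itself.)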
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4  /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5.
    // For permlane16_var and permlanex16_var it should be 4.
    unsigned int BcIdx = FiIdx + 1;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // If the first argument is uniform these intrinsics return it unchanged.
    const Use &Src = II.getArgOperandUse(0);
    if (isTriviallyUniform(Src))
      return IC.replaceInstUsesWith(II, Src.get());
    break;
  }
  case Intrinsic::amdgcn_trig_preop: {
    // The intrinsic is declared with name mangling, but currently the
    // instruction only exists for f64.
    if (!II.getType()->isDoubleTy())
      break;

    Value *Src = II.getArgOperand(0);
    Value *Segment = II.getArgOperand(1);
    if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    if (isa<UndefValue>(Src)) {
      auto *QNaN = ConstantFP::get(
          II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
    if (!Csrc)
      break;

    if (II.isStrictFP())
      break;

    const APFloat &Fsrc = Csrc->getValueAPF();
    if (Fsrc.isNaN()) {
      auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
      return IC.replaceInstUsesWith(II, Quieted);
    }

    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
    if (!Cseg)
      break;

    unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
    unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
    unsigned Shift = SegmentVal * 53;
    if (Exponent > 1077)
      Shift += Exponent - 1077;

    // 2.0/PI table.
    static const uint32_t TwoByPi[] = {
        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
        0x56033046};

    // Return 0 for an out-of-bounds segment (hardware behavior).
    unsigned Idx = Shift >> 5;
    if (Idx + 2 >= std::size(TwoByPi)) {
      APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
    }

    unsigned BShift = Shift & 0x1f;
    uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
    uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
    if (BShift)
      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
    Thi = Thi >> 11;
    APFloat Result = APFloat((double)Thi);

    int Scale = -53 - Shift;
    if (Exponent >= 1968)
      Scale += 128;

    Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getZero(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
      break;

    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
    else
      break;

    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              false)) {
      return IC.eraseInstFromFunction(II);
    }

    break;
  }
  }
  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// For amdgcn image and buffer store intrinsics, simplification rewrites the
/// definition of the stored vector operand; for image and buffer loads it
/// narrows the loaded result and its uses instead.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
                                             : II.getOperand(0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are unused components at the front (trailing
    // zero bits in the mask) and update the offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.popcount();
  if (!NewNumElts)
    return PoisonValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
  }

  CallInst *NewCall =
      IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (IsLoad) {
    if (NewNumElts == 1) {
      return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
                                            DemandedElts.countr_zero());
    }

    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(NewLoadIdx++);
      else
        EltMask.push_back(NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

    return Shuffle;
  }

  return NewCall;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}