//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific parts of InstCombine. It uses the
// target's detailed information to fold and simplify calls to AMDGPU
// intrinsics, while letting the target-independent InstCombine transforms
// handle everything else.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
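// Illustrative note (not from the original comments): if V was matched as
// 'fpext half %h to float' or 'zext i16 %v to i32', the original 16-bit
// operand is returned directly; other integer or FP values are narrowed with
// a trunc/fptrunc created by the builder.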
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates an intrinsic call with
/// the modified arguments (based on OldIntr), and replaces InstToReplace with
/// the newly created intrinsic call.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of the image intrinsic is an fptrunc to half, replace
      // both the fptrunc and the image intrinsic with an image intrinsic that
      // has the D16 flag set.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change the return type of the image
                                       // intrinsic to the return type of the
                                       // fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means only derivatives can be converted to 16-bit, not coordinates
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
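    // Illustrative example (not exhaustive): a coordinate produced by
    // 'fpext half %h to float' (or 'zext i16 %i to i32' when there is no
    // sampler) qualifies, while an arbitrary 32-bit value does not.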
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
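    // Illustrative example: 'call float @llvm.amdgcn.rcp(float undef)' is
    // folded to a quiet NaN constant below, and a constant argument is folded
    // to its reciprocal.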
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & fcAllFlags) == fcAllFlags) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & fcAllFlags) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == fcNan && !II.isStrictFP()) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == fcZero && !II.isStrictFP()) {
      // Equivalent of == 0.
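      // Illustrative example: llvm.amdgcn.class(float %x, i32 96 /* fcZero,
      // i.e. fcPosZero | fcNegZero */) becomes 'fcmp oeq float %x, 0.0'.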
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & fcAllFlags) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
        ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
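    // Illustrative example (i32): ubfe(%x, 8, 8) becomes
    // 'lshr (shl %x, 16), 24'; sbfe uses ashr for the sign-extending form.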
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // values since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
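    // Illustrative example: fmed3(NaN, %a, %b) folds to minnum(%a, %b) below;
    // an undef operand is treated the same way as a NaN operand.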
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
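      // Illustrative example: llvm.amdgcn.icmp(i32 7, i32 %x, slt) becomes
      // llvm.amdgcn.icmp(i32 %x, i32 7, sgt), using the swapped predicate.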
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
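    // As a conservative approximation (illustrative), only sources defined in
    // the same basic block as the intrinsic are considered here.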
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
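    // Illustrative example: fmul_legacy(%x, 2.0) can become 'fmul %x, 2.0',
    // since 2.0 is known to be finite and non-zero.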
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
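        // Illustrative note: ActiveBits == 4 with one unused leading component
        // is exactly the case where trimming would produce a vec3 load.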
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy = (NewNumElts == 1) ?
      EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}