//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));
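
// For experimentation, the cl::opt knobs above are ordinary LLVM command-line
// options and can be overridden, e.g. directly on opt/llc as
// -amdgpu-unroll-threshold-private=3000, or from clang via
// -mllvm -amdgpu-unroll-threshold-private=3000 (the value here is
// illustrative, not a tuned recommendation).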

// This default unroll factor is based on microbenchmarks on gfx1030.
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering memcpy as a loop"),
    cl::init(16), cl::Hidden);

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }
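
  // A sketch of the loop metadata shape this code expects, attached to the
  // loop latch branch (node numbering is illustrative):
  //   br i1 %cond, label %body, label %exit, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}
  // The per-function "amdgpu-unroll-threshold" attribute read above serves the
  // same purpose at function granularity.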

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, as most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for a better cost estimation in unrolling.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}

bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  return !F || !ST->isSingleLaneExecution(*F);
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;

  return (ElemWidth == 16 && ST->has16BitInsts())     ? 2
         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
                                                       : 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}

// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  Align MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by
  // the hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == Align(2))
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses.
  // If the operation has a fixed known length that is large enough, it is
  // worthwhile to return an even wider type and let legalization lower it into
  // multiple accesses, effectively unrolling the memcpy loop. Private memory
  // also hits this, although accesses may be decomposed.
  //
  // Don't unroll if Length is not a constant, since unrolling leads to worse
  // performance for length values that are smaller or slightly larger than the
  // total size of the type returned here. Mitigating that would require a more
  // complex lowering for variable-length memcpy and memmove.
  unsigned I32EltsInVector = 4;
  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
    return FixedVectorType::get(Type::getInt32Ty(Context),
                                MemcpyLoopUnroll * I32EltsInVector);

  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
}

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {

  if (AtomicCpySize)
    BaseT::getMemcpyLoopResidualLoweringType(
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  Align MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != Align(2)) {
    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
    while (RemainingBytes >= 16) {
      OpsOut.push_back(I32x4Ty);
      RemainingBytes -= 16;
    }

    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}
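
// Illustrative examples of the two routines above (assuming no atomic element
// size): a copy whose smaller alignment is exactly 2 is lowered with i16
// accesses; an LDS/region copy uses <2 x i32>; a global-to-global memcpy with
// a constant length uses <64 x i32> (the default amdgpu-memcpy-loop-unroll of
// 16 times 4 dwords), while a variable-length one uses <4 x i32>. For a
// residual of 15 bytes with alignment >= 4, the emitted type sequence is
// i64, i32, i16, i8.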

unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF.isScalar())
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
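
// Illustrative example of the ISD::FMUL special case above: in
//   %m = fmul contract float %a, %b
//   %r = fadd contract float %m, %c
// the fmul is costed as TCC_Free, on the assumption that it will be folded
// into a single fma/mad together with its only user.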

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::copysign:
  case Intrinsic::canonicalize:
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::abs:
    return true;
  default:
    return false;
  }
}

InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
      InstRate = getFullRateInstrCost();
    else {
      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                     : getQuarterRateInstrCost(CostKind);
    }
    break;
  case Intrinsic::copysign:
    return NElts * getFullRateInstrCost();
  case Intrinsic::canonicalize: {
    assert(SLT != MVT::f64);
    InstRate = getFullRateInstrCost();
    break;
  }
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat: {
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = getFullRateInstrCost();

    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }
  case Intrinsic::abs:
    // Expansion takes 2 instructions for VALU
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = 2 * getFullRateInstrCost();
    break;
  default:
    break;
  }

  return LT.first * NElts * InstRate;
}

InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    const auto *BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Assume a conditional branch takes 3 additional exec manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    const auto *SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
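
// Worked example for the Switch case above: with a throughput/latency-style
// cost kind (SCost == false, so CBrCost == 7), a switch with three cases is
// estimated as (3 + 1) * (7 + 1) = 32.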

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {
  Metadata *MD =
      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
  StringRef RegName =
      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();

  // Special case registers that look like VCC.
  MVT VT = MVT::getVT(ReadReg->getType());
  if (VT == MVT::i1)
    return true;

  // Special case scalar registers that start with 'v'.
  if (RegName.starts_with("vcc") || RegName.empty())
    return false;

  // VGPR or AGPR is divergent. There aren't any specially named vector
  // registers.
  return RegName[0] == 'v' || RegName[0] == 'a';
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
      return isReadRegisterSourceOfDivergence(Intrinsic);

    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
  }

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into the same wave, which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only; it should also be possible to perform
  // this optimization if the size of the X dimension is a power of 2, we just
  // do not currently have the infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    const DataLayout &DL = F->getDataLayout();
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}
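
// Illustrative IR for the wave-ID patterns recognized above (names are
// hypothetical): on a wave64 target, in a kernel whose Y and Z dimensions are
// known to be 1,
//   %tid  = call i32 @llvm.amdgcn.workitem.id.x()
//   %wave = lshr i32 %tid, 6
// is treated as always uniform, since every lane of a wave computes the same
// %wave value; the same holds for masking %tid with a value whose low 6 bits
// are known to be zero.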

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the
      // new address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
      return nullptr;
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}
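
// Sketch of the rewrite above once address-space inference has refined the
// pointer (IR is illustrative): `%r = call i1 @llvm.amdgcn.is.shared(ptr %p)`
// folds to `i1 true` when %p is known to be an addrspace(3) LDS pointer, and
// to `i1 false` for any other inferred address space.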

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) {
  if (!isa<FixedVectorType>(VT))
    return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);

  Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);

  // Larger vector widths may require additional instructions, but are
  // typically cheaper than scalarized versions.
  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
    bool HasVOP3P = ST->hasVOP3PInsts();
    unsigned RequestedElts =
        count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
    if (RequestedElts == 0)
      return 0;
    switch (Kind) {
    case TTI::SK_Broadcast:
    case TTI::SK_Reverse:
    case TTI::SK_PermuteSingleSrc: {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle of two elements is free.
      if (HasVOP3P && NumVectorElts == 2)
        return 0;
      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
      // SK_Broadcast just reuses the same mask
      unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
      return NumPerms + NumPermMasks;
    }
    case TTI::SK_ExtractSubvector:
    case TTI::SK_InsertSubvector: {
      // Even aligned accesses are free
      if (!(Index % 2))
        return 0;
      // Insert/extract subvectors only require shifts / extract code to get
      // the relevant bits
      return alignTo(RequestedElts, 2) / 2;
    }
    case TTI::SK_PermuteTwoSrc:
    case TTI::SK_Splice:
    case TTI::SK_Select: {
      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
      // SK_Select just reuses the same mask
      unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
      return NumPerms + NumPermMasks;
    }

    default:
      break;
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}

/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
      = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
      = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // A single-BB callee does not increase the total BB count.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}

static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          CB->getContext(), CB->getCallingConv(), ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  // 1 instruction to put a function argument on the stack in the caller.
  // 1 instruction to take a function argument from the stack in the callee.
  // 1 instruction to explicitly take care of data dependencies in the callee
  // function.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs.
  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}

static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
                                           const DataLayout &DL) {
  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // This function calculates the total size in bytes of the memory that would
  // end up in scratch if the call was not inlined.
  unsigned AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty)
      continue;

    unsigned AddrSpace = Ty->getAddressSpace();
    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
      continue;

    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
      continue;

    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
  }
  return AllocaSize;
}

int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
  return BaseT::getInliningLastCallToStaticBonus() *
         getInliningThresholdMultiplier();
}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);

  // Private objects passed as arguments may end up in scratch usage if the
  // call is not inlined. Increase the inline threshold to promote inlining.
  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize > 0)
    Threshold += ArgAllocaCost;
  return Threshold;
}

unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized away.
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object
  // depending on its size. If the array can be optimized by SROA this cost is
  // not added to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the allocas such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by the threshold
  // multiplier, the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());

  // Attribute the bonus proportionally to the alloca size
  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

  return AllocaThresholdBonus;
}
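
// Worked example for getCallerAllocaCost (all numbers illustrative, not the
// target's actual tuning): if ArgAllocaCost, the threshold multiplier and the
// single-BB bonus combine to a Threshold of 66000, then a 128-byte alloca out
// of 512 total bytes of passed-in allocas receives a bonus of
// 66000 * 128 / 512 = 16500.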

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}

std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
  auto Size = DL.getTypeSizeInBits(Ty);
  // Maximum load or store can handle 8 dwords for scalar and 4 for
  // vector ALU. Let's assume anything above 8 dwords is expensive
  // even if legal.
  if (Size <= 256)
    return Cost;

  Cost.first += (Size + 255) / 256;
  return Cost;
}

unsigned GCNTTIImpl::getPrefetchDistance() const {
  return ST->hasPrefetch() ? 128 : 0;
}

bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}

void GCNTTIImpl::collectKernelLaunchBounds(
    const Function &F,
    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
  SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
  LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
  LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
  LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
  std::pair<unsigned, unsigned> FlatWorkGroupSize =
      ST->getFlatWorkGroupSizes(F);
  LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
  LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
  std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
  LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
  LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
}