//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size, it must be a generation before gfx10;
    // those generations already have FeatureWavefrontSize64 in their
    // definition. For gfx10+ set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing the high bits; some
    // instructions changed to preserving them.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs.
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs.
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs.
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have an analysis or something rather than an attribute to
  // detect calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack; we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}