1a6bae5cbSJay Foad //===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===// 2a6bae5cbSJay Foad // 3a6bae5cbSJay Foad // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4a6bae5cbSJay Foad // See https://llvm.org/LICENSE.txt for license information. 5a6bae5cbSJay Foad // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6a6bae5cbSJay Foad // 7a6bae5cbSJay Foad //===----------------------------------------------------------------------===// 8a6bae5cbSJay Foad // 9a6bae5cbSJay Foad /// \file 10a6bae5cbSJay Foad /// Implements the GCN specific subclass of TargetSubtarget. 11a6bae5cbSJay Foad // 12a6bae5cbSJay Foad //===----------------------------------------------------------------------===// 13a6bae5cbSJay Foad 14a6bae5cbSJay Foad #include "GCNSubtarget.h" 15a6bae5cbSJay Foad #include "AMDGPUCallLowering.h" 16a6bae5cbSJay Foad #include "AMDGPUInstructionSelector.h" 17a6bae5cbSJay Foad #include "AMDGPULegalizerInfo.h" 18a6bae5cbSJay Foad #include "AMDGPURegisterBankInfo.h" 1903847f19SSergei Barannikov #include "AMDGPUSelectionDAGInfo.h" 20a6bae5cbSJay Foad #include "AMDGPUTargetMachine.h" 21a6bae5cbSJay Foad #include "SIMachineFunctionInfo.h" 22a6bae5cbSJay Foad #include "Utils/AMDGPUBaseInfo.h" 23a6bae5cbSJay Foad #include "llvm/ADT/SmallString.h" 24a6bae5cbSJay Foad #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" 25a6bae5cbSJay Foad #include "llvm/CodeGen/MachineScheduler.h" 26a6bae5cbSJay Foad #include "llvm/CodeGen/TargetFrameLowering.h" 27a6bae5cbSJay Foad #include "llvm/IR/DiagnosticInfo.h" 28a6bae5cbSJay Foad #include "llvm/IR/MDBuilder.h" 29a6bae5cbSJay Foad #include <algorithm> 30a6bae5cbSJay Foad 31a6bae5cbSJay Foad using namespace llvm; 32a6bae5cbSJay Foad 33a6bae5cbSJay Foad #define DEBUG_TYPE "gcn-subtarget" 34a6bae5cbSJay Foad 35a6bae5cbSJay Foad #define GET_SUBTARGETINFO_TARGET_DESC 36a6bae5cbSJay Foad #define GET_SUBTARGETINFO_CTOR 37a6bae5cbSJay Foad #define AMDGPUSubtarget GCNSubtarget 
38a6bae5cbSJay Foad #include "AMDGPUGenSubtargetInfo.inc" 39a6bae5cbSJay Foad #undef AMDGPUSubtarget 40a6bae5cbSJay Foad 41a6bae5cbSJay Foad static cl::opt<bool> EnableVGPRIndexMode( 42a6bae5cbSJay Foad "amdgpu-vgpr-index-mode", 43a6bae5cbSJay Foad cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), 44a6bae5cbSJay Foad cl::init(false)); 45a6bae5cbSJay Foad 46a6bae5cbSJay Foad static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", 47a6bae5cbSJay Foad cl::desc("Enable the use of AA during codegen."), 48a6bae5cbSJay Foad cl::init(true)); 49a6bae5cbSJay Foad 50a6bae5cbSJay Foad static cl::opt<unsigned> 51a6bae5cbSJay Foad NSAThreshold("amdgpu-nsa-threshold", 52a6bae5cbSJay Foad cl::desc("Number of addresses from which to enable MIMG NSA."), 53b3995aa3SJay Foad cl::init(2), cl::Hidden); 54a6bae5cbSJay Foad 55a6bae5cbSJay Foad GCNSubtarget::~GCNSubtarget() = default; 56a6bae5cbSJay Foad 57a6bae5cbSJay Foad GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, 58a6bae5cbSJay Foad StringRef GPU, 59a6bae5cbSJay Foad StringRef FS) { 60a6bae5cbSJay Foad // Determine default and user-specified characteristics 61a6bae5cbSJay Foad // 62a6bae5cbSJay Foad // We want to be able to turn these off, but making this a subtarget feature 63a6bae5cbSJay Foad // for SI has the unhelpful behavior that it unsets everything else if you 64a6bae5cbSJay Foad // disable it. 65a6bae5cbSJay Foad // 66a6bae5cbSJay Foad // Similarly we want enable-prt-strict-null to be on by default and not to 67a6bae5cbSJay Foad // unset everything else if it is disabled 68a6bae5cbSJay Foad 69a6bae5cbSJay Foad SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); 70a6bae5cbSJay Foad 71a6bae5cbSJay Foad // Turn on features that HSA ABI requires. 
Also turn on FlatForGlobal by 72a6bae5cbSJay Foad // default 73a6bae5cbSJay Foad if (isAmdHsaOS()) 74a6bae5cbSJay Foad FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; 75a6bae5cbSJay Foad 76a6bae5cbSJay Foad FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS 77a6bae5cbSJay Foad 78a6bae5cbSJay Foad // Disable mutually exclusive bits. 79a6bae5cbSJay Foad if (FS.contains_insensitive("+wavefrontsize")) { 80a6bae5cbSJay Foad if (!FS.contains_insensitive("wavefrontsize16")) 81a6bae5cbSJay Foad FullFS += "-wavefrontsize16,"; 82a6bae5cbSJay Foad if (!FS.contains_insensitive("wavefrontsize32")) 83a6bae5cbSJay Foad FullFS += "-wavefrontsize32,"; 84a6bae5cbSJay Foad if (!FS.contains_insensitive("wavefrontsize64")) 85a6bae5cbSJay Foad FullFS += "-wavefrontsize64,"; 86a6bae5cbSJay Foad } 87a6bae5cbSJay Foad 88a6bae5cbSJay Foad FullFS += FS; 89a6bae5cbSJay Foad 90a6bae5cbSJay Foad ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); 91a6bae5cbSJay Foad 92a6bae5cbSJay Foad // Implement the "generic" processors, which acts as the default when no 93a6bae5cbSJay Foad // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to 94a6bae5cbSJay Foad // the first amdgcn target that supports flat addressing. Other OSes defaults 95a6bae5cbSJay Foad // to the first amdgcn target. 96a6bae5cbSJay Foad if (Gen == AMDGPUSubtarget::INVALID) { 97a6bae5cbSJay Foad Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS 98a6bae5cbSJay Foad : AMDGPUSubtarget::SOUTHERN_ISLANDS; 99cd20fc07SMatt Arsenault // Assume wave64 for the unknown target, if not explicitly set. 
100cd20fc07SMatt Arsenault if (getWavefrontSizeLog2() == 0) 101cd20fc07SMatt Arsenault WavefrontSizeLog2 = 6; 102cd20fc07SMatt Arsenault } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && 103a6bae5cbSJay Foad !hasFeature(AMDGPU::FeatureWavefrontSize64)) { 104a6bae5cbSJay Foad // If there is no default wave size it must be a generation before gfx10, 105a6bae5cbSJay Foad // these have FeatureWavefrontSize64 in their definition already. For gfx10+ 106a6bae5cbSJay Foad // set wave32 as a default. 107a6bae5cbSJay Foad ToggleFeature(AMDGPU::FeatureWavefrontSize32); 108cd20fc07SMatt Arsenault WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6; 109a6bae5cbSJay Foad } 110a6bae5cbSJay Foad 111a6bae5cbSJay Foad // We don't support FP64 for EG/NI atm. 112a6bae5cbSJay Foad assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); 113a6bae5cbSJay Foad 114a6bae5cbSJay Foad // Targets must either support 64-bit offsets for MUBUF instructions, and/or 115a6bae5cbSJay Foad // support flat operations, otherwise they cannot access a 64-bit global 116a6bae5cbSJay Foad // address space 117a6bae5cbSJay Foad assert(hasAddr64() || hasFlat()); 118a6bae5cbSJay Foad // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets 119a6bae5cbSJay Foad // that do not support ADDR64 variants of MUBUF instructions. Such targets 120a6bae5cbSJay Foad // cannot use a 64 bit offset with a MUBUF instruction to access the global 121a6bae5cbSJay Foad // address space 122a6bae5cbSJay Foad if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { 123a6bae5cbSJay Foad ToggleFeature(AMDGPU::FeatureFlatForGlobal); 124a6bae5cbSJay Foad FlatForGlobal = true; 125a6bae5cbSJay Foad } 126a6bae5cbSJay Foad // Unless +-flat-for-global is specified, use MUBUF instructions for global 127a6bae5cbSJay Foad // address space access if flat operations are not available. 
128a6bae5cbSJay Foad if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { 129a6bae5cbSJay Foad ToggleFeature(AMDGPU::FeatureFlatForGlobal); 130a6bae5cbSJay Foad FlatForGlobal = false; 131a6bae5cbSJay Foad } 132a6bae5cbSJay Foad 133a6bae5cbSJay Foad // Set defaults if needed. 134a6bae5cbSJay Foad if (MaxPrivateElementSize == 0) 135a6bae5cbSJay Foad MaxPrivateElementSize = 4; 136a6bae5cbSJay Foad 137a6bae5cbSJay Foad if (LDSBankCount == 0) 138a6bae5cbSJay Foad LDSBankCount = 32; 139a6bae5cbSJay Foad 1406f956e31SJay Foad if (TT.getArch() == Triple::amdgcn && AddressableLocalMemorySize == 0) 1416f956e31SJay Foad AddressableLocalMemorySize = 32768; 142a6bae5cbSJay Foad 1436f956e31SJay Foad LocalMemorySize = AddressableLocalMemorySize; 144a6bae5cbSJay Foad if (AMDGPU::isGFX10Plus(*this) && 145a6bae5cbSJay Foad !getFeatureBits().test(AMDGPU::FeatureCuMode)) 146a6bae5cbSJay Foad LocalMemorySize *= 2; 147a6bae5cbSJay Foad 148a6bae5cbSJay Foad HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; 149a6bae5cbSJay Foad HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; 150a6bae5cbSJay Foad 151a6bae5cbSJay Foad TargetID.setTargetIDFromFeaturesString(FS); 152a6bae5cbSJay Foad 153a6bae5cbSJay Foad LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " 154a6bae5cbSJay Foad << TargetID.getXnackSetting() << '\n'); 155a6bae5cbSJay Foad LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " 156a6bae5cbSJay Foad << TargetID.getSramEccSetting() << '\n'); 157a6bae5cbSJay Foad 158a6bae5cbSJay Foad return *this; 159a6bae5cbSJay Foad } 160a6bae5cbSJay Foad 161a6bae5cbSJay Foad void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { 162a6bae5cbSJay Foad LLVMContext &Ctx = F.getContext(); 163cd20fc07SMatt Arsenault if (hasFeature(AMDGPU::FeatureWavefrontSize32) && 164a6bae5cbSJay Foad hasFeature(AMDGPU::FeatureWavefrontSize64)) { 165a6bae5cbSJay Foad Ctx.diagnose(DiagnosticInfoUnsupported( 166a6bae5cbSJay Foad F, "must specify exactly one of 
wavefrontsize32 and wavefrontsize64")); 167a6bae5cbSJay Foad } 168a6bae5cbSJay Foad } 169a6bae5cbSJay Foad 170a6bae5cbSJay Foad GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 171a6bae5cbSJay Foad const GCNTargetMachine &TM) 172a6bae5cbSJay Foad : // clang-format off 173a6bae5cbSJay Foad AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), 174a6bae5cbSJay Foad AMDGPUSubtarget(TT), 175a6bae5cbSJay Foad TargetTriple(TT), 176a6bae5cbSJay Foad TargetID(*this), 177a6bae5cbSJay Foad InstrItins(getInstrItineraryForCPU(GPU)), 178a6bae5cbSJay Foad InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), 179a6bae5cbSJay Foad TLInfo(TM, *this), 180a6bae5cbSJay Foad FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { 181a6bae5cbSJay Foad // clang-format on 182a6bae5cbSJay Foad MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); 183a6bae5cbSJay Foad EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); 18403847f19SSergei Barannikov 18503847f19SSergei Barannikov TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>(); 18603847f19SSergei Barannikov 187a6bae5cbSJay Foad CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering()); 188a6bae5cbSJay Foad InlineAsmLoweringInfo = 189a6bae5cbSJay Foad std::make_unique<InlineAsmLowering>(getTargetLowering()); 190a6bae5cbSJay Foad Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM); 191a6bae5cbSJay Foad RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this); 192a6bae5cbSJay Foad InstSelector = 193a6bae5cbSJay Foad std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM); 194a6bae5cbSJay Foad } 195a6bae5cbSJay Foad 19603847f19SSergei Barannikov const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const { 19703847f19SSergei Barannikov return TSInfo.get(); 19803847f19SSergei Barannikov } 19903847f19SSergei Barannikov 200a6bae5cbSJay Foad unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { 201a6bae5cbSJay Foad if 
(getGeneration() < GFX10) 202a6bae5cbSJay Foad return 1; 203a6bae5cbSJay Foad 204a6bae5cbSJay Foad switch (Opcode) { 205a6bae5cbSJay Foad case AMDGPU::V_LSHLREV_B64_e64: 206a6bae5cbSJay Foad case AMDGPU::V_LSHLREV_B64_gfx10: 207a6bae5cbSJay Foad case AMDGPU::V_LSHLREV_B64_e64_gfx11: 208a6bae5cbSJay Foad case AMDGPU::V_LSHLREV_B64_e32_gfx12: 209a6bae5cbSJay Foad case AMDGPU::V_LSHLREV_B64_e64_gfx12: 210a6bae5cbSJay Foad case AMDGPU::V_LSHL_B64_e64: 211a6bae5cbSJay Foad case AMDGPU::V_LSHRREV_B64_e64: 212a6bae5cbSJay Foad case AMDGPU::V_LSHRREV_B64_gfx10: 213a6bae5cbSJay Foad case AMDGPU::V_LSHRREV_B64_e64_gfx11: 214a6bae5cbSJay Foad case AMDGPU::V_LSHRREV_B64_e64_gfx12: 215a6bae5cbSJay Foad case AMDGPU::V_LSHR_B64_e64: 216a6bae5cbSJay Foad case AMDGPU::V_ASHRREV_I64_e64: 217a6bae5cbSJay Foad case AMDGPU::V_ASHRREV_I64_gfx10: 218a6bae5cbSJay Foad case AMDGPU::V_ASHRREV_I64_e64_gfx11: 219a6bae5cbSJay Foad case AMDGPU::V_ASHRREV_I64_e64_gfx12: 220a6bae5cbSJay Foad case AMDGPU::V_ASHR_I64_e64: 221a6bae5cbSJay Foad return 1; 222a6bae5cbSJay Foad } 223a6bae5cbSJay Foad 224a6bae5cbSJay Foad return 2; 225a6bae5cbSJay Foad } 226a6bae5cbSJay Foad 227a6bae5cbSJay Foad /// This list was mostly derived from experimentation. 
228a6bae5cbSJay Foad bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { 229a6bae5cbSJay Foad switch (Opcode) { 230a6bae5cbSJay Foad case AMDGPU::V_CVT_F16_F32_e32: 231a6bae5cbSJay Foad case AMDGPU::V_CVT_F16_F32_e64: 232a6bae5cbSJay Foad case AMDGPU::V_CVT_F16_U16_e32: 233a6bae5cbSJay Foad case AMDGPU::V_CVT_F16_U16_e64: 234a6bae5cbSJay Foad case AMDGPU::V_CVT_F16_I16_e32: 235a6bae5cbSJay Foad case AMDGPU::V_CVT_F16_I16_e64: 236a6bae5cbSJay Foad case AMDGPU::V_RCP_F16_e64: 237a6bae5cbSJay Foad case AMDGPU::V_RCP_F16_e32: 238a6bae5cbSJay Foad case AMDGPU::V_RSQ_F16_e64: 239a6bae5cbSJay Foad case AMDGPU::V_RSQ_F16_e32: 240a6bae5cbSJay Foad case AMDGPU::V_SQRT_F16_e64: 241a6bae5cbSJay Foad case AMDGPU::V_SQRT_F16_e32: 242a6bae5cbSJay Foad case AMDGPU::V_LOG_F16_e64: 243a6bae5cbSJay Foad case AMDGPU::V_LOG_F16_e32: 244a6bae5cbSJay Foad case AMDGPU::V_EXP_F16_e64: 245a6bae5cbSJay Foad case AMDGPU::V_EXP_F16_e32: 246a6bae5cbSJay Foad case AMDGPU::V_SIN_F16_e64: 247a6bae5cbSJay Foad case AMDGPU::V_SIN_F16_e32: 248a6bae5cbSJay Foad case AMDGPU::V_COS_F16_e64: 249a6bae5cbSJay Foad case AMDGPU::V_COS_F16_e32: 250a6bae5cbSJay Foad case AMDGPU::V_FLOOR_F16_e64: 251a6bae5cbSJay Foad case AMDGPU::V_FLOOR_F16_e32: 252a6bae5cbSJay Foad case AMDGPU::V_CEIL_F16_e64: 253a6bae5cbSJay Foad case AMDGPU::V_CEIL_F16_e32: 254a6bae5cbSJay Foad case AMDGPU::V_TRUNC_F16_e64: 255a6bae5cbSJay Foad case AMDGPU::V_TRUNC_F16_e32: 256a6bae5cbSJay Foad case AMDGPU::V_RNDNE_F16_e64: 257a6bae5cbSJay Foad case AMDGPU::V_RNDNE_F16_e32: 258a6bae5cbSJay Foad case AMDGPU::V_FRACT_F16_e64: 259a6bae5cbSJay Foad case AMDGPU::V_FRACT_F16_e32: 260a6bae5cbSJay Foad case AMDGPU::V_FREXP_MANT_F16_e64: 261a6bae5cbSJay Foad case AMDGPU::V_FREXP_MANT_F16_e32: 262a6bae5cbSJay Foad case AMDGPU::V_FREXP_EXP_I16_F16_e64: 263a6bae5cbSJay Foad case AMDGPU::V_FREXP_EXP_I16_F16_e32: 264a6bae5cbSJay Foad case AMDGPU::V_LDEXP_F16_e64: 265a6bae5cbSJay Foad case AMDGPU::V_LDEXP_F16_e32: 266a6bae5cbSJay Foad case 
AMDGPU::V_LSHLREV_B16_e64: 267a6bae5cbSJay Foad case AMDGPU::V_LSHLREV_B16_e32: 268a6bae5cbSJay Foad case AMDGPU::V_LSHRREV_B16_e64: 269a6bae5cbSJay Foad case AMDGPU::V_LSHRREV_B16_e32: 270a6bae5cbSJay Foad case AMDGPU::V_ASHRREV_I16_e64: 271a6bae5cbSJay Foad case AMDGPU::V_ASHRREV_I16_e32: 272a6bae5cbSJay Foad case AMDGPU::V_ADD_U16_e64: 273a6bae5cbSJay Foad case AMDGPU::V_ADD_U16_e32: 274a6bae5cbSJay Foad case AMDGPU::V_SUB_U16_e64: 275a6bae5cbSJay Foad case AMDGPU::V_SUB_U16_e32: 276a6bae5cbSJay Foad case AMDGPU::V_SUBREV_U16_e64: 277a6bae5cbSJay Foad case AMDGPU::V_SUBREV_U16_e32: 278a6bae5cbSJay Foad case AMDGPU::V_MUL_LO_U16_e64: 279a6bae5cbSJay Foad case AMDGPU::V_MUL_LO_U16_e32: 280a6bae5cbSJay Foad case AMDGPU::V_ADD_F16_e64: 281a6bae5cbSJay Foad case AMDGPU::V_ADD_F16_e32: 282a6bae5cbSJay Foad case AMDGPU::V_SUB_F16_e64: 283a6bae5cbSJay Foad case AMDGPU::V_SUB_F16_e32: 284a6bae5cbSJay Foad case AMDGPU::V_SUBREV_F16_e64: 285a6bae5cbSJay Foad case AMDGPU::V_SUBREV_F16_e32: 286a6bae5cbSJay Foad case AMDGPU::V_MUL_F16_e64: 287a6bae5cbSJay Foad case AMDGPU::V_MUL_F16_e32: 288a6bae5cbSJay Foad case AMDGPU::V_MAX_F16_e64: 289a6bae5cbSJay Foad case AMDGPU::V_MAX_F16_e32: 290a6bae5cbSJay Foad case AMDGPU::V_MIN_F16_e64: 291a6bae5cbSJay Foad case AMDGPU::V_MIN_F16_e32: 292a6bae5cbSJay Foad case AMDGPU::V_MAX_U16_e64: 293a6bae5cbSJay Foad case AMDGPU::V_MAX_U16_e32: 294a6bae5cbSJay Foad case AMDGPU::V_MIN_U16_e64: 295a6bae5cbSJay Foad case AMDGPU::V_MIN_U16_e32: 296a6bae5cbSJay Foad case AMDGPU::V_MAX_I16_e64: 297a6bae5cbSJay Foad case AMDGPU::V_MAX_I16_e32: 298a6bae5cbSJay Foad case AMDGPU::V_MIN_I16_e64: 299a6bae5cbSJay Foad case AMDGPU::V_MIN_I16_e32: 300a6bae5cbSJay Foad case AMDGPU::V_MAD_F16_e64: 301a6bae5cbSJay Foad case AMDGPU::V_MAD_U16_e64: 302a6bae5cbSJay Foad case AMDGPU::V_MAD_I16_e64: 303a6bae5cbSJay Foad case AMDGPU::V_FMA_F16_e64: 304a6bae5cbSJay Foad case AMDGPU::V_DIV_FIXUP_F16_e64: 305a6bae5cbSJay Foad // On gfx10, all 16-bit instructions preserve 
the high bits. 306a6bae5cbSJay Foad return getGeneration() <= AMDGPUSubtarget::GFX9; 307a6bae5cbSJay Foad case AMDGPU::V_MADAK_F16: 308a6bae5cbSJay Foad case AMDGPU::V_MADMK_F16: 309a6bae5cbSJay Foad case AMDGPU::V_MAC_F16_e64: 310a6bae5cbSJay Foad case AMDGPU::V_MAC_F16_e32: 311a6bae5cbSJay Foad case AMDGPU::V_FMAMK_F16: 312a6bae5cbSJay Foad case AMDGPU::V_FMAAK_F16: 313a6bae5cbSJay Foad case AMDGPU::V_FMAC_F16_e64: 314a6bae5cbSJay Foad case AMDGPU::V_FMAC_F16_e32: 315a6bae5cbSJay Foad // In gfx9, the preferred handling of the unused high 16-bits changed. Most 316a6bae5cbSJay Foad // instructions maintain the legacy behavior of 0ing. Some instructions 317a6bae5cbSJay Foad // changed to preserving the high bits. 318a6bae5cbSJay Foad return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; 319a6bae5cbSJay Foad case AMDGPU::V_MAD_MIXLO_F16: 320a6bae5cbSJay Foad case AMDGPU::V_MAD_MIXHI_F16: 321a6bae5cbSJay Foad default: 322a6bae5cbSJay Foad return false; 323a6bae5cbSJay Foad } 324a6bae5cbSJay Foad } 325a6bae5cbSJay Foad 326a6bae5cbSJay Foad void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 327a6bae5cbSJay Foad unsigned NumRegionInstrs) const { 328a6bae5cbSJay Foad // Track register pressure so the scheduler can try to decrease 329a6bae5cbSJay Foad // pressure once register usage is above the threshold defined by 330a6bae5cbSJay Foad // SIRegisterInfo::getRegPressureSetLimit() 331a6bae5cbSJay Foad Policy.ShouldTrackPressure = true; 332a6bae5cbSJay Foad 333a6bae5cbSJay Foad // Enabling both top down and bottom up scheduling seems to give us less 334a6bae5cbSJay Foad // register spills than just using one of these approaches on its own. 335a6bae5cbSJay Foad Policy.OnlyTopDown = false; 336a6bae5cbSJay Foad Policy.OnlyBottomUp = false; 337a6bae5cbSJay Foad 338a6bae5cbSJay Foad // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 
339a6bae5cbSJay Foad if (!enableSIScheduler()) 340a6bae5cbSJay Foad Policy.ShouldTrackLaneMasks = true; 341a6bae5cbSJay Foad } 342a6bae5cbSJay Foad 343a6bae5cbSJay Foad void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { 344a6bae5cbSJay Foad if (isWave32()) { 345a6bae5cbSJay Foad // Fix implicit $vcc operands after MIParser has verified that they match 346a6bae5cbSJay Foad // the instruction definitions. 347a6bae5cbSJay Foad for (auto &MBB : MF) { 348a6bae5cbSJay Foad for (auto &MI : MBB) 349a6bae5cbSJay Foad InstrInfo.fixImplicitOperands(MI); 350a6bae5cbSJay Foad } 351a6bae5cbSJay Foad } 352a6bae5cbSJay Foad } 353a6bae5cbSJay Foad 354a6bae5cbSJay Foad bool GCNSubtarget::hasMadF16() const { 355a6bae5cbSJay Foad return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; 356a6bae5cbSJay Foad } 357a6bae5cbSJay Foad 358a6bae5cbSJay Foad bool GCNSubtarget::useVGPRIndexMode() const { 359b02b5b7bSJay Foad return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode); 360a6bae5cbSJay Foad } 361a6bae5cbSJay Foad 362a6bae5cbSJay Foad bool GCNSubtarget::useAA() const { return UseAA; } 363a6bae5cbSJay Foad 364a6bae5cbSJay Foad unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 365a6bae5cbSJay Foad return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), 366a6bae5cbSJay Foad getGeneration()); 367a6bae5cbSJay Foad } 368a6bae5cbSJay Foad 369a6bae5cbSJay Foad unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { 370a6bae5cbSJay Foad return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); 371a6bae5cbSJay Foad } 372a6bae5cbSJay Foad 373a6bae5cbSJay Foad unsigned 374a6bae5cbSJay Foad GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { 375a6bae5cbSJay Foad if (getGeneration() >= AMDGPUSubtarget::GFX10) 376a6bae5cbSJay Foad return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. 
377a6bae5cbSJay Foad 378a6bae5cbSJay Foad if (HasFlatScratch || HasArchitectedFlatScratch) { 379a6bae5cbSJay Foad if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 380a6bae5cbSJay Foad return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 381a6bae5cbSJay Foad if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) 382a6bae5cbSJay Foad return 4; // FLAT_SCRATCH, VCC (in that order). 383a6bae5cbSJay Foad } 384a6bae5cbSJay Foad 385a6bae5cbSJay Foad if (isXNACKEnabled()) 386a6bae5cbSJay Foad return 4; // XNACK, VCC (in that order). 387a6bae5cbSJay Foad return 2; // VCC. 388a6bae5cbSJay Foad } 389a6bae5cbSJay Foad 390a6bae5cbSJay Foad unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { 391a6bae5cbSJay Foad const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 392a6bae5cbSJay Foad return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); 393a6bae5cbSJay Foad } 394a6bae5cbSJay Foad 395a6bae5cbSJay Foad unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { 396a6bae5cbSJay Foad // In principle we do not need to reserve SGPR pair used for flat_scratch if 397a6bae5cbSJay Foad // we know flat instructions do not access the stack anywhere in the 398a6bae5cbSJay Foad // program. For now assume it's needed if we have flat instructions. 
399a6bae5cbSJay Foad const bool KernelUsesFlatScratch = hasFlatAddressSpace(); 400a6bae5cbSJay Foad return getBaseReservedNumSGPRs(KernelUsesFlatScratch); 401a6bae5cbSJay Foad } 402a6bae5cbSJay Foad 403*6206f544SLucas Ramirez std::pair<unsigned, unsigned> 404*6206f544SLucas Ramirez GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, 405*6206f544SLucas Ramirez unsigned NumSGPRs, unsigned NumVGPRs) const { 406*6206f544SLucas Ramirez auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); 407*6206f544SLucas Ramirez unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); 408*6206f544SLucas Ramirez unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); 409*6206f544SLucas Ramirez 410*6206f544SLucas Ramirez // Maximum occupancy may be further limited by high SGPR/VGPR usage. 411*6206f544SLucas Ramirez MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc)); 412*6206f544SLucas Ramirez return {std::min(MinOcc, MaxOcc), MaxOcc}; 413a6bae5cbSJay Foad } 414a6bae5cbSJay Foad 415a6bae5cbSJay Foad unsigned GCNSubtarget::getBaseMaxNumSGPRs( 416a6bae5cbSJay Foad const Function &F, std::pair<unsigned, unsigned> WavesPerEU, 417a6bae5cbSJay Foad unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { 418a6bae5cbSJay Foad // Compute maximum number of SGPRs function can use using default/requested 419a6bae5cbSJay Foad // minimum number of waves per execution unit. 420a6bae5cbSJay Foad unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); 421a6bae5cbSJay Foad unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); 422a6bae5cbSJay Foad 423a6bae5cbSJay Foad // Check if maximum number of SGPRs was explicitly requested using 424a6bae5cbSJay Foad // "amdgpu-num-sgpr" attribute. 
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    // Requested == 0 acts as a sentinel for "attribute absent or unusable";
    // MaxNumSGPRs is used as the fallback default while parsing.
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // A request that does not even cover the reserved SGPRs is discarded.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    // Only honor the attribute if it survived all of the clamps above.
    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Subtargets with the SGPR-init hardware bug must assume a fixed SGPR
  // allocation regardless of what was computed above.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

/// Maximum number of SGPRs available to \p MF, derived from the function's
/// waves/EU range, the SGPRs already consumed by preloaded user/system
/// arguments, and the subtarget's reserved SGPRs.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

/// Worst-case count of preloaded SGPRs: assumes every user SGPR field, every
/// system SGPR, and the synthetic LDS-kernel-id SGPR are all present. Used
/// when no MachineFunction (and thus no precise SIMachineFunctionInfo) is
/// available yet.
static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

/// IR-only (pre-codegen) variant: like the MachineFunction overload, but has
/// to assume the pessimistic preloaded-SGPR count from
/// getMaxNumPreloadedSGPRs().
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    // NOTE(review): on GFX90A-class subtargets the attribute value is doubled
    // before clamping — presumably because the attribute is expressed in
    // pre-GFX90A VGPR granularity; confirm against the attribute's
    // documentation.
    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // Requested == 0 means "ignore the attribute".
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

/// Maximum number of VGPRs for \p F based solely on its IR-level waves/EU
/// request.
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

/// Maximum number of VGPRs for \p MF, using the waves/EU range recorded in
/// SIMachineFunctionInfo.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

/// Fix up the latency of data dependencies that involve instruction bundles,
/// which the generic scheduler models as a single node.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only physical-register data dependencies between real instructions are
  // adjusted here.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is a bundle header: walk the bundled instructions and take the
    // latency of the LAST one that writes Reg, decremented once per bundled
    // instruction issued after it, so the edge reflects latency measured from
    // the end of the bundle.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle header: start from the def's full latency and
    // subtract one cycle for each bundled instruction issued before the first
    // reader of Reg inside the bundle (saturating at zero).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

/// Minimum number of address registers at which MIMG instructions switch to
/// the NSA (non-sequential address) encoding for \p MF. The command-line
/// option takes precedence over the "amdgpu-nsa-threshold" attribute; both
/// are clamped to at least 2.
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  // An explicitly passed -amdgpu-nsa-threshold=N option wins.
  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  // Fall back to the option's default value.
  return NSAThreshold;
}

/// Determine which user/system SGPR inputs \p F requires (driven by calling
/// convention, subtarget features, and the "amdgpu-no-*" attributes) and
/// total up the user SGPRs they consume.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  // Kernels need the kernarg segment pointer if they take any explicit or
  // implicit arguments.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Compute-only inputs; each can be opted out via its "amdgpu-no-*"
  // attribute.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  // Accumulate the SGPR cost of every enabled field.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

/// Reserve \p NumSGPRs additional user SGPRs for kernarg preloading. The
/// caller must ensure (asserted here) the subtarget's user-SGPR budget is not
/// exceeded.
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

/// Number of user SGPRs still unallocated on this subtarget.
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}