//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
    cl::desc("Number of addresses from which to enable MIMG NSA."),
NSA."), 61bdd1243dSDimitry Andric cl::init(3), cl::Hidden); 62bdd1243dSDimitry Andric 630b57cec5SDimitry Andric GCNSubtarget::~GCNSubtarget() = default; 640b57cec5SDimitry Andric 650b57cec5SDimitry Andric GCNSubtarget & 660b57cec5SDimitry Andric GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, 670b57cec5SDimitry Andric StringRef GPU, StringRef FS) { 680b57cec5SDimitry Andric // Determine default and user-specified characteristics 690b57cec5SDimitry Andric // 700b57cec5SDimitry Andric // We want to be able to turn these off, but making this a subtarget feature 710b57cec5SDimitry Andric // for SI has the unhelpful behavior that it unsets everything else if you 720b57cec5SDimitry Andric // disable it. 730b57cec5SDimitry Andric // 740b57cec5SDimitry Andric // Similarly we want enable-prt-strict-null to be on by default and not to 750b57cec5SDimitry Andric // unset everything else if it is disabled 760b57cec5SDimitry Andric 77e8d8bef9SDimitry Andric SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); 780b57cec5SDimitry Andric 79e8d8bef9SDimitry Andric // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default 80e8d8bef9SDimitry Andric if (isAmdHsaOS()) 81e8d8bef9SDimitry Andric FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; 820b57cec5SDimitry Andric 830b57cec5SDimitry Andric FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS 840b57cec5SDimitry Andric 850b57cec5SDimitry Andric // Disable mutually exclusive bits. 86349cc55cSDimitry Andric if (FS.contains_insensitive("+wavefrontsize")) { 87349cc55cSDimitry Andric if (!FS.contains_insensitive("wavefrontsize16")) 880b57cec5SDimitry Andric FullFS += "-wavefrontsize16,"; 89349cc55cSDimitry Andric if (!FS.contains_insensitive("wavefrontsize32")) 900b57cec5SDimitry Andric FullFS += "-wavefrontsize32,"; 91349cc55cSDimitry Andric if (!FS.contains_insensitive("wavefrontsize64")) 920b57cec5SDimitry Andric FullFS += "-wavefrontsize64,"; 930b57cec5SDimitry Andric } 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric FullFS += FS; 960b57cec5SDimitry Andric 97e8d8bef9SDimitry Andric ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); 98e8d8bef9SDimitry Andric 99e8d8bef9SDimitry Andric // Implement the "generic" processors, which acts as the default when no 100e8d8bef9SDimitry Andric // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to 101e8d8bef9SDimitry Andric // the first amdgcn target that supports flat addressing. Other OSes defaults 102e8d8bef9SDimitry Andric // to the first amdgcn target. 103e8d8bef9SDimitry Andric if (Gen == AMDGPUSubtarget::INVALID) { 104e8d8bef9SDimitry Andric Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS 105e8d8bef9SDimitry Andric : AMDGPUSubtarget::SOUTHERN_ISLANDS; 106e8d8bef9SDimitry Andric } 1070b57cec5SDimitry Andric 108*0fca6ea1SDimitry Andric if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && 109*0fca6ea1SDimitry Andric !hasFeature(AMDGPU::FeatureWavefrontSize64)) { 110*0fca6ea1SDimitry Andric // If there is no default wave size it must be a generation before gfx10, 111*0fca6ea1SDimitry Andric // these have FeatureWavefrontSize64 in their definition already. For gfx10+ 112*0fca6ea1SDimitry Andric // set wave32 as a default. 113*0fca6ea1SDimitry Andric ToggleFeature(AMDGPU::FeatureWavefrontSize32); 114*0fca6ea1SDimitry Andric } 115*0fca6ea1SDimitry Andric 1160b57cec5SDimitry Andric // We don't support FP64 for EG/NI atm. 
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
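  // (A WavefrontSizeLog2 of 5 corresponds to a 32-lane wavefront.)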
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

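  // On GFX10+ most VALU instructions may use two constant-bus (SGPR or
  // literal) operands; the 64-bit shifts below are the exception and are
  // limited to one.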
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
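    // These MAC/MAD forms therefore only zero the high bits on gfx8
    // (Volcanic Islands).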
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
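  // For example, a 256-lane workgroup on a wave64 target needs
  // ceil(256 / 64) = 4 waves per group.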
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
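  // e.g. a request of "1,2048" falls back to the default when it exceeds the
  // subtarget's maximum flat workgroup size (1024 on current GCN targets).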
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
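  // e.g. "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves per EU;
  // the maximum is optional.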
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
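  // e.g. !reqd_work_group_size !{i32 64, i32 1, i32 1} pins the X dimension
  // to exactly 64 work items.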
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
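  // ("amdgpu-no-implicitarg-ptr" is normally inferred by the AMDGPU
  // attributor pass.)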
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
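    // A request that does not even cover the reserved SGPRs is ignored.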
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1; // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
DAGInstrs->MF.getSubtarget<GCNSubtarget>();
 968753f127fSDimitry Andric     if (!ST.hasMAIInsts())
 9690b57cec5SDimitry Andric       return;
 9700b57cec5SDimitry Andric     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
 9710b57cec5SDimitry Andric     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
 9720b57cec5SDimitry Andric     if (!TSchedModel || DAG->SUnits.empty())
 9730b57cec5SDimitry Andric       return;
 9740b57cec5SDimitry Andric 
 9750b57cec5SDimitry Andric     // Scan for long-latency MFMA instructions and try to add dependencies
 9760b57cec5SDimitry Andric     // on available SALU instructions so they get a chance to fill the MFMA
 9770b57cec5SDimitry Andric     // shadow. Filling the shadow with SALU rather than VALU instructions
 9780b57cec5SDimitry Andric     // avoids power consumption bursts and the throttling they cause.
 9790b57cec5SDimitry Andric     auto LastSALU = DAG->SUnits.begin();
 9800b57cec5SDimitry Andric     auto E = DAG->SUnits.end();
 9810b57cec5SDimitry Andric     SmallPtrSet<SUnit*, 32> Visited;
 9820b57cec5SDimitry Andric     for (SUnit &SU : DAG->SUnits) {
 9830b57cec5SDimitry Andric       MachineInstr &MAI = *SU.getInstr();
 9840b57cec5SDimitry Andric       if (!TII->isMAI(MAI) ||
 985e8d8bef9SDimitry Andric           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
 986e8d8bef9SDimitry Andric           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
 9870b57cec5SDimitry Andric         continue;
 9880b57cec5SDimitry Andric 
 9890b57cec5SDimitry Andric       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
 9900b57cec5SDimitry Andric 
 9910b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
 9920b57cec5SDimitry Andric                  dbgs() << "Need " << Lat
 9930b57cec5SDimitry Andric                  << " instructions to cover latency.\n");
 9940b57cec5SDimitry Andric 
 9950b57cec5SDimitry Andric       // Find up to Lat independent scalar instructions as early as
 9960b57cec5SDimitry Andric       // possible such that they can be scheduled after this MFMA.
 9970b57cec5SDimitry Andric       for ( ; Lat && LastSALU != E; ++LastSALU) {
 9980b57cec5SDimitry Andric         if (Visited.count(&*LastSALU))
 9990b57cec5SDimitry Andric           continue;
10000b57cec5SDimitry Andric 
1001fcaf7f86SDimitry Andric         if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
1002fcaf7f86SDimitry Andric             !DAG->canAddEdge(&*LastSALU, &SU))
10030b57cec5SDimitry Andric           continue;
10040b57cec5SDimitry Andric 
10050b57cec5SDimitry Andric         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
10060b57cec5SDimitry Andric       }
10070b57cec5SDimitry Andric     }
10080b57cec5SDimitry Andric   }
10090b57cec5SDimitry Andric };
10100b57cec5SDimitry Andric } // namespace
10110b57cec5SDimitry Andric 
10120b57cec5SDimitry Andric void GCNSubtarget::getPostRAMutations(
10130b57cec5SDimitry Andric     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
10148bcb0991SDimitry Andric   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
10150b57cec5SDimitry Andric }
10160b57cec5SDimitry Andric 
1017349cc55cSDimitry Andric std::unique_ptr<ScheduleDAGMutation>
1018349cc55cSDimitry Andric GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1019753f127fSDimitry Andric   return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
1020753f127fSDimitry Andric                           : nullptr;
1021349cc55cSDimitry Andric }
1022349cc55cSDimitry Andric 
1023bdd1243dSDimitry Andric unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
10245f757f3fSDimitry Andric   if (getGeneration() >= AMDGPUSubtarget::GFX12)
10255f757f3fSDimitry Andric     return 0; // Not MIMG encoding.
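  // The remaining logic resolves the threshold in priority order: an explicit
  // -amdgpu-nsa-threshold command-line value, then the "amdgpu-nsa-threshold"
  // function attribute, then a default of 3. Explicitly requested values are
  // clamped to a minimum of 2.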
10265f757f3fSDimitry Andric 1027bdd1243dSDimitry Andric if (NSAThreshold.getNumOccurrences() > 0) 1028bdd1243dSDimitry Andric return std::max(NSAThreshold.getValue(), 2u); 1029bdd1243dSDimitry Andric 1030bdd1243dSDimitry Andric int Value = MF.getFunction().getFnAttributeAsParsedInteger( 1031bdd1243dSDimitry Andric "amdgpu-nsa-threshold", -1); 1032bdd1243dSDimitry Andric if (Value > 0) 1033bdd1243dSDimitry Andric return std::max(Value, 2); 1034bdd1243dSDimitry Andric 1035bdd1243dSDimitry Andric return 3; 1036bdd1243dSDimitry Andric } 1037bdd1243dSDimitry Andric 10380b57cec5SDimitry Andric const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { 10390b57cec5SDimitry Andric if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) 10400b57cec5SDimitry Andric return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>()); 10410b57cec5SDimitry Andric return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>()); 10420b57cec5SDimitry Andric } 10430b57cec5SDimitry Andric 10440b57cec5SDimitry Andric const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) { 10450b57cec5SDimitry Andric if (TM.getTargetTriple().getArch() == Triple::amdgcn) 10460b57cec5SDimitry Andric return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F)); 1047*0fca6ea1SDimitry Andric return static_cast<const AMDGPUSubtarget &>( 1048*0fca6ea1SDimitry Andric TM.getSubtarget<R600Subtarget>(F)); 10490b57cec5SDimitry Andric } 10505f757f3fSDimitry Andric 10515f757f3fSDimitry Andric GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, 10525f757f3fSDimitry Andric const GCNSubtarget &ST) 10535f757f3fSDimitry Andric : ST(ST) { 10545f757f3fSDimitry Andric const CallingConv::ID CC = F.getCallingConv(); 10555f757f3fSDimitry Andric const bool IsKernel = 10565f757f3fSDimitry Andric CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; 10575f757f3fSDimitry Andric // FIXME: Should have analysis or something rather than attribute to detect 10585f757f3fSDimitry Andric // calls. 10595f757f3fSDimitry Andric const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); 10605f757f3fSDimitry Andric // FIXME: This attribute is a hack, we just need an analysis on the function 10615f757f3fSDimitry Andric // to look for allocas. 10625f757f3fSDimitry Andric const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); 10635f757f3fSDimitry Andric 10645f757f3fSDimitry Andric if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) 10655f757f3fSDimitry Andric KernargSegmentPtr = true; 10665f757f3fSDimitry Andric 10675f757f3fSDimitry Andric bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); 10685f757f3fSDimitry Andric if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) 10695f757f3fSDimitry Andric PrivateSegmentBuffer = true; 10705f757f3fSDimitry Andric else if (ST.isMesaGfxShader(F)) 10715f757f3fSDimitry Andric ImplicitBufferPtr = true; 10725f757f3fSDimitry Andric 10735f757f3fSDimitry Andric if (!AMDGPU::isGraphics(CC)) { 10745f757f3fSDimitry Andric if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) 10755f757f3fSDimitry Andric DispatchPtr = true; 10765f757f3fSDimitry Andric 10775f757f3fSDimitry Andric // FIXME: Can this always be disabled with < COv5? 
10785f757f3fSDimitry Andric if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) 10795f757f3fSDimitry Andric QueuePtr = true; 10805f757f3fSDimitry Andric 10815f757f3fSDimitry Andric if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) 10825f757f3fSDimitry Andric DispatchID = true; 10835f757f3fSDimitry Andric } 10845f757f3fSDimitry Andric 10855f757f3fSDimitry Andric // TODO: This could be refined a lot. The attribute is a poor way of 10865f757f3fSDimitry Andric // detecting calls or stack objects that may require it before argument 10875f757f3fSDimitry Andric // lowering. 10885f757f3fSDimitry Andric if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && 10895f757f3fSDimitry Andric (IsAmdHsaOrMesa || ST.enableFlatScratch()) && 10905f757f3fSDimitry Andric (HasCalls || HasStackObjects || ST.enableFlatScratch()) && 10915f757f3fSDimitry Andric !ST.flatScratchIsArchitected()) { 10925f757f3fSDimitry Andric FlatScratchInit = true; 10935f757f3fSDimitry Andric } 10945f757f3fSDimitry Andric 10955f757f3fSDimitry Andric if (hasImplicitBufferPtr()) 10965f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); 10975f757f3fSDimitry Andric 10985f757f3fSDimitry Andric if (hasPrivateSegmentBuffer()) 10995f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); 11005f757f3fSDimitry Andric 11015f757f3fSDimitry Andric if (hasDispatchPtr()) 11025f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); 11035f757f3fSDimitry Andric 11045f757f3fSDimitry Andric if (hasQueuePtr()) 11055f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); 11065f757f3fSDimitry Andric 11075f757f3fSDimitry Andric if (hasKernargSegmentPtr()) 11085f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); 11095f757f3fSDimitry Andric 11105f757f3fSDimitry Andric if (hasDispatchID()) 11115f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); 11125f757f3fSDimitry Andric 11135f757f3fSDimitry Andric if (hasFlatScratchInit()) 11145f757f3fSDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); 1115*0fca6ea1SDimitry Andric 1116*0fca6ea1SDimitry Andric if (hasPrivateSegmentSize()) 1117*0fca6ea1SDimitry Andric NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); 11185f757f3fSDimitry Andric } 11195f757f3fSDimitry Andric 11205f757f3fSDimitry Andric void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { 11215f757f3fSDimitry Andric assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); 11225f757f3fSDimitry Andric NumKernargPreloadSGPRs += NumSGPRs; 11235f757f3fSDimitry Andric NumUsedUserSGPRs += NumSGPRs; 11245f757f3fSDimitry Andric } 11255f757f3fSDimitry Andric 11265f757f3fSDimitry Andric unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { 11275f757f3fSDimitry Andric return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; 11285f757f3fSDimitry Andric } 1129*0fca6ea1SDimitry Andric 1130*0fca6ea1SDimitry Andric SmallVector<unsigned> 1131*0fca6ea1SDimitry Andric AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { 1132*0fca6ea1SDimitry Andric return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3); 1133*0fca6ea1SDimitry Andric } 1134
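
// Example (illustrative sketch): how code with access to a MachineFunction
// might query the per-function limits computed by the helpers above. The
// function name and local variables are hypothetical; only the
// GCNSubtarget/AMDGPUSubtarget interfaces used are taken from this file.
#if 0
static void queryGCNSubtargetLimitsExample(const MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Occupancy-aware VGPR budget for this function (see getMaxNumVGPRs).
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);

  // Minimum number of MIMG addresses before the NSA encoding is used;
  // 0 on GFX12+, which does not use the MIMG encoding (see getNSAThreshold).
  unsigned NSAThreshold = ST.getNSAThreshold(MF);

  // Per-dimension limits from the "amdgpu-max-num-workgroups" attribute
  // (see getMaxNumWorkGroups).
  SmallVector<unsigned> MaxWorkGroups =
      ST.getMaxNumWorkGroups(MF.getFunction());

  (void)MaxVGPRs;
  (void)NSAThreshold;
  (void)MaxWorkGroups;
}
#endif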