xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// Implements the AMDGPU specific subclass of TargetSubtarget.
110b57cec5SDimitry Andric //
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
140b57cec5SDimitry Andric #include "AMDGPUSubtarget.h"
150b57cec5SDimitry Andric #include "AMDGPUCallLowering.h"
160b57cec5SDimitry Andric #include "AMDGPUInstructionSelector.h"
170b57cec5SDimitry Andric #include "AMDGPULegalizerInfo.h"
180b57cec5SDimitry Andric #include "AMDGPURegisterBankInfo.h"
19e8d8bef9SDimitry Andric #include "AMDGPUTargetMachine.h"
205f757f3fSDimitry Andric #include "GCNSubtarget.h"
21349cc55cSDimitry Andric #include "R600Subtarget.h"
220b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
23e8d8bef9SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
240b57cec5SDimitry Andric #include "llvm/ADT/SmallString.h"
25e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
260b57cec5SDimitry Andric #include "llvm/CodeGen/MachineScheduler.h"
270b57cec5SDimitry Andric #include "llvm/CodeGen/TargetFrameLowering.h"
28*0fca6ea1SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
29e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
30e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
31e8d8bef9SDimitry Andric #include "llvm/IR/MDBuilder.h"
32e8d8bef9SDimitry Andric #include "llvm/MC/MCSubtargetInfo.h"
330b57cec5SDimitry Andric #include <algorithm>
340b57cec5SDimitry Andric 
350b57cec5SDimitry Andric using namespace llvm;
360b57cec5SDimitry Andric 
370b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-subtarget"
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric #define GET_SUBTARGETINFO_TARGET_DESC
400b57cec5SDimitry Andric #define GET_SUBTARGETINFO_CTOR
410b57cec5SDimitry Andric #define AMDGPUSubtarget GCNSubtarget
420b57cec5SDimitry Andric #include "AMDGPUGenSubtargetInfo.inc"
430b57cec5SDimitry Andric #undef AMDGPUSubtarget
440b57cec5SDimitry Andric 
// Command-line switch "amdgpu-enable-power-sched"; defaults to off.
static cl::opt<bool> EnablePowerSched(
  "amdgpu-enable-power-sched",
  cl::desc("Enable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Command-line switch "amdgpu-vgpr-index-mode"; defaults to off, i.e. movrel
// is the default choice for vector indexing according to the description.
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

// Command-line switch "amdgpu-use-aa-in-codegen"; defaults to on.
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Hidden command-line option "amdgpu-nsa-threshold"; defaults to 3 addresses.
static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
                                      cl::desc("Number of addresses from which to enable MIMG NSA."),
                                      cl::init(3), cl::Hidden);
62bdd1243dSDimitry Andric 
// Defaulted destructor, defined out of line in this translation unit.
GCNSubtarget::~GCNSubtarget() = default;
640b57cec5SDimitry Andric 
/// Computes the effective feature set for this subtarget and fills in the
/// fields that are derived from it.
///
/// A default feature string is assembled first, the user string \p FS is
/// appended to it, and the result is handed to ParseSubtargetFeatures. After
/// parsing, the generation, wavefront size, flat-for-global setting and a
/// number of size/count defaults are patched up, and the target ID is set
/// from \p FS.
///
/// \param TT  Target triple; its OS and architecture are consulted.
/// \param GPU Processor name, used both as CPU and as tune CPU when parsing.
/// \param FS  User-specified feature string.
/// \returns *this, which lets the call appear in a constructor initializer.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  // Only done when FS explicitly enables some wavefrontsize feature; every
  // wavefront size that FS does not mention is then explicitly disabled.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // The user string is appended after the defaults assembled above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Captured before the doubling below, so the addressable size keeps the
  // undoubled value.
  AddressableLocalMemorySize = LocalMemorySize;

  // GFX10+ without the CU-mode feature gets twice the local memory size.
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  // Both flags are derived purely from the generation.
  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Note: the target ID is derived from the user string FS, not from FullFS.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
1760b57cec5SDimitry Andric 
177*0fca6ea1SDimitry Andric void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
178*0fca6ea1SDimitry Andric   LLVMContext &Ctx = F.getContext();
179*0fca6ea1SDimitry Andric   if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
180*0fca6ea1SDimitry Andric       hasFeature(AMDGPU::FeatureWavefrontSize64)) {
181*0fca6ea1SDimitry Andric     Ctx.diagnose(DiagnosticInfoUnsupported(
182*0fca6ea1SDimitry Andric         F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
183*0fca6ea1SDimitry Andric   }
184*0fca6ea1SDimitry Andric }
185*0fca6ea1SDimitry Andric 
// Takes the triple by value and moves it into the TargetTriple member.
AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
1870b57cec5SDimitry Andric 
1885f757f3fSDimitry Andric bool AMDGPUSubtarget::useRealTrue16Insts() const {
1895f757f3fSDimitry Andric   return hasTrue16BitInsts() && EnableRealTrue16Insts;
1905f757f3fSDimitry Andric }
1915f757f3fSDimitry Andric 
/// Constructs the subtarget for triple \p TT, processor \p GPU and feature
/// string \p FS on behalf of target machine \p TM.
///
/// The InstrInfo initializer calls initializeSubtargetDependencies(), so
/// feature parsing happens as part of member initialization. The body then
/// caches per-EU/CU limits and creates the call lowering, inline asm
/// lowering, legalizer, register bank info and instruction selector objects.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // NOTE(review): actual initialization order follows the member declaration
    // order in the class, which is not visible here - confirm before reordering.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  // RegBankInfo must exist before InstSelector, which takes it by reference.
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}
2140b57cec5SDimitry Andric 
2150b57cec5SDimitry Andric unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
2160b57cec5SDimitry Andric   if (getGeneration() < GFX10)
2170b57cec5SDimitry Andric     return 1;
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric   switch (Opcode) {
220e8d8bef9SDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64:
2210b57cec5SDimitry Andric   case AMDGPU::V_LSHLREV_B64_gfx10:
22281ad6265SDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64_gfx11:
2235f757f3fSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e32_gfx12:
2245f757f3fSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64_gfx12:
225e8d8bef9SDimitry Andric   case AMDGPU::V_LSHL_B64_e64:
226e8d8bef9SDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64:
2270b57cec5SDimitry Andric   case AMDGPU::V_LSHRREV_B64_gfx10:
22881ad6265SDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64_gfx11:
2295f757f3fSDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64_gfx12:
230e8d8bef9SDimitry Andric   case AMDGPU::V_LSHR_B64_e64:
231e8d8bef9SDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64:
2320b57cec5SDimitry Andric   case AMDGPU::V_ASHRREV_I64_gfx10:
23381ad6265SDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64_gfx11:
2345f757f3fSDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64_gfx12:
235e8d8bef9SDimitry Andric   case AMDGPU::V_ASHR_I64_e64:
2360b57cec5SDimitry Andric     return 1;
2370b57cec5SDimitry Andric   }
2380b57cec5SDimitry Andric 
2390b57cec5SDimitry Andric   return 2;
2400b57cec5SDimitry Andric }
2410b57cec5SDimitry Andric 
/// Reports whether the instruction \p Opcode zeroes the high 16 bits of its
/// destination on the current generation.
///
/// The answer depends on the generation: the first opcode group returns true
/// up to and including GFX9, the second group only on VOLCANIC_ISLANDS, and
/// every other opcode returns false.
///
/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  // The two MAD_MIX opcodes are listed explicitly but share the default
  // answer.
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
340fe6060f1SDimitry Andric 
341bdd1243dSDimitry Andric // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
342bdd1243dSDimitry Andric // allows the given function to achieve an occupancy of NWaves waves per
343bdd1243dSDimitry Andric // SIMD / EU, taking into account only the function's *maximum* workgroup size.
344bdd1243dSDimitry Andric unsigned
345bdd1243dSDimitry Andric AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
3460b57cec5SDimitry Andric                                                  const Function &F) const {
347bdd1243dSDimitry Andric   const unsigned WaveSize = getWavefrontSize();
348bdd1243dSDimitry Andric   const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
349bdd1243dSDimitry Andric   const unsigned WavesPerWorkgroup =
350bdd1243dSDimitry Andric       std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
351bdd1243dSDimitry Andric 
352bdd1243dSDimitry Andric   const unsigned WorkGroupsPerCU =
353bdd1243dSDimitry Andric       std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
354bdd1243dSDimitry Andric 
355bdd1243dSDimitry Andric   return getLocalMemorySize() / WorkGroupsPerCU;
3560b57cec5SDimitry Andric }
3570b57cec5SDimitry Andric 
3585ffd83dbSDimitry Andric // FIXME: Should return min,max range.
359bdd1243dSDimitry Andric //
360bdd1243dSDimitry Andric // Returns the maximum occupancy, in number of waves per SIMD / EU, that can
361bdd1243dSDimitry Andric // be achieved when only the given function is running on the machine; and
362bdd1243dSDimitry Andric // taking into account the overall number of wave slots, the (maximum) workgroup
363bdd1243dSDimitry Andric // size, and the per-workgroup LDS allocation size.
3640b57cec5SDimitry Andric unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
3650b57cec5SDimitry Andric   const Function &F) const {
3665ffd83dbSDimitry Andric   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
3675ffd83dbSDimitry Andric   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
3685ffd83dbSDimitry Andric   if (!MaxWorkGroupsPerCu)
3690b57cec5SDimitry Andric     return 0;
3705ffd83dbSDimitry Andric 
3715ffd83dbSDimitry Andric   const unsigned WaveSize = getWavefrontSize();
3725ffd83dbSDimitry Andric 
3735ffd83dbSDimitry Andric   // FIXME: Do we need to account for alignment requirement of LDS rounding the
3745ffd83dbSDimitry Andric   // size up?
3755ffd83dbSDimitry Andric   // Compute restriction based on LDS usage
3765ffd83dbSDimitry Andric   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
3775ffd83dbSDimitry Andric 
3785ffd83dbSDimitry Andric   // This can be queried with more LDS than is possible, so just assume the
3795ffd83dbSDimitry Andric   // worst.
3805ffd83dbSDimitry Andric   if (NumGroups == 0)
3815ffd83dbSDimitry Andric     return 1;
3825ffd83dbSDimitry Andric 
3835ffd83dbSDimitry Andric   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
3845ffd83dbSDimitry Andric 
385bdd1243dSDimitry Andric   // Round to the number of waves per CU.
386bdd1243dSDimitry Andric   const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
3875ffd83dbSDimitry Andric   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
3885ffd83dbSDimitry Andric 
389bdd1243dSDimitry Andric   // Number of waves per EU (SIMD).
390bdd1243dSDimitry Andric   MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
391bdd1243dSDimitry Andric 
3925ffd83dbSDimitry Andric   // Clamp to the maximum possible number of waves.
3935ffd83dbSDimitry Andric   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
3945ffd83dbSDimitry Andric 
3955ffd83dbSDimitry Andric   // FIXME: Needs to be a multiple of the group size?
3965ffd83dbSDimitry Andric   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
3975ffd83dbSDimitry Andric 
3985ffd83dbSDimitry Andric   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
3995ffd83dbSDimitry Andric          "computed invalid occupancy");
4005ffd83dbSDimitry Andric   return MaxWaves;
4010b57cec5SDimitry Andric }
4020b57cec5SDimitry Andric 
4030b57cec5SDimitry Andric unsigned
4040b57cec5SDimitry Andric AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
4050b57cec5SDimitry Andric   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
4060b57cec5SDimitry Andric   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
4070b57cec5SDimitry Andric }
4080b57cec5SDimitry Andric 
4090b57cec5SDimitry Andric std::pair<unsigned, unsigned>
4100b57cec5SDimitry Andric AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
4110b57cec5SDimitry Andric   switch (CC) {
4120b57cec5SDimitry Andric   case CallingConv::AMDGPU_VS:
4130b57cec5SDimitry Andric   case CallingConv::AMDGPU_LS:
4140b57cec5SDimitry Andric   case CallingConv::AMDGPU_HS:
4150b57cec5SDimitry Andric   case CallingConv::AMDGPU_ES:
4160b57cec5SDimitry Andric   case CallingConv::AMDGPU_GS:
4170b57cec5SDimitry Andric   case CallingConv::AMDGPU_PS:
418bdd1243dSDimitry Andric     return std::pair(1, getWavefrontSize());
4190b57cec5SDimitry Andric   default:
420bdd1243dSDimitry Andric     return std::pair(1u, getMaxFlatWorkGroupSize());
4210b57cec5SDimitry Andric   }
4220b57cec5SDimitry Andric }
4230b57cec5SDimitry Andric 
4240b57cec5SDimitry Andric std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
4250b57cec5SDimitry Andric   const Function &F) const {
4260b57cec5SDimitry Andric   // Default minimum/maximum flat work group sizes.
4270b57cec5SDimitry Andric   std::pair<unsigned, unsigned> Default =
4280b57cec5SDimitry Andric     getDefaultFlatWorkGroupSize(F.getCallingConv());
4290b57cec5SDimitry Andric 
4300b57cec5SDimitry Andric   // Requested minimum/maximum flat work group sizes.
4310b57cec5SDimitry Andric   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
4320b57cec5SDimitry Andric     F, "amdgpu-flat-work-group-size", Default);
4330b57cec5SDimitry Andric 
4340b57cec5SDimitry Andric   // Make sure requested minimum is less than requested maximum.
4350b57cec5SDimitry Andric   if (Requested.first > Requested.second)
4360b57cec5SDimitry Andric     return Default;
4370b57cec5SDimitry Andric 
4380b57cec5SDimitry Andric   // Make sure requested values do not violate subtarget's specifications.
4390b57cec5SDimitry Andric   if (Requested.first < getMinFlatWorkGroupSize())
4400b57cec5SDimitry Andric     return Default;
4410b57cec5SDimitry Andric   if (Requested.second > getMaxFlatWorkGroupSize())
4420b57cec5SDimitry Andric     return Default;
4430b57cec5SDimitry Andric 
4440b57cec5SDimitry Andric   return Requested;
4450b57cec5SDimitry Andric }
4460b57cec5SDimitry Andric 
44706c3fb27SDimitry Andric std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
44806c3fb27SDimitry Andric     std::pair<unsigned, unsigned> Requested,
44906c3fb27SDimitry Andric     std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
4500b57cec5SDimitry Andric   // Default minimum/maximum number of waves per execution unit.
4510b57cec5SDimitry Andric   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
4520b57cec5SDimitry Andric 
4530b57cec5SDimitry Andric   // If minimum/maximum flat work group sizes were explicitly requested using
454*0fca6ea1SDimitry Andric   // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
4550b57cec5SDimitry Andric   // number of waves per execution unit to values implied by requested
4560b57cec5SDimitry Andric   // minimum/maximum flat work group sizes.
4570b57cec5SDimitry Andric   unsigned MinImpliedByFlatWorkGroupSize =
4585ffd83dbSDimitry Andric     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
4590b57cec5SDimitry Andric   Default.first = MinImpliedByFlatWorkGroupSize;
4600b57cec5SDimitry Andric 
4610b57cec5SDimitry Andric   // Make sure requested minimum is less than requested maximum.
4620b57cec5SDimitry Andric   if (Requested.second && Requested.first > Requested.second)
4630b57cec5SDimitry Andric     return Default;
4640b57cec5SDimitry Andric 
4650b57cec5SDimitry Andric   // Make sure requested values do not violate subtarget's specifications.
4660b57cec5SDimitry Andric   if (Requested.first < getMinWavesPerEU() ||
4675ffd83dbSDimitry Andric       Requested.second > getMaxWavesPerEU())
4680b57cec5SDimitry Andric     return Default;
4690b57cec5SDimitry Andric 
4700b57cec5SDimitry Andric   // Make sure requested values are compatible with values implied by requested
4710b57cec5SDimitry Andric   // minimum/maximum flat work group sizes.
472349cc55cSDimitry Andric   if (Requested.first < MinImpliedByFlatWorkGroupSize)
4730b57cec5SDimitry Andric     return Default;
4740b57cec5SDimitry Andric 
4750b57cec5SDimitry Andric   return Requested;
4760b57cec5SDimitry Andric }
4770b57cec5SDimitry Andric 
47806c3fb27SDimitry Andric std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
47906c3fb27SDimitry Andric     const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
48006c3fb27SDimitry Andric   // Default minimum/maximum number of waves per execution unit.
48106c3fb27SDimitry Andric   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
48206c3fb27SDimitry Andric 
48306c3fb27SDimitry Andric   // Requested minimum/maximum number of waves per execution unit.
48406c3fb27SDimitry Andric   std::pair<unsigned, unsigned> Requested =
48506c3fb27SDimitry Andric       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
48606c3fb27SDimitry Andric   return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
48706c3fb27SDimitry Andric }
48806c3fb27SDimitry Andric 
489e8d8bef9SDimitry Andric static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
490e8d8bef9SDimitry Andric   auto Node = Kernel.getMetadata("reqd_work_group_size");
491e8d8bef9SDimitry Andric   if (Node && Node->getNumOperands() == 3)
492e8d8bef9SDimitry Andric     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
493e8d8bef9SDimitry Andric   return std::numeric_limits<unsigned>::max();
494e8d8bef9SDimitry Andric }
495e8d8bef9SDimitry Andric 
496e8d8bef9SDimitry Andric bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
497e8d8bef9SDimitry Andric   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
498e8d8bef9SDimitry Andric }
499e8d8bef9SDimitry Andric 
500e8d8bef9SDimitry Andric unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
501e8d8bef9SDimitry Andric                                            unsigned Dimension) const {
502e8d8bef9SDimitry Andric   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
503e8d8bef9SDimitry Andric   if (ReqdSize != std::numeric_limits<unsigned>::max())
504e8d8bef9SDimitry Andric     return ReqdSize - 1;
505e8d8bef9SDimitry Andric   return getFlatWorkGroupSizes(Kernel).second - 1;
506e8d8bef9SDimitry Andric }
507e8d8bef9SDimitry Andric 
50806c3fb27SDimitry Andric bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
50906c3fb27SDimitry Andric   for (int I = 0; I < 3; ++I) {
51006c3fb27SDimitry Andric     if (getMaxWorkitemID(Func, I) > 0)
51106c3fb27SDimitry Andric       return false;
51206c3fb27SDimitry Andric   }
51306c3fb27SDimitry Andric 
51406c3fb27SDimitry Andric   return true;
51506c3fb27SDimitry Andric }
51606c3fb27SDimitry Andric 
5170b57cec5SDimitry Andric bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
5180b57cec5SDimitry Andric   Function *Kernel = I->getParent()->getParent();
5190b57cec5SDimitry Andric   unsigned MinSize = 0;
5200b57cec5SDimitry Andric   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
5210b57cec5SDimitry Andric   bool IdQuery = false;
5220b57cec5SDimitry Andric 
5230b57cec5SDimitry Andric   // If reqd_work_group_size is present it narrows value down.
5240b57cec5SDimitry Andric   if (auto *CI = dyn_cast<CallInst>(I)) {
5250b57cec5SDimitry Andric     const Function *F = CI->getCalledFunction();
5260b57cec5SDimitry Andric     if (F) {
5270b57cec5SDimitry Andric       unsigned Dim = UINT_MAX;
5280b57cec5SDimitry Andric       switch (F->getIntrinsicID()) {
5290b57cec5SDimitry Andric       case Intrinsic::amdgcn_workitem_id_x:
5300b57cec5SDimitry Andric       case Intrinsic::r600_read_tidig_x:
5310b57cec5SDimitry Andric         IdQuery = true;
532bdd1243dSDimitry Andric         [[fallthrough]];
5330b57cec5SDimitry Andric       case Intrinsic::r600_read_local_size_x:
5340b57cec5SDimitry Andric         Dim = 0;
5350b57cec5SDimitry Andric         break;
5360b57cec5SDimitry Andric       case Intrinsic::amdgcn_workitem_id_y:
5370b57cec5SDimitry Andric       case Intrinsic::r600_read_tidig_y:
5380b57cec5SDimitry Andric         IdQuery = true;
539bdd1243dSDimitry Andric         [[fallthrough]];
5400b57cec5SDimitry Andric       case Intrinsic::r600_read_local_size_y:
5410b57cec5SDimitry Andric         Dim = 1;
5420b57cec5SDimitry Andric         break;
5430b57cec5SDimitry Andric       case Intrinsic::amdgcn_workitem_id_z:
5440b57cec5SDimitry Andric       case Intrinsic::r600_read_tidig_z:
5450b57cec5SDimitry Andric         IdQuery = true;
546bdd1243dSDimitry Andric         [[fallthrough]];
5470b57cec5SDimitry Andric       case Intrinsic::r600_read_local_size_z:
5480b57cec5SDimitry Andric         Dim = 2;
5490b57cec5SDimitry Andric         break;
5500b57cec5SDimitry Andric       default:
5510b57cec5SDimitry Andric         break;
5520b57cec5SDimitry Andric       }
553e8d8bef9SDimitry Andric 
5540b57cec5SDimitry Andric       if (Dim <= 3) {
555e8d8bef9SDimitry Andric         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
556e8d8bef9SDimitry Andric         if (ReqdSize != std::numeric_limits<unsigned>::max())
557e8d8bef9SDimitry Andric           MinSize = MaxSize = ReqdSize;
5580b57cec5SDimitry Andric       }
5590b57cec5SDimitry Andric     }
5600b57cec5SDimitry Andric   }
5610b57cec5SDimitry Andric 
5620b57cec5SDimitry Andric   if (!MaxSize)
5630b57cec5SDimitry Andric     return false;
5640b57cec5SDimitry Andric 
5650b57cec5SDimitry Andric   // Range metadata is [Lo, Hi). For ID query we need to pass max size
5660b57cec5SDimitry Andric   // as Hi. For size query we need to pass Hi + 1.
5670b57cec5SDimitry Andric   if (IdQuery)
5680b57cec5SDimitry Andric     MinSize = 0;
5690b57cec5SDimitry Andric   else
5700b57cec5SDimitry Andric     ++MaxSize;
5710b57cec5SDimitry Andric 
572*0fca6ea1SDimitry Andric   APInt Lower{32, MinSize};
573*0fca6ea1SDimitry Andric   APInt Upper{32, MaxSize};
574*0fca6ea1SDimitry Andric   if (auto *CI = dyn_cast<CallBase>(I)) {
575*0fca6ea1SDimitry Andric     ConstantRange Range(Lower, Upper);
576*0fca6ea1SDimitry Andric     CI->addRangeRetAttr(Range);
577*0fca6ea1SDimitry Andric   } else {
5780b57cec5SDimitry Andric     MDBuilder MDB(I->getContext());
579*0fca6ea1SDimitry Andric     MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
5800b57cec5SDimitry Andric     I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
581*0fca6ea1SDimitry Andric   }
5820b57cec5SDimitry Andric   return true;
5830b57cec5SDimitry Andric }
5840b57cec5SDimitry Andric 
585e8d8bef9SDimitry Andric unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
5860eae32dcSDimitry Andric   assert(AMDGPU::isKernel(F.getCallingConv()));
5870eae32dcSDimitry Andric 
5880eae32dcSDimitry Andric   // We don't allocate the segment if we know the implicit arguments weren't
5890eae32dcSDimitry Andric   // used, even if the ABI implies we need them.
5900eae32dcSDimitry Andric   if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
5910eae32dcSDimitry Andric     return 0;
5920eae32dcSDimitry Andric 
593e8d8bef9SDimitry Andric   if (isMesaKernel(F))
594e8d8bef9SDimitry Andric     return 16;
5950eae32dcSDimitry Andric 
5960eae32dcSDimitry Andric   // Assume all implicit inputs are used by default
59706c3fb27SDimitry Andric   const Module *M = F.getParent();
59806c3fb27SDimitry Andric   unsigned NBytes =
5997a6dacacSDimitry Andric       AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
600bdd1243dSDimitry Andric   return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
601bdd1243dSDimitry Andric                                          NBytes);
602e8d8bef9SDimitry Andric }
603e8d8bef9SDimitry Andric 
6040b57cec5SDimitry Andric uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
6058bcb0991SDimitry Andric                                                  Align &MaxAlign) const {
6060b57cec5SDimitry Andric   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
6070b57cec5SDimitry Andric          F.getCallingConv() == CallingConv::SPIR_KERNEL);
6080b57cec5SDimitry Andric 
609*0fca6ea1SDimitry Andric   const DataLayout &DL = F.getDataLayout();
6100b57cec5SDimitry Andric   uint64_t ExplicitArgBytes = 0;
6115ffd83dbSDimitry Andric   MaxAlign = Align(1);
6120b57cec5SDimitry Andric 
6130b57cec5SDimitry Andric   for (const Argument &Arg : F.args()) {
614e8d8bef9SDimitry Andric     const bool IsByRef = Arg.hasByRefAttr();
615e8d8bef9SDimitry Andric     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
61681ad6265SDimitry Andric     Align Alignment = DL.getValueOrABITypeAlignment(
617bdd1243dSDimitry Andric         IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
6180b57cec5SDimitry Andric     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
6198bcb0991SDimitry Andric     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
62081ad6265SDimitry Andric     MaxAlign = std::max(MaxAlign, Alignment);
6210b57cec5SDimitry Andric   }
6220b57cec5SDimitry Andric 
6230b57cec5SDimitry Andric   return ExplicitArgBytes;
6240b57cec5SDimitry Andric }
6250b57cec5SDimitry Andric 
6260b57cec5SDimitry Andric unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
6278bcb0991SDimitry Andric                                                 Align &MaxAlign) const {
62806c3fb27SDimitry Andric   if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
62906c3fb27SDimitry Andric       F.getCallingConv() != CallingConv::SPIR_KERNEL)
63006c3fb27SDimitry Andric     return 0;
63106c3fb27SDimitry Andric 
6320b57cec5SDimitry Andric   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
6330b57cec5SDimitry Andric 
63406c3fb27SDimitry Andric   unsigned ExplicitOffset = getExplicitKernelArgOffset();
6350b57cec5SDimitry Andric 
6360b57cec5SDimitry Andric   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
6370b57cec5SDimitry Andric   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
6380b57cec5SDimitry Andric   if (ImplicitBytes != 0) {
6398bcb0991SDimitry Andric     const Align Alignment = getAlignmentForImplicitArgPtr();
6400b57cec5SDimitry Andric     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
641349cc55cSDimitry Andric     MaxAlign = std::max(MaxAlign, Alignment);
6420b57cec5SDimitry Andric   }
6430b57cec5SDimitry Andric 
6440b57cec5SDimitry Andric   // Being able to dereference past the end is useful for emitting scalar loads.
6450b57cec5SDimitry Andric   return alignTo(TotalSize, 4);
6460b57cec5SDimitry Andric }
6470b57cec5SDimitry Andric 
648e8d8bef9SDimitry Andric AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
649e8d8bef9SDimitry Andric   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
650e8d8bef9SDimitry Andric                                   : AMDGPUDwarfFlavour::Wave64;
651e8d8bef9SDimitry Andric }
652e8d8bef9SDimitry Andric 
6530b57cec5SDimitry Andric void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
6540b57cec5SDimitry Andric                                       unsigned NumRegionInstrs) const {
6550b57cec5SDimitry Andric   // Track register pressure so the scheduler can try to decrease
6560b57cec5SDimitry Andric   // pressure once register usage is above the threshold defined by
6570b57cec5SDimitry Andric   // SIRegisterInfo::getRegPressureSetLimit()
6580b57cec5SDimitry Andric   Policy.ShouldTrackPressure = true;
6590b57cec5SDimitry Andric 
6600b57cec5SDimitry Andric   // Enabling both top down and bottom up scheduling seems to give us less
6610b57cec5SDimitry Andric   // register spills than just using one of these approaches on its own.
6620b57cec5SDimitry Andric   Policy.OnlyTopDown = false;
6630b57cec5SDimitry Andric   Policy.OnlyBottomUp = false;
6640b57cec5SDimitry Andric 
6650b57cec5SDimitry Andric   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
6660b57cec5SDimitry Andric   if (!enableSIScheduler())
6670b57cec5SDimitry Andric     Policy.ShouldTrackLaneMasks = true;
6680b57cec5SDimitry Andric }
6690b57cec5SDimitry Andric 
670*0fca6ea1SDimitry Andric void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
671*0fca6ea1SDimitry Andric   if (isWave32()) {
672*0fca6ea1SDimitry Andric     // Fix implicit $vcc operands after MIParser has verified that they match
673*0fca6ea1SDimitry Andric     // the instruction definitions.
674*0fca6ea1SDimitry Andric     for (auto &MBB : MF) {
675*0fca6ea1SDimitry Andric       for (auto &MI : MBB)
676*0fca6ea1SDimitry Andric         InstrInfo.fixImplicitOperands(MI);
677*0fca6ea1SDimitry Andric     }
678*0fca6ea1SDimitry Andric   }
679*0fca6ea1SDimitry Andric }
680*0fca6ea1SDimitry Andric 
6810b57cec5SDimitry Andric bool GCNSubtarget::hasMadF16() const {
682e8d8bef9SDimitry Andric   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
6830b57cec5SDimitry Andric }
6840b57cec5SDimitry Andric 
685480093f4SDimitry Andric bool GCNSubtarget::useVGPRIndexMode() const {
686480093f4SDimitry Andric   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
687480093f4SDimitry Andric }
688480093f4SDimitry Andric 
689e8d8bef9SDimitry Andric bool GCNSubtarget::useAA() const { return UseAA; }
690e8d8bef9SDimitry Andric 
6910b57cec5SDimitry Andric unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
692*0fca6ea1SDimitry Andric   return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
693*0fca6ea1SDimitry Andric                                                    getGeneration());
6940b57cec5SDimitry Andric }
6950b57cec5SDimitry Andric 
696bdd1243dSDimitry Andric unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
697bdd1243dSDimitry Andric   return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
6980b57cec5SDimitry Andric }
6990b57cec5SDimitry Andric 
700fe6060f1SDimitry Andric unsigned
70104eeddc0SDimitry Andric GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
7020b57cec5SDimitry Andric   if (getGeneration() >= AMDGPUSubtarget::GFX10)
7030b57cec5SDimitry Andric     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
7040b57cec5SDimitry Andric 
70504eeddc0SDimitry Andric   if (HasFlatScratch || HasArchitectedFlatScratch) {
7060b57cec5SDimitry Andric     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7070b57cec5SDimitry Andric       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
7080b57cec5SDimitry Andric     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
7090b57cec5SDimitry Andric       return 4; // FLAT_SCRATCH, VCC (in that order).
7100b57cec5SDimitry Andric   }
7110b57cec5SDimitry Andric 
7120b57cec5SDimitry Andric   if (isXNACKEnabled())
7130b57cec5SDimitry Andric     return 4; // XNACK, VCC (in that order).
7140b57cec5SDimitry Andric   return 2; // VCC.
7150b57cec5SDimitry Andric }
7160b57cec5SDimitry Andric 
717fe6060f1SDimitry Andric unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
718fe6060f1SDimitry Andric   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
7195f757f3fSDimitry Andric   return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
720fe6060f1SDimitry Andric }
721fe6060f1SDimitry Andric 
722fe6060f1SDimitry Andric unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
72304eeddc0SDimitry Andric   // In principle we do not need to reserve SGPR pair used for flat_scratch if
72404eeddc0SDimitry Andric   // we know flat instructions do not access the stack anywhere in the
72504eeddc0SDimitry Andric   // program. For now assume it's needed if we have flat instructions.
72604eeddc0SDimitry Andric   const bool KernelUsesFlatScratch = hasFlatAddressSpace();
72704eeddc0SDimitry Andric   return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
728fe6060f1SDimitry Andric }
729fe6060f1SDimitry Andric 
7305ffd83dbSDimitry Andric unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
7318bcb0991SDimitry Andric                                         unsigned NumSGPRs,
7328bcb0991SDimitry Andric                                         unsigned NumVGPRs) const {
7338bcb0991SDimitry Andric   unsigned Occupancy =
7348bcb0991SDimitry Andric     std::min(getMaxWavesPerEU(),
7355ffd83dbSDimitry Andric              getOccupancyWithLocalMemSize(LDSSize, F));
7368bcb0991SDimitry Andric   if (NumSGPRs)
7378bcb0991SDimitry Andric     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
7388bcb0991SDimitry Andric   if (NumVGPRs)
7398bcb0991SDimitry Andric     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
7408bcb0991SDimitry Andric   return Occupancy;
7418bcb0991SDimitry Andric }
7428bcb0991SDimitry Andric 
743fe6060f1SDimitry Andric unsigned GCNSubtarget::getBaseMaxNumSGPRs(
744fe6060f1SDimitry Andric     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
745fe6060f1SDimitry Andric     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
7460b57cec5SDimitry Andric   // Compute maximum number of SGPRs function can use using default/requested
7470b57cec5SDimitry Andric   // minimum number of waves per execution unit.
7480b57cec5SDimitry Andric   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
7490b57cec5SDimitry Andric   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric   // Check if maximum number of SGPRs was explicitly requested using
7520b57cec5SDimitry Andric   // "amdgpu-num-sgpr" attribute.
7530b57cec5SDimitry Andric   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
754bdd1243dSDimitry Andric     unsigned Requested =
755bdd1243dSDimitry Andric         F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
7560b57cec5SDimitry Andric 
7570b57cec5SDimitry Andric     // Make sure requested value does not violate subtarget's specifications.
758fe6060f1SDimitry Andric     if (Requested && (Requested <= ReservedNumSGPRs))
7590b57cec5SDimitry Andric       Requested = 0;
7600b57cec5SDimitry Andric 
7610b57cec5SDimitry Andric     // If more SGPRs are required to support the input user/system SGPRs,
7620b57cec5SDimitry Andric     // increase to accommodate them.
7630b57cec5SDimitry Andric     //
7640b57cec5SDimitry Andric     // FIXME: This really ends up using the requested number of SGPRs + number
7650b57cec5SDimitry Andric     // of reserved special registers in total. Theoretically you could re-use
7660b57cec5SDimitry Andric     // the last input registers for these special registers, but this would
7670b57cec5SDimitry Andric     // require a lot of complexity to deal with the weird aliasing.
768fe6060f1SDimitry Andric     unsigned InputNumSGPRs = PreloadedSGPRs;
7690b57cec5SDimitry Andric     if (Requested && Requested < InputNumSGPRs)
7700b57cec5SDimitry Andric       Requested = InputNumSGPRs;
7710b57cec5SDimitry Andric 
7720b57cec5SDimitry Andric     // Make sure requested value is compatible with values implied by
7730b57cec5SDimitry Andric     // default/requested minimum/maximum number of waves per execution unit.
7740b57cec5SDimitry Andric     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
7750b57cec5SDimitry Andric       Requested = 0;
7760b57cec5SDimitry Andric     if (WavesPerEU.second &&
7770b57cec5SDimitry Andric         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
7780b57cec5SDimitry Andric       Requested = 0;
7790b57cec5SDimitry Andric 
7800b57cec5SDimitry Andric     if (Requested)
7810b57cec5SDimitry Andric       MaxNumSGPRs = Requested;
7820b57cec5SDimitry Andric   }
7830b57cec5SDimitry Andric 
7840b57cec5SDimitry Andric   if (hasSGPRInitBug())
7850b57cec5SDimitry Andric     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
7860b57cec5SDimitry Andric 
787fe6060f1SDimitry Andric   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
7880b57cec5SDimitry Andric }
7890b57cec5SDimitry Andric 
790fe6060f1SDimitry Andric unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
7910b57cec5SDimitry Andric   const Function &F = MF.getFunction();
7920b57cec5SDimitry Andric   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
793fe6060f1SDimitry Andric   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
794fe6060f1SDimitry Andric                             getReservedNumSGPRs(MF));
795fe6060f1SDimitry Andric }
7960b57cec5SDimitry Andric 
797fe6060f1SDimitry Andric static unsigned getMaxNumPreloadedSGPRs() {
7985f757f3fSDimitry Andric   using USI = GCNUserSGPRUsageInfo;
799fe6060f1SDimitry Andric   // Max number of user SGPRs
8005f757f3fSDimitry Andric   const unsigned MaxUserSGPRs =
8015f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
8025f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::DispatchPtrID) +
8035f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::QueuePtrID) +
8045f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
8055f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::DispatchIdID) +
8065f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
8075f757f3fSDimitry Andric       USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
808fcaf7f86SDimitry Andric 
809fe6060f1SDimitry Andric   // Max number of system SGPRs
8105f757f3fSDimitry Andric   const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
811fe6060f1SDimitry Andric                                   1 + // WorkGroupIDY
812fe6060f1SDimitry Andric                                   1 + // WorkGroupIDZ
813fe6060f1SDimitry Andric                                   1 + // WorkGroupInfo
814fe6060f1SDimitry Andric                                   1;  // private segment wave byte offset
815fcaf7f86SDimitry Andric 
816fcaf7f86SDimitry Andric   // Max number of synthetic SGPRs
8175f757f3fSDimitry Andric   const unsigned SyntheticSGPRs = 1; // LDSKernelId
818fcaf7f86SDimitry Andric 
819fcaf7f86SDimitry Andric   return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
820fe6060f1SDimitry Andric }
821fe6060f1SDimitry Andric 
822fe6060f1SDimitry Andric unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
823fe6060f1SDimitry Andric   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
824fe6060f1SDimitry Andric                             getReservedNumSGPRs(F));
825fe6060f1SDimitry Andric }
826fe6060f1SDimitry Andric 
827fe6060f1SDimitry Andric unsigned GCNSubtarget::getBaseMaxNumVGPRs(
828fe6060f1SDimitry Andric     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
8290b57cec5SDimitry Andric   // Compute maximum number of VGPRs function can use using default/requested
8300b57cec5SDimitry Andric   // minimum number of waves per execution unit.
8310b57cec5SDimitry Andric   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
8320b57cec5SDimitry Andric 
8330b57cec5SDimitry Andric   // Check if maximum number of VGPRs was explicitly requested using
8340b57cec5SDimitry Andric   // "amdgpu-num-vgpr" attribute.
8350b57cec5SDimitry Andric   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
836bdd1243dSDimitry Andric     unsigned Requested =
837bdd1243dSDimitry Andric         F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
8380b57cec5SDimitry Andric 
839fe6060f1SDimitry Andric     if (hasGFX90AInsts())
840fe6060f1SDimitry Andric       Requested *= 2;
841fe6060f1SDimitry Andric 
8420b57cec5SDimitry Andric     // Make sure requested value is compatible with values implied by
8430b57cec5SDimitry Andric     // default/requested minimum/maximum number of waves per execution unit.
8440b57cec5SDimitry Andric     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
8450b57cec5SDimitry Andric       Requested = 0;
8460b57cec5SDimitry Andric     if (WavesPerEU.second &&
8470b57cec5SDimitry Andric         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
8480b57cec5SDimitry Andric       Requested = 0;
8490b57cec5SDimitry Andric 
8500b57cec5SDimitry Andric     if (Requested)
8510b57cec5SDimitry Andric       MaxNumVGPRs = Requested;
8520b57cec5SDimitry Andric   }
8530b57cec5SDimitry Andric 
8540b57cec5SDimitry Andric   return MaxNumVGPRs;
8550b57cec5SDimitry Andric }
8560b57cec5SDimitry Andric 
857fe6060f1SDimitry Andric unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
858fe6060f1SDimitry Andric   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
859fe6060f1SDimitry Andric }
860fe6060f1SDimitry Andric 
861fe6060f1SDimitry Andric unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
862fe6060f1SDimitry Andric   const Function &F = MF.getFunction();
863fe6060f1SDimitry Andric   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
864fe6060f1SDimitry Andric   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
865fe6060f1SDimitry Andric }
866fe6060f1SDimitry Andric 
867*0fca6ea1SDimitry Andric void GCNSubtarget::adjustSchedDependency(
868*0fca6ea1SDimitry Andric     SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
869*0fca6ea1SDimitry Andric     const TargetSchedModel *SchedModel) const {
870480093f4SDimitry Andric   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
8715ffd83dbSDimitry Andric       !Def->isInstr() || !Use->isInstr())
872480093f4SDimitry Andric     return;
873480093f4SDimitry Andric 
8745ffd83dbSDimitry Andric   MachineInstr *DefI = Def->getInstr();
8755ffd83dbSDimitry Andric   MachineInstr *UseI = Use->getInstr();
876480093f4SDimitry Andric 
8775ffd83dbSDimitry Andric   if (DefI->isBundle()) {
878480093f4SDimitry Andric     const SIRegisterInfo *TRI = getRegisterInfo();
879480093f4SDimitry Andric     auto Reg = Dep.getReg();
8805ffd83dbSDimitry Andric     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
8815ffd83dbSDimitry Andric     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
882480093f4SDimitry Andric     unsigned Lat = 0;
883480093f4SDimitry Andric     for (++I; I != E && I->isBundledWithPred(); ++I) {
884480093f4SDimitry Andric       if (I->modifiesRegister(Reg, TRI))
885480093f4SDimitry Andric         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
886480093f4SDimitry Andric       else if (Lat)
887480093f4SDimitry Andric         --Lat;
888480093f4SDimitry Andric     }
889480093f4SDimitry Andric     Dep.setLatency(Lat);
8905ffd83dbSDimitry Andric   } else if (UseI->isBundle()) {
891480093f4SDimitry Andric     const SIRegisterInfo *TRI = getRegisterInfo();
892480093f4SDimitry Andric     auto Reg = Dep.getReg();
8935ffd83dbSDimitry Andric     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
8945ffd83dbSDimitry Andric     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
8955ffd83dbSDimitry Andric     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
896480093f4SDimitry Andric     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
897480093f4SDimitry Andric       if (I->readsRegister(Reg, TRI))
898480093f4SDimitry Andric         break;
899480093f4SDimitry Andric       --Lat;
900480093f4SDimitry Andric     }
901480093f4SDimitry Andric     Dep.setLatency(Lat);
902349cc55cSDimitry Andric   } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
903349cc55cSDimitry Andric     // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
904349cc55cSDimitry Andric     // implicit operands which come from the MCInstrDesc, which can fool
905349cc55cSDimitry Andric     // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
906349cc55cSDimitry Andric     // pseudo operands.
907349cc55cSDimitry Andric     Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
908349cc55cSDimitry Andric         DefI, DefOpIdx, UseI, UseOpIdx));
909480093f4SDimitry Andric   }
910480093f4SDimitry Andric }
911480093f4SDimitry Andric 
9120b57cec5SDimitry Andric namespace {
9130b57cec5SDimitry Andric struct FillMFMAShadowMutation : ScheduleDAGMutation {
9140b57cec5SDimitry Andric   const SIInstrInfo *TII;
9150b57cec5SDimitry Andric 
9160b57cec5SDimitry Andric   ScheduleDAGMI *DAG;
9170b57cec5SDimitry Andric 
9180b57cec5SDimitry Andric   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
9190b57cec5SDimitry Andric 
9200b57cec5SDimitry Andric   bool isSALU(const SUnit *SU) const {
9210b57cec5SDimitry Andric     const MachineInstr *MI = SU->getInstr();
9220b57cec5SDimitry Andric     return MI && TII->isSALU(*MI) && !MI->isTerminator();
9230b57cec5SDimitry Andric   }
9240b57cec5SDimitry Andric 
925480093f4SDimitry Andric   bool isVALU(const SUnit *SU) const {
926480093f4SDimitry Andric     const MachineInstr *MI = SU->getInstr();
927480093f4SDimitry Andric     return MI && TII->isVALU(*MI);
928480093f4SDimitry Andric   }
929480093f4SDimitry Andric 
930349cc55cSDimitry Andric   // Link as many SALU instructions in chain as possible. Return the size
9310b57cec5SDimitry Andric   // of the chain. Links up to MaxChain instructions.
9320b57cec5SDimitry Andric   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
9330b57cec5SDimitry Andric                          SmallPtrSetImpl<SUnit *> &Visited) const {
9340b57cec5SDimitry Andric     SmallVector<SUnit *, 8> Worklist({To});
9350b57cec5SDimitry Andric     unsigned Linked = 0;
9360b57cec5SDimitry Andric 
9370b57cec5SDimitry Andric     while (!Worklist.empty() && MaxChain-- > 0) {
9380b57cec5SDimitry Andric       SUnit *SU = Worklist.pop_back_val();
9390b57cec5SDimitry Andric       if (!Visited.insert(SU).second)
9400b57cec5SDimitry Andric         continue;
9410b57cec5SDimitry Andric 
9420b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
9430b57cec5SDimitry Andric                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
9440b57cec5SDimitry Andric 
945fcaf7f86SDimitry Andric       if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
946fcaf7f86SDimitry Andric         if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
9470b57cec5SDimitry Andric           ++Linked;
9480b57cec5SDimitry Andric 
9490b57cec5SDimitry Andric       for (SDep &SI : From->Succs) {
9500b57cec5SDimitry Andric         SUnit *SUv = SI.getSUnit();
951fcaf7f86SDimitry Andric         if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
952fcaf7f86SDimitry Andric             DAG->canAddEdge(SUv, SU))
953fcaf7f86SDimitry Andric           DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
9540b57cec5SDimitry Andric       }
9550b57cec5SDimitry Andric 
9560b57cec5SDimitry Andric       for (SDep &SI : SU->Succs) {
9570b57cec5SDimitry Andric         SUnit *Succ = SI.getSUnit();
958fcaf7f86SDimitry Andric         if (Succ != SU && isSALU(Succ))
9590b57cec5SDimitry Andric           Worklist.push_back(Succ);
9600b57cec5SDimitry Andric       }
9610b57cec5SDimitry Andric     }
9620b57cec5SDimitry Andric 
9630b57cec5SDimitry Andric     return Linked;
9640b57cec5SDimitry Andric   }
9650b57cec5SDimitry Andric 
9660b57cec5SDimitry Andric   void apply(ScheduleDAGInstrs *DAGInstrs) override {
9670b57cec5SDimitry Andric     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
968753f127fSDimitry Andric     if (!ST.hasMAIInsts())
9690b57cec5SDimitry Andric       return;
9700b57cec5SDimitry Andric     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
9710b57cec5SDimitry Andric     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
9720b57cec5SDimitry Andric     if (!TSchedModel || DAG->SUnits.empty())
9730b57cec5SDimitry Andric       return;
9740b57cec5SDimitry Andric 
9750b57cec5SDimitry Andric     // Scan for MFMA long latency instructions and try to add a dependency
9760b57cec5SDimitry Andric     // of available SALU instructions to give them a chance to fill MFMA
9770b57cec5SDimitry Andric     // shadow. That is desirable to fill MFMA shadow with SALU instructions
9780b57cec5SDimitry Andric     // rather than VALU to prevent power consumption bursts and throttle.
9790b57cec5SDimitry Andric     auto LastSALU = DAG->SUnits.begin();
9800b57cec5SDimitry Andric     auto E = DAG->SUnits.end();
9810b57cec5SDimitry Andric     SmallPtrSet<SUnit*, 32> Visited;
9820b57cec5SDimitry Andric     for (SUnit &SU : DAG->SUnits) {
9830b57cec5SDimitry Andric       MachineInstr &MAI = *SU.getInstr();
9840b57cec5SDimitry Andric       if (!TII->isMAI(MAI) ||
985e8d8bef9SDimitry Andric            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
986e8d8bef9SDimitry Andric            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
9870b57cec5SDimitry Andric         continue;
9880b57cec5SDimitry Andric 
9890b57cec5SDimitry Andric       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
9900b57cec5SDimitry Andric 
9910b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
9920b57cec5SDimitry Andric                  dbgs() << "Need " << Lat
9930b57cec5SDimitry Andric                         << " instructions to cover latency.\n");
9940b57cec5SDimitry Andric 
9950b57cec5SDimitry Andric       // Find up to Lat independent scalar instructions as early as
9960b57cec5SDimitry Andric       // possible such that they can be scheduled after this MFMA.
9970b57cec5SDimitry Andric       for ( ; Lat && LastSALU != E; ++LastSALU) {
9980b57cec5SDimitry Andric         if (Visited.count(&*LastSALU))
9990b57cec5SDimitry Andric           continue;
10000b57cec5SDimitry Andric 
1001fcaf7f86SDimitry Andric         if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
1002fcaf7f86SDimitry Andric             !DAG->canAddEdge(&*LastSALU, &SU))
10030b57cec5SDimitry Andric           continue;
10040b57cec5SDimitry Andric 
10050b57cec5SDimitry Andric         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
10060b57cec5SDimitry Andric       }
10070b57cec5SDimitry Andric     }
10080b57cec5SDimitry Andric   }
10090b57cec5SDimitry Andric };
10100b57cec5SDimitry Andric } // namespace
10110b57cec5SDimitry Andric 
10120b57cec5SDimitry Andric void GCNSubtarget::getPostRAMutations(
10130b57cec5SDimitry Andric     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
10148bcb0991SDimitry Andric   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
10150b57cec5SDimitry Andric }
10160b57cec5SDimitry Andric 
1017349cc55cSDimitry Andric std::unique_ptr<ScheduleDAGMutation>
1018349cc55cSDimitry Andric GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1019753f127fSDimitry Andric   return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
1020753f127fSDimitry Andric                           : nullptr;
1021349cc55cSDimitry Andric }
1022349cc55cSDimitry Andric 
1023bdd1243dSDimitry Andric unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
10245f757f3fSDimitry Andric   if (getGeneration() >= AMDGPUSubtarget::GFX12)
10255f757f3fSDimitry Andric     return 0; // Not MIMG encoding.
10265f757f3fSDimitry Andric 
1027bdd1243dSDimitry Andric   if (NSAThreshold.getNumOccurrences() > 0)
1028bdd1243dSDimitry Andric     return std::max(NSAThreshold.getValue(), 2u);
1029bdd1243dSDimitry Andric 
1030bdd1243dSDimitry Andric   int Value = MF.getFunction().getFnAttributeAsParsedInteger(
1031bdd1243dSDimitry Andric       "amdgpu-nsa-threshold", -1);
1032bdd1243dSDimitry Andric   if (Value > 0)
1033bdd1243dSDimitry Andric     return std::max(Value, 2);
1034bdd1243dSDimitry Andric 
1035bdd1243dSDimitry Andric   return 3;
1036bdd1243dSDimitry Andric }
1037bdd1243dSDimitry Andric 
10380b57cec5SDimitry Andric const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
10390b57cec5SDimitry Andric   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
10400b57cec5SDimitry Andric     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
10410b57cec5SDimitry Andric   return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
10420b57cec5SDimitry Andric }
10430b57cec5SDimitry Andric 
10440b57cec5SDimitry Andric const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
10450b57cec5SDimitry Andric   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
10460b57cec5SDimitry Andric     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1047*0fca6ea1SDimitry Andric   return static_cast<const AMDGPUSubtarget &>(
1048*0fca6ea1SDimitry Andric       TM.getSubtarget<R600Subtarget>(F));
10490b57cec5SDimitry Andric }
10505f757f3fSDimitry Andric 
10515f757f3fSDimitry Andric GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
10525f757f3fSDimitry Andric                                            const GCNSubtarget &ST)
10535f757f3fSDimitry Andric     : ST(ST) {
10545f757f3fSDimitry Andric   const CallingConv::ID CC = F.getCallingConv();
10555f757f3fSDimitry Andric   const bool IsKernel =
10565f757f3fSDimitry Andric       CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
10575f757f3fSDimitry Andric   // FIXME: Should have analysis or something rather than attribute to detect
10585f757f3fSDimitry Andric   // calls.
10595f757f3fSDimitry Andric   const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
10605f757f3fSDimitry Andric   // FIXME: This attribute is a hack, we just need an analysis on the function
10615f757f3fSDimitry Andric   // to look for allocas.
10625f757f3fSDimitry Andric   const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
10635f757f3fSDimitry Andric 
10645f757f3fSDimitry Andric   if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
10655f757f3fSDimitry Andric     KernargSegmentPtr = true;
10665f757f3fSDimitry Andric 
10675f757f3fSDimitry Andric   bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
10685f757f3fSDimitry Andric   if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
10695f757f3fSDimitry Andric     PrivateSegmentBuffer = true;
10705f757f3fSDimitry Andric   else if (ST.isMesaGfxShader(F))
10715f757f3fSDimitry Andric     ImplicitBufferPtr = true;
10725f757f3fSDimitry Andric 
10735f757f3fSDimitry Andric   if (!AMDGPU::isGraphics(CC)) {
10745f757f3fSDimitry Andric     if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
10755f757f3fSDimitry Andric       DispatchPtr = true;
10765f757f3fSDimitry Andric 
10775f757f3fSDimitry Andric     // FIXME: Can this always be disabled with < COv5?
10785f757f3fSDimitry Andric     if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
10795f757f3fSDimitry Andric       QueuePtr = true;
10805f757f3fSDimitry Andric 
10815f757f3fSDimitry Andric     if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
10825f757f3fSDimitry Andric       DispatchID = true;
10835f757f3fSDimitry Andric   }
10845f757f3fSDimitry Andric 
10855f757f3fSDimitry Andric   // TODO: This could be refined a lot. The attribute is a poor way of
10865f757f3fSDimitry Andric   // detecting calls or stack objects that may require it before argument
10875f757f3fSDimitry Andric   // lowering.
10885f757f3fSDimitry Andric   if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
10895f757f3fSDimitry Andric       (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
10905f757f3fSDimitry Andric       (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
10915f757f3fSDimitry Andric       !ST.flatScratchIsArchitected()) {
10925f757f3fSDimitry Andric     FlatScratchInit = true;
10935f757f3fSDimitry Andric   }
10945f757f3fSDimitry Andric 
10955f757f3fSDimitry Andric   if (hasImplicitBufferPtr())
10965f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
10975f757f3fSDimitry Andric 
10985f757f3fSDimitry Andric   if (hasPrivateSegmentBuffer())
10995f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
11005f757f3fSDimitry Andric 
11015f757f3fSDimitry Andric   if (hasDispatchPtr())
11025f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
11035f757f3fSDimitry Andric 
11045f757f3fSDimitry Andric   if (hasQueuePtr())
11055f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
11065f757f3fSDimitry Andric 
11075f757f3fSDimitry Andric   if (hasKernargSegmentPtr())
11085f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
11095f757f3fSDimitry Andric 
11105f757f3fSDimitry Andric   if (hasDispatchID())
11115f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
11125f757f3fSDimitry Andric 
11135f757f3fSDimitry Andric   if (hasFlatScratchInit())
11145f757f3fSDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1115*0fca6ea1SDimitry Andric 
1116*0fca6ea1SDimitry Andric   if (hasPrivateSegmentSize())
1117*0fca6ea1SDimitry Andric     NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
11185f757f3fSDimitry Andric }
11195f757f3fSDimitry Andric 
11205f757f3fSDimitry Andric void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
11215f757f3fSDimitry Andric   assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
11225f757f3fSDimitry Andric   NumKernargPreloadSGPRs += NumSGPRs;
11235f757f3fSDimitry Andric   NumUsedUserSGPRs += NumSGPRs;
11245f757f3fSDimitry Andric }
11255f757f3fSDimitry Andric 
11265f757f3fSDimitry Andric unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
11275f757f3fSDimitry Andric   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
11285f757f3fSDimitry Andric }
1129*0fca6ea1SDimitry Andric 
1130*0fca6ea1SDimitry Andric SmallVector<unsigned>
1131*0fca6ea1SDimitry Andric AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
1132*0fca6ea1SDimitry Andric   return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
1133*0fca6ea1SDimitry Andric }
1134