xref: /llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1a6bae5cbSJay Foad //===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2a6bae5cbSJay Foad //
3a6bae5cbSJay Foad // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4a6bae5cbSJay Foad // See https://llvm.org/LICENSE.txt for license information.
5a6bae5cbSJay Foad // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a6bae5cbSJay Foad //
7a6bae5cbSJay Foad //===----------------------------------------------------------------------===//
8a6bae5cbSJay Foad //
9a6bae5cbSJay Foad /// \file
10a6bae5cbSJay Foad /// Implements the GCN specific subclass of TargetSubtarget.
11a6bae5cbSJay Foad //
12a6bae5cbSJay Foad //===----------------------------------------------------------------------===//
13a6bae5cbSJay Foad 
14a6bae5cbSJay Foad #include "GCNSubtarget.h"
15a6bae5cbSJay Foad #include "AMDGPUCallLowering.h"
16a6bae5cbSJay Foad #include "AMDGPUInstructionSelector.h"
17a6bae5cbSJay Foad #include "AMDGPULegalizerInfo.h"
18a6bae5cbSJay Foad #include "AMDGPURegisterBankInfo.h"
1903847f19SSergei Barannikov #include "AMDGPUSelectionDAGInfo.h"
20a6bae5cbSJay Foad #include "AMDGPUTargetMachine.h"
21a6bae5cbSJay Foad #include "SIMachineFunctionInfo.h"
22a6bae5cbSJay Foad #include "Utils/AMDGPUBaseInfo.h"
23a6bae5cbSJay Foad #include "llvm/ADT/SmallString.h"
24a6bae5cbSJay Foad #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25a6bae5cbSJay Foad #include "llvm/CodeGen/MachineScheduler.h"
26a6bae5cbSJay Foad #include "llvm/CodeGen/TargetFrameLowering.h"
27a6bae5cbSJay Foad #include "llvm/IR/DiagnosticInfo.h"
28a6bae5cbSJay Foad #include "llvm/IR/MDBuilder.h"
29a6bae5cbSJay Foad #include <algorithm>
30a6bae5cbSJay Foad 
31a6bae5cbSJay Foad using namespace llvm;
32a6bae5cbSJay Foad 
33a6bae5cbSJay Foad #define DEBUG_TYPE "gcn-subtarget"
34a6bae5cbSJay Foad 
35a6bae5cbSJay Foad #define GET_SUBTARGETINFO_TARGET_DESC
36a6bae5cbSJay Foad #define GET_SUBTARGETINFO_CTOR
37a6bae5cbSJay Foad #define AMDGPUSubtarget GCNSubtarget
38a6bae5cbSJay Foad #include "AMDGPUGenSubtargetInfo.inc"
39a6bae5cbSJay Foad #undef AMDGPUSubtarget
40a6bae5cbSJay Foad 
// Command-line override for useVGPRIndexMode(): prefer GPR-indexing mode over
// movrel even when movrel is available.
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

// Command-line switch backing useAA(): whether codegen consults alias
// analysis. On by default.
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Minimum number of image addresses at which the MIMG NSA (non-sequential
// address) encoding is used.
static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);
// Defined out-of-line so the std::unique_ptr members (TSInfo, Legalizer,
// RegBankInfo, ...) are destroyed here, where their pointee types are
// complete.
GCNSubtarget::~GCNSubtarget() = default;
/// Finish initializing this subtarget from triple \p TT, processor \p GPU and
/// feature string \p FS: parse features, resolve the "generic" processor and
/// wavefront-size defaults, and derive dependent fields (LDS sizes, legacy
/// fmin/fmax, target ID). Returns *this so it can be chained inside the
/// constructor's member-initializer list.
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits. If the user requested any wavefront size
  // explicitly, turn off the sizes that were not mentioned so a CPU default
  // cannot combine with the request.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // User-specified features come last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  // On gfx10+ without CU mode the physical local memory size is twice the
  // per-workgroup addressable size.
  LocalMemorySize = AddressableLocalMemorySize;
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Derive the xnack/sramecc target-ID settings from the raw feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
160a6bae5cbSJay Foad 
161a6bae5cbSJay Foad void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
162a6bae5cbSJay Foad   LLVMContext &Ctx = F.getContext();
163cd20fc07SMatt Arsenault   if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
164a6bae5cbSJay Foad       hasFeature(AMDGPU::FeatureWavefrontSize64)) {
165a6bae5cbSJay Foad     Ctx.diagnose(DiagnosticInfoUnsupported(
166a6bae5cbSJay Foad         F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
167a6bae5cbSJay Foad   }
168a6bae5cbSJay Foad }
169a6bae5cbSJay Foad 
/// Construct a GCN subtarget for triple \p TT, processor \p GPU and feature
/// string \p FS, owned by target machine \p TM.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // Feature parsing must complete before InstrInfo/TLInfo are built, so it
    // is chained into InstrInfo's initializer (it returns *this).
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  // GlobalISel support objects; InstSelector depends on RegBankInfo, so the
  // construction order below matters.
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}
195a6bae5cbSJay Foad 
19603847f19SSergei Barannikov const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
19703847f19SSergei Barannikov   return TSInfo.get();
19803847f19SSergei Barannikov }
19903847f19SSergei Barannikov 
/// Return how many operands of \p Opcode may be read from the scalar constant
/// bus in a single instruction on this subtarget.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  // Pre-gfx10 generations allow only one constant-bus read per instruction.
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  // 64-bit shifts keep the stricter limit of one even on gfx10+.
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  // gfx10+ generally allows two constant-bus reads.
  return 2;
}
226a6bae5cbSJay Foad 
/// This list was mostly derived from experimentation.
/// Return true if 16-bit instruction \p Opcode writes zeroes into the unused
/// high 16 bits of its 32-bit destination on this subtarget (as opposed to
/// preserving them).
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
325a6bae5cbSJay Foad 
326a6bae5cbSJay Foad void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
327a6bae5cbSJay Foad                                        unsigned NumRegionInstrs) const {
328a6bae5cbSJay Foad   // Track register pressure so the scheduler can try to decrease
329a6bae5cbSJay Foad   // pressure once register usage is above the threshold defined by
330a6bae5cbSJay Foad   // SIRegisterInfo::getRegPressureSetLimit()
331a6bae5cbSJay Foad   Policy.ShouldTrackPressure = true;
332a6bae5cbSJay Foad 
333a6bae5cbSJay Foad   // Enabling both top down and bottom up scheduling seems to give us less
334a6bae5cbSJay Foad   // register spills than just using one of these approaches on its own.
335a6bae5cbSJay Foad   Policy.OnlyTopDown = false;
336a6bae5cbSJay Foad   Policy.OnlyBottomUp = false;
337a6bae5cbSJay Foad 
338a6bae5cbSJay Foad   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
339a6bae5cbSJay Foad   if (!enableSIScheduler())
340a6bae5cbSJay Foad     Policy.ShouldTrackLaneMasks = true;
341a6bae5cbSJay Foad }
342a6bae5cbSJay Foad 
343a6bae5cbSJay Foad void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
344a6bae5cbSJay Foad   if (isWave32()) {
345a6bae5cbSJay Foad     // Fix implicit $vcc operands after MIParser has verified that they match
346a6bae5cbSJay Foad     // the instruction definitions.
347a6bae5cbSJay Foad     for (auto &MBB : MF) {
348a6bae5cbSJay Foad       for (auto &MI : MBB)
349a6bae5cbSJay Foad         InstrInfo.fixImplicitOperands(MI);
350a6bae5cbSJay Foad     }
351a6bae5cbSJay Foad   }
352a6bae5cbSJay Foad }
353a6bae5cbSJay Foad 
354a6bae5cbSJay Foad bool GCNSubtarget::hasMadF16() const {
355a6bae5cbSJay Foad   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
356a6bae5cbSJay Foad }
357a6bae5cbSJay Foad 
358a6bae5cbSJay Foad bool GCNSubtarget::useVGPRIndexMode() const {
359b02b5b7bSJay Foad   return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
360a6bae5cbSJay Foad }
361a6bae5cbSJay Foad 
362a6bae5cbSJay Foad bool GCNSubtarget::useAA() const { return UseAA; }
363a6bae5cbSJay Foad 
364a6bae5cbSJay Foad unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
365a6bae5cbSJay Foad   return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
366a6bae5cbSJay Foad                                                    getGeneration());
367a6bae5cbSJay Foad }
368a6bae5cbSJay Foad 
369a6bae5cbSJay Foad unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
370a6bae5cbSJay Foad   return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
371a6bae5cbSJay Foad }
372a6bae5cbSJay Foad 
373a6bae5cbSJay Foad unsigned
374a6bae5cbSJay Foad GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
375a6bae5cbSJay Foad   if (getGeneration() >= AMDGPUSubtarget::GFX10)
376a6bae5cbSJay Foad     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
377a6bae5cbSJay Foad 
378a6bae5cbSJay Foad   if (HasFlatScratch || HasArchitectedFlatScratch) {
379a6bae5cbSJay Foad     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
380a6bae5cbSJay Foad       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
381a6bae5cbSJay Foad     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
382a6bae5cbSJay Foad       return 4; // FLAT_SCRATCH, VCC (in that order).
383a6bae5cbSJay Foad   }
384a6bae5cbSJay Foad 
385a6bae5cbSJay Foad   if (isXNACKEnabled())
386a6bae5cbSJay Foad     return 4; // XNACK, VCC (in that order).
387a6bae5cbSJay Foad   return 2;   // VCC.
388a6bae5cbSJay Foad }
389a6bae5cbSJay Foad 
390a6bae5cbSJay Foad unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
391a6bae5cbSJay Foad   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
392a6bae5cbSJay Foad   return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
393a6bae5cbSJay Foad }
394a6bae5cbSJay Foad 
395a6bae5cbSJay Foad unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
396a6bae5cbSJay Foad   // In principle we do not need to reserve SGPR pair used for flat_scratch if
397a6bae5cbSJay Foad   // we know flat instructions do not access the stack anywhere in the
398a6bae5cbSJay Foad   // program. For now assume it's needed if we have flat instructions.
399a6bae5cbSJay Foad   const bool KernelUsesFlatScratch = hasFlatAddressSpace();
400a6bae5cbSJay Foad   return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
401a6bae5cbSJay Foad }
402a6bae5cbSJay Foad 
403*6206f544SLucas Ramirez std::pair<unsigned, unsigned>
404*6206f544SLucas Ramirez GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
405*6206f544SLucas Ramirez                                unsigned NumSGPRs, unsigned NumVGPRs) const {
406*6206f544SLucas Ramirez   auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
407*6206f544SLucas Ramirez   unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
408*6206f544SLucas Ramirez   unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
409*6206f544SLucas Ramirez 
410*6206f544SLucas Ramirez   // Maximum occupancy may be further limited by high SGPR/VGPR usage.
411*6206f544SLucas Ramirez   MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
412*6206f544SLucas Ramirez   return {std::min(MinOcc, MaxOcc), MaxOcc};
413a6bae5cbSJay Foad }
414a6bae5cbSJay Foad 
415a6bae5cbSJay Foad unsigned GCNSubtarget::getBaseMaxNumSGPRs(
416a6bae5cbSJay Foad     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
417a6bae5cbSJay Foad     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
418a6bae5cbSJay Foad   // Compute maximum number of SGPRs function can use using default/requested
419a6bae5cbSJay Foad   // minimum number of waves per execution unit.
420a6bae5cbSJay Foad   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
421a6bae5cbSJay Foad   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
422a6bae5cbSJay Foad 
423a6bae5cbSJay Foad   // Check if maximum number of SGPRs was explicitly requested using
424a6bae5cbSJay Foad   // "amdgpu-num-sgpr" attribute.
425a6bae5cbSJay Foad   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
426a6bae5cbSJay Foad     unsigned Requested =
427a6bae5cbSJay Foad         F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
428a6bae5cbSJay Foad 
429a6bae5cbSJay Foad     // Make sure requested value does not violate subtarget's specifications.
430a6bae5cbSJay Foad     if (Requested && (Requested <= ReservedNumSGPRs))
431a6bae5cbSJay Foad       Requested = 0;
432a6bae5cbSJay Foad 
433a6bae5cbSJay Foad     // If more SGPRs are required to support the input user/system SGPRs,
434a6bae5cbSJay Foad     // increase to accommodate them.
435a6bae5cbSJay Foad     //
436a6bae5cbSJay Foad     // FIXME: This really ends up using the requested number of SGPRs + number
437a6bae5cbSJay Foad     // of reserved special registers in total. Theoretically you could re-use
438a6bae5cbSJay Foad     // the last input registers for these special registers, but this would
439a6bae5cbSJay Foad     // require a lot of complexity to deal with the weird aliasing.
440a6bae5cbSJay Foad     unsigned InputNumSGPRs = PreloadedSGPRs;
441a6bae5cbSJay Foad     if (Requested && Requested < InputNumSGPRs)
442a6bae5cbSJay Foad       Requested = InputNumSGPRs;
443a6bae5cbSJay Foad 
444a6bae5cbSJay Foad     // Make sure requested value is compatible with values implied by
445a6bae5cbSJay Foad     // default/requested minimum/maximum number of waves per execution unit.
446a6bae5cbSJay Foad     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
447a6bae5cbSJay Foad       Requested = 0;
448a6bae5cbSJay Foad     if (WavesPerEU.second && Requested &&
449a6bae5cbSJay Foad         Requested < getMinNumSGPRs(WavesPerEU.second))
450a6bae5cbSJay Foad       Requested = 0;
451a6bae5cbSJay Foad 
452a6bae5cbSJay Foad     if (Requested)
453a6bae5cbSJay Foad       MaxNumSGPRs = Requested;
454a6bae5cbSJay Foad   }
455a6bae5cbSJay Foad 
456a6bae5cbSJay Foad   if (hasSGPRInitBug())
457a6bae5cbSJay Foad     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
458a6bae5cbSJay Foad 
459a6bae5cbSJay Foad   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
460a6bae5cbSJay Foad }
461a6bae5cbSJay Foad 
462a6bae5cbSJay Foad unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
463a6bae5cbSJay Foad   const Function &F = MF.getFunction();
464a6bae5cbSJay Foad   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
465a6bae5cbSJay Foad   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
466a6bae5cbSJay Foad                             getReservedNumSGPRs(MF));
467a6bae5cbSJay Foad }
468a6bae5cbSJay Foad 
469a6bae5cbSJay Foad static unsigned getMaxNumPreloadedSGPRs() {
470a6bae5cbSJay Foad   using USI = GCNUserSGPRUsageInfo;
471a6bae5cbSJay Foad   // Max number of user SGPRs
472a6bae5cbSJay Foad   const unsigned MaxUserSGPRs =
473a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
474a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::DispatchPtrID) +
475a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::QueuePtrID) +
476a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
477a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::DispatchIdID) +
478a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
479a6bae5cbSJay Foad       USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
480a6bae5cbSJay Foad 
481a6bae5cbSJay Foad   // Max number of system SGPRs
482a6bae5cbSJay Foad   const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
483a6bae5cbSJay Foad                                   1 + // WorkGroupIDY
484a6bae5cbSJay Foad                                   1 + // WorkGroupIDZ
485a6bae5cbSJay Foad                                   1 + // WorkGroupInfo
486a6bae5cbSJay Foad                                   1;  // private segment wave byte offset
487a6bae5cbSJay Foad 
488a6bae5cbSJay Foad   // Max number of synthetic SGPRs
489a6bae5cbSJay Foad   const unsigned SyntheticSGPRs = 1; // LDSKernelId
490a6bae5cbSJay Foad 
491a6bae5cbSJay Foad   return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
492a6bae5cbSJay Foad }
493a6bae5cbSJay Foad 
494a6bae5cbSJay Foad unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
495a6bae5cbSJay Foad   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
496a6bae5cbSJay Foad                             getReservedNumSGPRs(F));
497a6bae5cbSJay Foad }
498a6bae5cbSJay Foad 
499a6bae5cbSJay Foad unsigned GCNSubtarget::getBaseMaxNumVGPRs(
500a6bae5cbSJay Foad     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
501a6bae5cbSJay Foad   // Compute maximum number of VGPRs function can use using default/requested
502a6bae5cbSJay Foad   // minimum number of waves per execution unit.
503a6bae5cbSJay Foad   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
504a6bae5cbSJay Foad 
505a6bae5cbSJay Foad   // Check if maximum number of VGPRs was explicitly requested using
506a6bae5cbSJay Foad   // "amdgpu-num-vgpr" attribute.
507a6bae5cbSJay Foad   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
508a6bae5cbSJay Foad     unsigned Requested =
509a6bae5cbSJay Foad         F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
510a6bae5cbSJay Foad 
511a6bae5cbSJay Foad     if (hasGFX90AInsts())
512a6bae5cbSJay Foad       Requested *= 2;
513a6bae5cbSJay Foad 
514a6bae5cbSJay Foad     // Make sure requested value is compatible with values implied by
515a6bae5cbSJay Foad     // default/requested minimum/maximum number of waves per execution unit.
516a6bae5cbSJay Foad     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
517a6bae5cbSJay Foad       Requested = 0;
518a6bae5cbSJay Foad     if (WavesPerEU.second && Requested &&
519a6bae5cbSJay Foad         Requested < getMinNumVGPRs(WavesPerEU.second))
520a6bae5cbSJay Foad       Requested = 0;
521a6bae5cbSJay Foad 
522a6bae5cbSJay Foad     if (Requested)
523a6bae5cbSJay Foad       MaxNumVGPRs = Requested;
524a6bae5cbSJay Foad   }
525a6bae5cbSJay Foad 
526a6bae5cbSJay Foad   return MaxNumVGPRs;
527a6bae5cbSJay Foad }
528a6bae5cbSJay Foad 
529a6bae5cbSJay Foad unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
530a6bae5cbSJay Foad   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
531a6bae5cbSJay Foad }
532a6bae5cbSJay Foad 
533a6bae5cbSJay Foad unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
534a6bae5cbSJay Foad   const Function &F = MF.getFunction();
535a6bae5cbSJay Foad   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
536a6bae5cbSJay Foad   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
537a6bae5cbSJay Foad }
538a6bae5cbSJay Foad 
/// Adjust the latency of a scheduling dependence edge. The generic DAG
/// builder cannot compute sensible latencies when either endpoint is a
/// bundle header, and it mis-handles some implicit VCC operands; both cases
/// are patched up here.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only register data dependences between real machine instructions are
  // adjusted; other edge kinds keep their default latency.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is a bundle header: scan the bundled instructions for the one
    // that actually writes the register. Take that instruction's latency,
    // then decrement once per bundled instruction issued after it, since
    // those issue slots already cover part of the latency.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle header: start from the def's full latency and
    // decrement once per bundled instruction issued before the one that
    // reads the register (stop early if the latency is fully hidden).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}
583a6bae5cbSJay Foad 
584a6bae5cbSJay Foad unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
585a6bae5cbSJay Foad   if (getGeneration() >= AMDGPUSubtarget::GFX12)
586a6bae5cbSJay Foad     return 0; // Not MIMG encoding.
587a6bae5cbSJay Foad 
588a6bae5cbSJay Foad   if (NSAThreshold.getNumOccurrences() > 0)
589a6bae5cbSJay Foad     return std::max(NSAThreshold.getValue(), 2u);
590a6bae5cbSJay Foad 
591a6bae5cbSJay Foad   int Value = MF.getFunction().getFnAttributeAsParsedInteger(
592a6bae5cbSJay Foad       "amdgpu-nsa-threshold", -1);
593a6bae5cbSJay Foad   if (Value > 0)
594a6bae5cbSJay Foad     return std::max(Value, 2);
595a6bae5cbSJay Foad 
5964ce8808dSJay Foad   return NSAThreshold;
597a6bae5cbSJay Foad }
598a6bae5cbSJay Foad 
/// Determine which preloaded user-SGPR inputs function \p F requires on
/// subtarget \p ST, and tally how many user SGPRs those inputs occupy.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  // Kernels need the kernarg segment pointer if they take any explicit
  // arguments or the ABI reserves implicit argument bytes.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  // HSA/Mesa without flat scratch uses a private segment buffer resource
  // descriptor; Mesa graphics shaders instead get an implicit buffer pointer.
  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Compute (non-graphics) functions get dispatch/queue inputs unless the
  // corresponding "amdgpu-no-*" attribute proves they are unused.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  // Accumulate the user-SGPR count for every input enabled above (plus any
  // set by default elsewhere, e.g. PrivateSegmentSize).
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}
667a6bae5cbSJay Foad 
668a6bae5cbSJay Foad void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
669a6bae5cbSJay Foad   assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
670a6bae5cbSJay Foad   NumKernargPreloadSGPRs += NumSGPRs;
671a6bae5cbSJay Foad   NumUsedUserSGPRs += NumSGPRs;
672a6bae5cbSJay Foad }
673a6bae5cbSJay Foad 
674a6bae5cbSJay Foad unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
675a6bae5cbSJay Foad   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
676a6bae5cbSJay Foad }
677