xref: /llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (revision 18f8106f310ee702046a11f360af47947c030d2e)
1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file contains both the AMDGPU target machine and the CodeGen pass
11 /// builder. The AMDGPU target machine contains all of the hardware-specific
12 /// information needed to emit code for SI+ GPUs in the legacy pass manager
13 /// pipeline; the CodeGen pass builder handles the new pass manager pipeline.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPU.h"
19 #include "AMDGPUAliasAnalysis.h"
20 #include "AMDGPUCtorDtorLowering.h"
21 #include "AMDGPUExportClustering.h"
22 #include "AMDGPUIGroupLP.h"
23 #include "AMDGPUISelDAGToDAG.h"
24 #include "AMDGPUMacroFusion.h"
25 #include "AMDGPUOpenCLEnqueuedBlockLowering.h"
26 #include "AMDGPUPerfHintAnalysis.h"
27 #include "AMDGPURemoveIncompatibleFunctions.h"
28 #include "AMDGPUSplitModule.h"
29 #include "AMDGPUTargetObjectFile.h"
30 #include "AMDGPUTargetTransformInfo.h"
31 #include "AMDGPUUnifyDivergentExitNodes.h"
32 #include "GCNDPPCombine.h"
33 #include "GCNIterativeScheduler.h"
34 #include "GCNSchedStrategy.h"
35 #include "GCNVOPDUtils.h"
36 #include "R600.h"
37 #include "R600TargetMachine.h"
38 #include "SIFixSGPRCopies.h"
39 #include "SIFixVGPRCopies.h"
40 #include "SIFoldOperands.h"
41 #include "SILoadStoreOptimizer.h"
42 #include "SILowerControlFlow.h"
43 #include "SILowerSGPRSpills.h"
44 #include "SILowerWWMCopies.h"
45 #include "SIMachineFunctionInfo.h"
46 #include "SIMachineScheduler.h"
47 #include "SIOptimizeExecMasking.h"
48 #include "SIOptimizeVGPRLiveRange.h"
49 #include "SIPeepholeSDWA.h"
50 #include "SIPreAllocateWWMRegs.h"
51 #include "SIShrinkInstructions.h"
52 #include "TargetInfo/AMDGPUTargetInfo.h"
53 #include "Utils/AMDGPUBaseInfo.h"
54 #include "llvm/Analysis/CGSCCPassManager.h"
55 #include "llvm/Analysis/CallGraphSCCPass.h"
56 #include "llvm/Analysis/KernelInfo.h"
57 #include "llvm/Analysis/UniformityAnalysis.h"
58 #include "llvm/CodeGen/AtomicExpand.h"
59 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
60 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
61 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
62 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
63 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
64 #include "llvm/CodeGen/GlobalISel/Localizer.h"
65 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
66 #include "llvm/CodeGen/MIRParser/MIParser.h"
67 #include "llvm/CodeGen/MachineCSE.h"
68 #include "llvm/CodeGen/MachineLICM.h"
69 #include "llvm/CodeGen/Passes.h"
70 #include "llvm/CodeGen/RegAllocRegistry.h"
71 #include "llvm/CodeGen/TargetPassConfig.h"
72 #include "llvm/IR/IntrinsicsAMDGPU.h"
73 #include "llvm/IR/PassManager.h"
74 #include "llvm/IR/PatternMatch.h"
75 #include "llvm/InitializePasses.h"
76 #include "llvm/MC/TargetRegistry.h"
77 #include "llvm/Passes/PassBuilder.h"
78 #include "llvm/Support/FormatVariadic.h"
79 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
80 #include "llvm/Transforms/IPO.h"
81 #include "llvm/Transforms/IPO/AlwaysInliner.h"
82 #include "llvm/Transforms/IPO/ExpandVariadics.h"
83 #include "llvm/Transforms/IPO/GlobalDCE.h"
84 #include "llvm/Transforms/IPO/Internalize.h"
85 #include "llvm/Transforms/Scalar.h"
86 #include "llvm/Transforms/Scalar/EarlyCSE.h"
87 #include "llvm/Transforms/Scalar/FlattenCFG.h"
88 #include "llvm/Transforms/Scalar/GVN.h"
89 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
90 #include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
91 #include "llvm/Transforms/Scalar/NaryReassociate.h"
92 #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
93 #include "llvm/Transforms/Scalar/Sink.h"
94 #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
95 #include "llvm/Transforms/Scalar/StructurizeCFG.h"
96 #include "llvm/Transforms/Utils.h"
97 #include "llvm/Transforms/Utils/FixIrreducible.h"
98 #include "llvm/Transforms/Utils/LCSSA.h"
99 #include "llvm/Transforms/Utils/LowerSwitch.h"
100 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
101 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
102 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
103 #include <optional>
104 
105 using namespace llvm;
106 using namespace llvm::PatternMatch;
107 
108 namespace {
109 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
110 public:
111   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
112     : RegisterRegAllocBase(N, D, C) {}
113 };
114 
115 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
116 public:
117   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
118     : RegisterRegAllocBase(N, D, C) {}
119 };
120 
121 class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
122 public:
123   WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
124       : RegisterRegAllocBase(N, D, C) {}
125 };
126 
127 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
128                               const MachineRegisterInfo &MRI,
129                               const Register Reg) {
130   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
131   return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
132 }
133 
134 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
135                               const MachineRegisterInfo &MRI,
136                               const Register Reg) {
137   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
138   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
139 }
140 
141 static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
142                                 const MachineRegisterInfo &MRI,
143                                 const Register Reg) {
144   const SIMachineFunctionInfo *MFI =
145       MRI.getMF().getInfo<SIMachineFunctionInfo>();
146   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
147   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
148          MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
149 }
150 
151 /// -{sgpr|wwm|vgpr}-regalloc=... command line option.
152 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
153 
154 /// A dummy default pass factory that indicates whether the register allocator
155 /// has been overridden on the command line.
156 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
157 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
158 static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
159 
160 static SGPRRegisterRegAlloc
161 defaultSGPRRegAlloc("default",
162                     "pick SGPR register allocator based on -O option",
163                     useDefaultRegisterAllocator);
164 
165 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
166                RegisterPassParser<SGPRRegisterRegAlloc>>
167 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
168              cl::desc("Register allocator to use for SGPRs"));
169 
170 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
171                RegisterPassParser<VGPRRegisterRegAlloc>>
172 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
173              cl::desc("Register allocator to use for VGPRs"));
174 
175 static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
176                RegisterPassParser<WWMRegisterRegAlloc>>
177     WWMRegAlloc("wwm-regalloc", cl::Hidden,
178                 cl::init(&useDefaultRegisterAllocator),
179                 cl::desc("Register allocator to use for WWM registers"));
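// For example, a hypothetical llc invocation selecting each of the three
// allocators defined above explicitly:
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast \
//       -wwm-regalloc=basic input.ll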
180 
181 static void initializeDefaultSGPRRegisterAllocatorOnce() {
182   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
183 
184   if (!Ctor) {
185     Ctor = SGPRRegAlloc;
186     SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
187   }
188 }
189 
190 static void initializeDefaultVGPRRegisterAllocatorOnce() {
191   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
192 
193   if (!Ctor) {
194     Ctor = VGPRRegAlloc;
195     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
196   }
197 }
198 
199 static void initializeDefaultWWMRegisterAllocatorOnce() {
200   RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
201 
202   if (!Ctor) {
203     Ctor = WWMRegAlloc;
204     WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
205   }
206 }
207 
208 static FunctionPass *createBasicSGPRRegisterAllocator() {
209   return createBasicRegisterAllocator(onlyAllocateSGPRs);
210 }
211 
212 static FunctionPass *createGreedySGPRRegisterAllocator() {
213   return createGreedyRegisterAllocator(onlyAllocateSGPRs);
214 }
215 
216 static FunctionPass *createFastSGPRRegisterAllocator() {
217   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
218 }
219 
220 static FunctionPass *createBasicVGPRRegisterAllocator() {
221   return createBasicRegisterAllocator(onlyAllocateVGPRs);
222 }
223 
224 static FunctionPass *createGreedyVGPRRegisterAllocator() {
225   return createGreedyRegisterAllocator(onlyAllocateVGPRs);
226 }
227 
228 static FunctionPass *createFastVGPRRegisterAllocator() {
229   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
230 }
231 
232 static FunctionPass *createBasicWWMRegisterAllocator() {
233   return createBasicRegisterAllocator(onlyAllocateWWMRegs);
234 }
235 
236 static FunctionPass *createGreedyWWMRegisterAllocator() {
237   return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
238 }
239 
240 static FunctionPass *createFastWWMRegisterAllocator() {
241   return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
242 }
243 
244 static SGPRRegisterRegAlloc basicRegAllocSGPR(
245   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
246 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
247   "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
248 
249 static SGPRRegisterRegAlloc fastRegAllocSGPR(
250   "fast", "fast register allocator", createFastSGPRRegisterAllocator);
251 
252 
253 static VGPRRegisterRegAlloc basicRegAllocVGPR(
254   "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
255 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
256   "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
257 
258 static VGPRRegisterRegAlloc fastRegAllocVGPR(
259   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
260 static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
261                                                "basic register allocator",
262                                                createBasicWWMRegisterAllocator);
263 static WWMRegisterRegAlloc
264     greedyRegAllocWWMReg("greedy", "greedy register allocator",
265                          createGreedyWWMRegisterAllocator);
266 static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
267                                               createFastWWMRegisterAllocator);
268 
269 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
270   return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
271          Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
272 }
273 } // anonymous namespace
274 
275 static cl::opt<bool>
276 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
277                         cl::desc("Run early if-conversion"),
278                         cl::init(false));
279 
280 static cl::opt<bool>
281 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
282             cl::desc("Run pre-RA exec mask optimizations"),
283             cl::init(true));
284 
285 static cl::opt<bool>
286     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
287                   cl::desc("Lower GPU ctor / dtors to globals on the device."),
288                   cl::init(true), cl::Hidden);
289 
290 // Option to disable vectorizer for tests.
291 static cl::opt<bool> EnableLoadStoreVectorizer(
292   "amdgpu-load-store-vectorizer",
293   cl::desc("Enable load store vectorizer"),
294   cl::init(true),
295   cl::Hidden);
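// E.g., a lit test can disable it with something like (illustrative):
//   llc -mtriple=amdgcn -amdgpu-load-store-vectorizer=0 input.ll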
296 
297 // Option to control global load scalarization
298 static cl::opt<bool> ScalarizeGlobal(
299   "amdgpu-scalarize-global-loads",
300   cl::desc("Enable global load scalarization"),
301   cl::init(true),
302   cl::Hidden);
303 
304 // Option to run internalize pass.
305 static cl::opt<bool> InternalizeSymbols(
306   "amdgpu-internalize-symbols",
307   cl::desc("Enable elimination of non-kernel functions and unused globals"),
308   cl::init(false),
309   cl::Hidden);
310 
311 // Option to inline all early.
312 static cl::opt<bool> EarlyInlineAll(
313   "amdgpu-early-inline-all",
314   cl::desc("Inline all functions early"),
315   cl::init(false),
316   cl::Hidden);
317 
318 static cl::opt<bool> RemoveIncompatibleFunctions(
319     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
320     cl::desc("Enable removal of functions when they "
321              "use features not supported by the target GPU"),
322     cl::init(true));
323 
324 static cl::opt<bool> EnableSDWAPeephole(
325   "amdgpu-sdwa-peephole",
326   cl::desc("Enable SDWA peepholer"),
327   cl::init(true));
328 
329 static cl::opt<bool> EnableDPPCombine(
330   "amdgpu-dpp-combine",
331   cl::desc("Enable DPP combiner"),
332   cl::init(true));
333 
334 // Enable address space based alias analysis
335 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
336   cl::desc("Enable AMDGPU Alias Analysis"),
337   cl::init(true));
338 
339 // Enable library call simplifications
340 static cl::opt<bool> EnableLibCallSimplify(
341   "amdgpu-simplify-libcall",
342   cl::desc("Enable amdgpu library simplifications"),
343   cl::init(true),
344   cl::Hidden);
345 
346 static cl::opt<bool> EnableLowerKernelArguments(
347   "amdgpu-ir-lower-kernel-arguments",
348   cl::desc("Lower kernel argument loads in IR pass"),
349   cl::init(true),
350   cl::Hidden);
351 
352 static cl::opt<bool> EnableRegReassign(
353   "amdgpu-reassign-regs",
354   cl::desc("Enable register reassign optimizations on gfx10+"),
355   cl::init(true),
356   cl::Hidden);
357 
358 static cl::opt<bool> OptVGPRLiveRange(
359     "amdgpu-opt-vgpr-liverange",
360     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
361     cl::init(true), cl::Hidden);
362 
363 static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
364     "amdgpu-atomic-optimizer-strategy",
365     cl::desc("Select DPP or Iterative strategy for scan"),
366     cl::init(ScanOptions::Iterative),
367     cl::values(
368         clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
369         clEnumValN(ScanOptions::Iterative, "Iterative",
370                    "Use Iterative approach for scan"),
371         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
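// E.g. (illustrative invocation selecting the DPP strategy):
//   llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP input.ll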
372 
373 // Enable Mode register optimization
374 static cl::opt<bool> EnableSIModeRegisterPass(
375   "amdgpu-mode-register",
376   cl::desc("Enable mode register pass"),
377   cl::init(true),
378   cl::Hidden);
379 
380 // Enable GFX11+ s_delay_alu insertion
381 static cl::opt<bool>
382     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
383                          cl::desc("Enable s_delay_alu insertion"),
384                          cl::init(true), cl::Hidden);
385 
386 // Enable GFX11+ VOPD
387 static cl::opt<bool>
388     EnableVOPD("amdgpu-enable-vopd",
389                cl::desc("Enable VOPD, dual issue of VALU in wave32"),
390                cl::init(true), cl::Hidden);
391 
392 // Used in lit tests to prevent dead-code elimination of the patterns inspected.
393 static cl::opt<bool>
394 EnableDCEInRA("amdgpu-dce-in-ra",
395     cl::init(true), cl::Hidden,
396     cl::desc("Enable machine DCE inside regalloc"));
397 
398 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
399                                            cl::desc("Adjust wave priority"),
400                                            cl::init(false), cl::Hidden);
401 
402 static cl::opt<bool> EnableScalarIRPasses(
403   "amdgpu-scalar-ir-passes",
404   cl::desc("Enable scalar IR passes"),
405   cl::init(true),
406   cl::Hidden);
407 
408 static cl::opt<bool>
409     EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
410                      cl::desc("Enable lowering of lds to global memory pass "
411                               "and asan instrument resulting IR."),
412                      cl::init(true), cl::Hidden);
413 
414 static cl::opt<bool, true> EnableLowerModuleLDS(
415     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
416     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
417     cl::Hidden);
418 
419 static cl::opt<bool> EnablePreRAOptimizations(
420     "amdgpu-enable-pre-ra-optimizations",
421     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
422     cl::Hidden);
423 
424 static cl::opt<bool> EnablePromoteKernelArguments(
425     "amdgpu-enable-promote-kernel-arguments",
426     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
427     cl::Hidden, cl::init(true));
428 
429 static cl::opt<bool> EnableImageIntrinsicOptimizer(
430     "amdgpu-enable-image-intrinsic-optimizer",
431     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
432     cl::Hidden);
433 
434 static cl::opt<bool>
435     EnableLoopPrefetch("amdgpu-loop-prefetch",
436                        cl::desc("Enable loop data prefetch on AMDGPU"),
437                        cl::Hidden, cl::init(false));
438 
439 static cl::opt<std::string>
440     AMDGPUSchedStrategy("amdgpu-sched-strategy",
441                         cl::desc("Select custom AMDGPU scheduling strategy."),
442                         cl::Hidden, cl::init(""));
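// The accepted names mirror the checks in GCNPassConfig::createMachineScheduler
// below ("max-ilp", "max-memory-clause"), e.g. (illustrative):
//   llc -mtriple=amdgcn -amdgpu-sched-strategy=max-ilp input.ll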
443 
444 static cl::opt<bool> EnableRewritePartialRegUses(
445     "amdgpu-enable-rewrite-partial-reg-uses",
446     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
447     cl::Hidden);
448 
449 static cl::opt<bool> EnableHipStdPar(
450   "amdgpu-enable-hipstdpar",
451   cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
452   cl::Hidden);
453 
454 static cl::opt<bool>
455     EnableAMDGPUAttributor("amdgpu-attributor-enable",
456                            cl::desc("Enable AMDGPUAttributorPass"),
457                            cl::init(true), cl::Hidden);
458 
459 static cl::opt<bool> NewRegBankSelect(
460     "new-reg-bank-select",
461     cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
462              "regbankselect"),
463     cl::init(false), cl::Hidden);
464 
465 static cl::opt<bool> HasClosedWorldAssumption(
466     "amdgpu-link-time-closed-world",
467     cl::desc("Whether the closed-world assumption holds at link time"),
468     cl::init(false), cl::Hidden);
469 
470 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
471   // Register the target
472   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
473   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
474 
475   PassRegistry *PR = PassRegistry::getPassRegistry();
476   initializeR600ClauseMergePassPass(*PR);
477   initializeR600ControlFlowFinalizerPass(*PR);
478   initializeR600PacketizerPass(*PR);
479   initializeR600ExpandSpecialInstrsPassPass(*PR);
480   initializeR600VectorRegMergerPass(*PR);
481   initializeGlobalISel(*PR);
482   initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
483   initializeGCNDPPCombineLegacyPass(*PR);
484   initializeSILowerI1CopiesLegacyPass(*PR);
485   initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
486   initializeAMDGPURegBankSelectPass(*PR);
487   initializeAMDGPURegBankLegalizePass(*PR);
488   initializeSILowerWWMCopiesLegacyPass(*PR);
489   initializeAMDGPUMarkLastScratchLoadPass(*PR);
490   initializeSILowerSGPRSpillsLegacyPass(*PR);
491   initializeSIFixSGPRCopiesLegacyPass(*PR);
492   initializeSIFixVGPRCopiesLegacyPass(*PR);
493   initializeSIFoldOperandsLegacyPass(*PR);
494   initializeSIPeepholeSDWALegacyPass(*PR);
495   initializeSIShrinkInstructionsLegacyPass(*PR);
496   initializeSIOptimizeExecMaskingPreRAPass(*PR);
497   initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
498   initializeSILoadStoreOptimizerLegacyPass(*PR);
499   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
500   initializeAMDGPUAlwaysInlinePass(*PR);
501   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
502   initializeAMDGPUAttributorLegacyPass(*PR);
503   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
504   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
505   initializeAMDGPUArgumentUsageInfoPass(*PR);
506   initializeAMDGPUAtomicOptimizerPass(*PR);
507   initializeAMDGPULowerKernelArgumentsPass(*PR);
508   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
509   initializeAMDGPULowerKernelAttributesPass(*PR);
510   initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR);
511   initializeAMDGPUPostLegalizerCombinerPass(*PR);
512   initializeAMDGPUPreLegalizerCombinerPass(*PR);
513   initializeAMDGPURegBankCombinerPass(*PR);
514   initializeAMDGPUPromoteAllocaPass(*PR);
515   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
516   initializeAMDGPUCodeGenPreparePass(*PR);
517   initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
518   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
519   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
520   initializeAMDGPULowerBufferFatPointersPass(*PR);
521   initializeAMDGPUReserveWWMRegsPass(*PR);
522   initializeAMDGPURewriteOutArgumentsPass(*PR);
523   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
524   initializeAMDGPUUnifyMetadataPass(*PR);
525   initializeSIAnnotateControlFlowLegacyPass(*PR);
526   initializeAMDGPUInsertDelayAluPass(*PR);
527   initializeSIInsertHardClausesPass(*PR);
528   initializeSIInsertWaitcntsPass(*PR);
529   initializeSIModeRegisterPass(*PR);
530   initializeSIWholeQuadModePass(*PR);
531   initializeSILowerControlFlowLegacyPass(*PR);
532   initializeSIPreEmitPeepholePass(*PR);
533   initializeSILateBranchLoweringPass(*PR);
534   initializeSIMemoryLegalizerPass(*PR);
535   initializeSIOptimizeExecMaskingLegacyPass(*PR);
536   initializeSIPreAllocateWWMRegsLegacyPass(*PR);
537   initializeSIFormMemoryClausesPass(*PR);
538   initializeSIPostRABundlerPass(*PR);
539   initializeGCNCreateVOPDPass(*PR);
540   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
541   initializeAMDGPUAAWrapperPassPass(*PR);
542   initializeAMDGPUExternalAAWrapperPass(*PR);
543   initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
544   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
545   initializeAMDGPUResourceUsageAnalysisPass(*PR);
546   initializeGCNNSAReassignPass(*PR);
547   initializeGCNPreRAOptimizationsPass(*PR);
548   initializeGCNPreRALongBranchRegPass(*PR);
549   initializeGCNRewritePartialRegUsesPass(*PR);
550   initializeGCNRegPressurePrinterPass(*PR);
551   initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
552 }
553 
554 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
555   return std::make_unique<AMDGPUTargetObjectFile>();
556 }
557 
558 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
559   return new SIScheduleDAGMI(C);
560 }
561 
562 static ScheduleDAGInstrs *
563 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
564   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
565   ScheduleDAGMILive *DAG =
566     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
567   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
568   if (ST.shouldClusterStores())
569     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
570   DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
571   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
572   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
573   return DAG;
574 }
575 
576 static ScheduleDAGInstrs *
577 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
578   ScheduleDAGMILive *DAG =
579       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
580   DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
581   return DAG;
582 }
583 
584 static ScheduleDAGInstrs *
585 createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
586   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
587   ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
588       C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
589   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
590   if (ST.shouldClusterStores())
591     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
592   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
593   return DAG;
594 }
595 
596 static ScheduleDAGInstrs *
597 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
598   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
599   auto *DAG = new GCNIterativeScheduler(
600       C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
601   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
602   if (ST.shouldClusterStores())
603     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
604   return DAG;
605 }
606 
607 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
608   return new GCNIterativeScheduler(C,
609     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
610 }
611 
612 static ScheduleDAGInstrs *
613 createIterativeILPMachineScheduler(MachineSchedContext *C) {
614   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
615   auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
616   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
617   if (ST.shouldClusterStores())
618     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
619   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
620   return DAG;
621 }
622 
623 static MachineSchedRegistry
624 SISchedRegistry("si", "Run SI's custom scheduler",
625                 createSIMachineScheduler);
626 
627 static MachineSchedRegistry
628 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
629                              "Run GCN scheduler to maximize occupancy",
630                              createGCNMaxOccupancyMachineScheduler);
631 
632 static MachineSchedRegistry
633     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
634                            createGCNMaxILPMachineScheduler);
635 
636 static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
637     "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
638     createGCNMaxMemoryClauseMachineScheduler);
639 
640 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
641     "gcn-iterative-max-occupancy-experimental",
642     "Run GCN scheduler to maximize occupancy (experimental)",
643     createIterativeGCNMaxOccupancyMachineScheduler);
644 
645 static MachineSchedRegistry GCNMinRegSchedRegistry(
646     "gcn-iterative-minreg",
647     "Run GCN iterative scheduler for minimal register usage (experimental)",
648     createMinRegScheduler);
649 
650 static MachineSchedRegistry GCNILPSchedRegistry(
651     "gcn-iterative-ilp",
652     "Run GCN iterative scheduler for ILP scheduling (experimental)",
653     createIterativeILPMachineScheduler);
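// These registries plug into the generic machine-scheduler selector, so an
// illustrative invocation picking one of them might look like:
//   llc -mtriple=amdgcn -misched=gcn-max-ilp input.ll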
654 
655 static StringRef computeDataLayout(const Triple &TT) {
656   if (TT.getArch() == Triple::r600) {
657     // 32-bit pointers.
658     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
659            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
660   }
661 
662   // 32-bit private, local, and region pointers. 64-bit global, constant and
663   // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
664   // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
665   // (address space 7), and 128-bit non-integral buffer resources (address
666   // space 8) which cannot be non-trivially accessed by LLVM memory operations
667   // like getelementptr.
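  // For reference, the numbered address spaces used above are, per AMDGPUAS:
  // 0 flat, 1 global, 2 region, 3 local (LDS), 4 constant, 5 private (scratch),
  // 6 32-bit constant, 7 buffer fat pointer, 8 buffer resource, and 9 buffer
  // strided pointer.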
668   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
669          "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
670          "v32:32-v48:64-v96:"
671          "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
672          "G1-ni:7:8:9";
673 }
674 
675 LLVM_READNONE
676 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
677   if (!GPU.empty())
678     return GPU;
679 
680   // Need to default to a target with flat support for HSA.
681   if (TT.getArch() == Triple::amdgcn)
682     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
683 
684   return "r600";
685 }
686 
687 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
688   // The AMDGPU toolchain only supports generating shared objects, so we
689   // must always use PIC.
690   return Reloc::PIC_;
691 }
692 
693 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
694                                          StringRef CPU, StringRef FS,
695                                          const TargetOptions &Options,
696                                          std::optional<Reloc::Model> RM,
697                                          std::optional<CodeModel::Model> CM,
698                                          CodeGenOptLevel OptLevel)
699     : CodeGenTargetMachineImpl(
700           T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options,
701           getEffectiveRelocModel(RM),
702           getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
703       TLOF(createTLOF(getTargetTriple())) {
704   initAsmInfo();
705   if (TT.getArch() == Triple::amdgcn) {
706     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
707       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
708     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
709       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
710   }
711 }
712 
713 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
714 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
715 
716 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
717 
718 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
719   Attribute GPUAttr = F.getFnAttribute("target-cpu");
720   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
721 }
722 
723 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
724   Attribute FSAttr = F.getFnAttribute("target-features");
725 
726   return FSAttr.isValid() ? FSAttr.getValueAsString()
727                           : getTargetFeatureString();
728 }
729 
730 /// Predicate for Internalize pass.
731 static bool mustPreserveGV(const GlobalValue &GV) {
732   if (const Function *F = dyn_cast<Function>(&GV))
733     return F->isDeclaration() || F->getName().starts_with("__asan_") ||
734            F->getName().starts_with("__sanitizer_") ||
735            AMDGPU::isEntryFunctionCC(F->getCallingConv());
736 
737   GV.removeDeadConstantUsers();
738   return !GV.use_empty();
739 }
740 
741 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
742   AAM.registerFunctionAnalysis<AMDGPUAA>();
743 }
744 
745 static Expected<ScanOptions>
746 parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
747   if (Params.empty())
748     return ScanOptions::Iterative;
749   Params.consume_front("strategy=");
750   auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
751                     .Case("dpp", ScanOptions::DPP)
752                     .Cases("iterative", "", ScanOptions::Iterative)
753                     .Case("none", ScanOptions::None)
754                     .Default(std::nullopt);
755   if (Result)
756     return *Result;
757   return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
758 }
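
// A sketch of how the parser above is reached through the new pass manager's
// textual pipeline syntax (the registration itself lives in
// AMDGPUPassRegistry.def), e.g.:
//   opt -passes='amdgpu-atomic-optimizer<strategy=dpp>' input.ll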
759 
760 Expected<AMDGPUAttributorOptions>
761 parseAMDGPUAttributorPassOptions(StringRef Params) {
762   AMDGPUAttributorOptions Result;
763   while (!Params.empty()) {
764     StringRef ParamName;
765     std::tie(ParamName, Params) = Params.split(';');
766     if (ParamName == "closed-world") {
767       Result.IsClosedWorld = true;
768     } else {
769       return make_error<StringError>(
770           formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
771               .str(),
772           inconvertibleErrorCode());
773     }
774   }
775   return Result;
776 }
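
// Similarly, an illustrative invocation requesting the closed-world option
// parsed above:
//   opt -passes='amdgpu-attributor<closed-world>' input.ll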
777 
778 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
779 
780 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
781 #include "llvm/Passes/TargetPassRegistry.inc"
782 
783   PB.registerPipelineStartEPCallback(
784       [](ModulePassManager &PM, OptimizationLevel Level) {
785         if (EnableHipStdPar)
786           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
787       });
788 
789   PB.registerPipelineEarlySimplificationEPCallback(
790       [](ModulePassManager &PM, OptimizationLevel Level,
791          ThinOrFullLTOPhase Phase) {
792         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
793 
794         if (Level == OptimizationLevel::O0)
795           return;
796 
797         PM.addPass(AMDGPUUnifyMetadataPass());
798 
799         // We don't want to run internalization at the per-module stage.
800         if (InternalizeSymbols && !isLTOPreLink(Phase)) {
801           PM.addPass(InternalizePass(mustPreserveGV));
802           PM.addPass(GlobalDCEPass());
803         }
804 
805         if (EarlyInlineAll && !EnableFunctionCalls)
806           PM.addPass(AMDGPUAlwaysInlinePass());
807       });
808 
809   PB.registerPeepholeEPCallback(
810       [](FunctionPassManager &FPM, OptimizationLevel Level) {
811         if (Level == OptimizationLevel::O0)
812           return;
813 
814         FPM.addPass(AMDGPUUseNativeCallsPass());
815         if (EnableLibCallSimplify)
816           FPM.addPass(AMDGPUSimplifyLibCallsPass());
817       });
818 
819   PB.registerCGSCCOptimizerLateEPCallback(
820       [this](CGSCCPassManager &PM, OptimizationLevel Level) {
821         if (Level == OptimizationLevel::O0)
822           return;
823 
824         FunctionPassManager FPM;
825 
826         // Add the promote-kernel-arguments pass to the opt pipeline right
827         // before infer-address-spaces, which is needed to do the actual
828         // address space rewriting.
829         if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
830             EnablePromoteKernelArguments)
831           FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
832 
833         // Add infer address spaces pass to the opt pipeline after inlining
834         // but before SROA to increase SROA opportunities.
835         FPM.addPass(InferAddressSpacesPass());
836 
837         // This should run after inlining to have any chance of doing
838         // anything, and before other cleanup optimizations.
839         FPM.addPass(AMDGPULowerKernelAttributesPass());
840 
841         if (Level != OptimizationLevel::O0) {
842           // Promote alloca to vector before SROA and loop unrolling. If we
843           // manage to eliminate allocas before unrolling, we may choose to
844           // unroll less.
845           FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
846         }
847 
848         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
849       });
850 
851   // FIXME: Why is AMDGPUAttributor not in CGSCC?
852   PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
853                                             OptimizationLevel Level,
854                                             ThinOrFullLTOPhase Phase) {
855     if (Level != OptimizationLevel::O0) {
856       if (!isLTOPreLink(Phase))
857         MPM.addPass(AMDGPUAttributorPass(*this));
858     }
859   });
860 
861   PB.registerFullLinkTimeOptimizationLastEPCallback(
862       [this](ModulePassManager &PM, OptimizationLevel Level) {
863         // We want to support the -lto-partitions=N option as "best effort".
864         // For that, we need to lower LDS earlier in the pipeline before the
865         // module is partitioned for codegen.
866         if (EnableSwLowerLDS)
867           PM.addPass(AMDGPUSwLowerLDSPass(*this));
868         if (EnableLowerModuleLDS)
869           PM.addPass(AMDGPULowerModuleLDSPass(*this));
870         if (Level != OptimizationLevel::O0) {
871           // Do we really need internalization in LTO?
872           if (InternalizeSymbols) {
873             PM.addPass(InternalizePass(mustPreserveGV));
874             PM.addPass(GlobalDCEPass());
875           }
876           if (EnableAMDGPUAttributor) {
877             AMDGPUAttributorOptions Opt;
878             if (HasClosedWorldAssumption)
879               Opt.IsClosedWorld = true;
880             PM.addPass(AMDGPUAttributorPass(*this, Opt));
881           }
882         }
883         if (!NoKernelInfoEndLTO) {
884           FunctionPassManager FPM;
885           FPM.addPass(KernelInfoPrinter(this));
886           PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
887         }
888       });
889 
890   PB.registerRegClassFilterParsingCallback(
891       [](StringRef FilterName) -> RegAllocFilterFunc {
892         if (FilterName == "sgpr")
893           return onlyAllocateSGPRs;
894         if (FilterName == "vgpr")
895           return onlyAllocateVGPRs;
896         if (FilterName == "wwm")
897           return onlyAllocateWWMRegs;
898         return nullptr;
899       });
900 }
901 
902 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
903   return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
904           AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
905           AddrSpace == AMDGPUAS::REGION_ADDRESS)
906              ? -1
907              : 0;
908 }
909 
910 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
911                                               unsigned DestAS) const {
912   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
913          AMDGPU::isFlatGlobalAddrSpace(DestAS);
914 }
915 
916 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
917   const auto *LD = dyn_cast<LoadInst>(V);
918   if (!LD) // TODO: Handle invariant load like constant.
919     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
920 
921   // It must be a load of a generic pointer.
922   assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
923 
924   const auto *Ptr = LD->getPointerOperand();
925   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
926     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
927   // A generic pointer loaded from constant memory can be assumed to be a
928   // global pointer, since constant memory is only populated on the host side
929   // and, as implied by the offload programming model, only global pointers
930   // can be referenced on the host side.
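  // E.g. (illustrative IR): %p = load ptr, ptr addrspace(4) %q loads a flat
  // pointer from constant memory, so %p is assumed to point to global memory.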
931   return AMDGPUAS::GLOBAL_ADDRESS;
932 }
933 
934 std::pair<const Value *, unsigned>
935 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
936   if (auto *II = dyn_cast<IntrinsicInst>(V)) {
937     switch (II->getIntrinsicID()) {
938     case Intrinsic::amdgcn_is_shared:
939       return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
940     case Intrinsic::amdgcn_is_private:
941       return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
942     default:
943       break;
944     }
945     return std::pair(nullptr, -1);
946   }
947   // Check the global pointer predication based on
948   // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
949   // and the order of 'is_shared' and 'is_private' is not significant.
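  // E.g. (illustrative IR for the matched pattern):
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p)
  //   %is.priv   = call i1 @llvm.amdgcn.is.private(ptr %p)
  //   %not.shared = xor i1 %is.shared, true
  //   %not.priv   = xor i1 %is.priv, true
  //   %is.global  = and i1 %not.shared, %not.priv  ; then %p is treated as global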
950   Value *Ptr;
951   if (match(
952           const_cast<Value *>(V),
953           m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
954                   m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
955                       m_Deferred(Ptr))))))
956     return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
957 
958   return std::pair(nullptr, -1);
959 }
960 
961 unsigned
962 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
963   switch (Kind) {
964   case PseudoSourceValue::Stack:
965   case PseudoSourceValue::FixedStack:
966     return AMDGPUAS::PRIVATE_ADDRESS;
967   case PseudoSourceValue::ConstantPool:
968   case PseudoSourceValue::GOT:
969   case PseudoSourceValue::JumpTable:
970   case PseudoSourceValue::GlobalValueCallEntry:
971   case PseudoSourceValue::ExternalSymbolCallEntry:
972     return AMDGPUAS::CONSTANT_ADDRESS;
973   }
974   return AMDGPUAS::FLAT_ADDRESS;
975 }
976 
977 bool AMDGPUTargetMachine::splitModule(
978     Module &M, unsigned NumParts,
979     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
980   // FIXME(?): Would be better to use an already existing Analysis/PassManager,
981   // but none of the current users of this API have one ready, and they would
982   // need to create one anyway. Let's hide the boilerplate to keep it simple.
983 
984   LoopAnalysisManager LAM;
985   FunctionAnalysisManager FAM;
986   CGSCCAnalysisManager CGAM;
987   ModuleAnalysisManager MAM;
988 
989   PassBuilder PB(this);
990   PB.registerModuleAnalyses(MAM);
991   PB.registerFunctionAnalyses(FAM);
992   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
993 
994   ModulePassManager MPM;
995   MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
996   MPM.run(M, MAM);
997   return true;
998 }
999 
1000 //===----------------------------------------------------------------------===//
1001 // GCN Target Machine (SI+)
1002 //===----------------------------------------------------------------------===//
1003 
1004 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
1005                                    StringRef CPU, StringRef FS,
1006                                    const TargetOptions &Options,
1007                                    std::optional<Reloc::Model> RM,
1008                                    std::optional<CodeModel::Model> CM,
1009                                    CodeGenOptLevel OL, bool JIT)
1010     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
1011 
1012 const TargetSubtargetInfo *
1013 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
1014   StringRef GPU = getGPUName(F);
1015   StringRef FS = getFeatureString(F);
1016 
1017   SmallString<128> SubtargetKey(GPU);
1018   SubtargetKey.append(FS);
1019 
1020   auto &I = SubtargetMap[SubtargetKey];
1021   if (!I) {
1022     // This needs to be done before we create a new subtarget since any
1023     // creation will depend on the TM and the code generation flags on the
1024     // function that reside in TargetOptions.
1025     resetTargetOptions(F);
1026     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
1027   }
1028 
1029   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
1030 
1031   return I.get();
1032 }
1033 
1034 TargetTransformInfo
1035 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
1036   return TargetTransformInfo(GCNTTIImpl(this, F));
1037 }
1038 
1039 Error GCNTargetMachine::buildCodeGenPipeline(
1040     ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
1041     CodeGenFileType FileType, const CGPassBuilderOption &Opts,
1042     PassInstrumentationCallbacks *PIC) {
1043   AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
1044   return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
1045 }
1046 
1047 //===----------------------------------------------------------------------===//
1048 // AMDGPU Legacy Pass Setup
1049 //===----------------------------------------------------------------------===//
1050 
1051 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
1052   return getStandardCSEConfigForOpt(TM->getOptLevel());
1053 }
1054 
1055 namespace {
1056 
1057 class GCNPassConfig final : public AMDGPUPassConfig {
1058 public:
1059   GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
1060       : AMDGPUPassConfig(TM, PM) {
1061     // It is necessary to know the register usage of the entire call graph.  We
1062     // allow calls without EnableAMDGPUFunctionCalls if they are marked
1063     // noinline, so this is always required.
1064     setRequiresCodeGenSCCOrder(true);
1065     substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
1066   }
1067 
1068   GCNTargetMachine &getGCNTargetMachine() const {
1069     return getTM<GCNTargetMachine>();
1070   }
1071 
1072   ScheduleDAGInstrs *
1073   createMachineScheduler(MachineSchedContext *C) const override;
1074 
1075   ScheduleDAGInstrs *
1076   createPostMachineScheduler(MachineSchedContext *C) const override {
1077     ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
1078         C, std::make_unique<PostGenericScheduler>(C),
1079         /*RemoveKillFlags=*/true);
1080     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1081     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1082     if (ST.shouldClusterStores())
1083       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1084     DAG->addMutation(
1085         createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
1086     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
1087       DAG->addMutation(createVOPDPairingMutation());
1088     return DAG;
1089   }
1090 
1091   bool addPreISel() override;
1092   void addMachineSSAOptimization() override;
1093   bool addILPOpts() override;
1094   bool addInstSelector() override;
1095   bool addIRTranslator() override;
1096   void addPreLegalizeMachineIR() override;
1097   bool addLegalizeMachineIR() override;
1098   void addPreRegBankSelect() override;
1099   bool addRegBankSelect() override;
1100   void addPreGlobalInstructionSelect() override;
1101   bool addGlobalInstructionSelect() override;
1102   void addFastRegAlloc() override;
1103   void addOptimizedRegAlloc() override;
1104 
1105   FunctionPass *createSGPRAllocPass(bool Optimized);
1106   FunctionPass *createVGPRAllocPass(bool Optimized);
1107   FunctionPass *createWWMRegAllocPass(bool Optimized);
1108   FunctionPass *createRegAllocPass(bool Optimized) override;
1109 
1110   bool addRegAssignAndRewriteFast() override;
1111   bool addRegAssignAndRewriteOptimized() override;
1112 
1113   bool addPreRewrite() override;
1114   void addPostRegAlloc() override;
1115   void addPreSched2() override;
1116   void addPreEmitPass() override;
1117 };
1118 
1119 } // end anonymous namespace
1120 
1121 AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
1122     : TargetPassConfig(TM, PM) {
1123   // Exceptions and StackMaps are not supported, so these passes will never do
1124   // anything.
1125   disablePass(&StackMapLivenessID);
1126   disablePass(&FuncletLayoutID);
1127   // Garbage collection is not supported.
1128   disablePass(&GCLoweringID);
1129   disablePass(&ShadowStackGCLoweringID);
1130 }
1131 
1132 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
1133   if (getOptLevel() == CodeGenOptLevel::Aggressive)
1134     addPass(createGVNPass());
1135   else
1136     addPass(createEarlyCSEPass());
1137 }
1138 
1139 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
1140   if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
1141     addPass(createLoopDataPrefetchPass());
1142   addPass(createSeparateConstOffsetFromGEPPass());
1143   // ReassociateGEPs exposes more opportunities for SLSR. See
1144   // the example in reassociate-geps-and-slsr.ll.
1145   addPass(createStraightLineStrengthReducePass());
1146   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
1147   // EarlyCSE can reuse.
1148   addEarlyCSEOrGVNPass();
1149   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1150   addPass(createNaryReassociatePass());
1151   // NaryReassociate on GEPs creates redundant common expressions, so run
1152   // EarlyCSE after it.
1153   addPass(createEarlyCSEPass());
1154 }
1155 
1156 void AMDGPUPassConfig::addIRPasses() {
1157   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1158 
1159   Triple::ArchType Arch = TM.getTargetTriple().getArch();
1160   if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
1161     addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1162 
1163   // There is no reason to run these.
1164   disablePass(&StackMapLivenessID);
1165   disablePass(&FuncletLayoutID);
1166   disablePass(&PatchableFunctionID);
1167 
1168   addPass(createAMDGPUPrintfRuntimeBinding());
1169   if (LowerCtorDtor)
1170     addPass(createAMDGPUCtorDtorLoweringLegacyPass());
1171 
1172   if (isPassEnabled(EnableImageIntrinsicOptimizer))
1173     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
1174 
1175   // This can be disabled by passing ::Disable here or on the command line
1176   // with --expand-variadics-override=disable.
1177   addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
1178 
1179   // Function calls are not supported, so make sure we inline everything.
1180   addPass(createAMDGPUAlwaysInlinePass());
1181   addPass(createAlwaysInlinerLegacyPass());
1182 
1183   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1184   if (Arch == Triple::r600)
1185     addPass(createR600OpenCLImageTypeLoweringPass());
1186 
1187   // Replace OpenCL enqueued block function pointers with global variables.
1188   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass());
1189 
1190   // Lower LDS accesses to global memory if the address sanitizer is enabled.
1191   if (EnableSwLowerLDS)
1192     addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));
1193 
1194   // Runs before PromoteAlloca so the latter can account for function uses
1195   if (EnableLowerModuleLDS) {
1196     addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
1197   }
1198 
1199   if (TM.getOptLevel() > CodeGenOptLevel::None)
1200     addPass(createInferAddressSpacesPass());
1201 
1202   // Run atomic optimizer before Atomic Expand
1203   if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
1204       (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1205       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1206     addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
1207   }
1208 
1209   addPass(createAtomicExpandLegacyPass());
1210 
1211   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1212     addPass(createAMDGPUPromoteAlloca());
1213 
1214     if (isPassEnabled(EnableScalarIRPasses))
1215       addStraightLineScalarOptimizationPasses();
1216 
1217     if (EnableAMDGPUAliasAnalysis) {
1218       addPass(createAMDGPUAAWrapperPass());
1219       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1220                                              AAResults &AAR) {
1221         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1222           AAR.addAAResult(WrapperPass->getResult());
1223         }));
1224     }
1225 
1226     if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1227       // TODO: May want to move later or split into an early and late one.
1228       addPass(createAMDGPUCodeGenPreparePass());
1229     }
1230 
1231     // Try to hoist loop-invariant parts of divisions that AMDGPUCodeGenPrepare
1232     // may have expanded.
1233     if (TM.getOptLevel() > CodeGenOptLevel::Less)
1234       addPass(createLICMPass());
1235   }
1236 
1237   TargetPassConfig::addIRPasses();
1238 
1239   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1240   // example, GVN can combine
1241   //
1242   //   %0 = add %a, %b
1243   //   %1 = add %b, %a
1244   //
1245   // and
1246   //
1247   //   %0 = shl nsw %a, 2
1248   //   %1 = shl %a, 2
1249   //
1250   // but EarlyCSE can do neither of them.
1251   if (isPassEnabled(EnableScalarIRPasses))
1252     addEarlyCSEOrGVNPass();
1253 }
1254 
1255 void AMDGPUPassConfig::addCodeGenPrepare() {
1256   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1257     // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1258     // analysis, and should be removed.
1259     addPass(createAMDGPUAnnotateKernelFeaturesPass());
1260   }
1261 
1262   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1263       EnableLowerKernelArguments)
1264     addPass(createAMDGPULowerKernelArgumentsPass());
1265 
1266   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1267     // This lowering has been placed after codegenprepare to take advantage of
1268     // address mode matching (which is why it isn't put with the LDS lowerings).
1269     // It could be placed anywhere before uniformity annotations (an analysis
1270     // that it changes by splitting up fat pointers into their components)
1271     // but has been put before switch lowering and CFG flattening so that those
1272     // passes can run on the more optimized control flow this pass creates in
1273     // many cases.
1274     //
1275     // FIXME: This should ideally be put after the LoadStoreVectorizer.
1276     // However, due to some annoying facts about ResourceUsageAnalysis,
1277     // (especially as exercised in the resource-usage-dead-function test),
1278     // we need all the function passes from codegenprepare all the way through
1279     // said resource usage analysis to run on the call graph produced
1280     // before codegenprepare runs (because codegenprepare will knock some
1281     // nodes out of the graph, which leads to function-level passes not
1282     // being run on them, which causes crashes in the resource usage analysis).
1283     addPass(createAMDGPULowerBufferFatPointersPass());
1284     // In accordance with the above FIXME, manually force all the
1285     // function-level passes into a CGSCCPassManager.
1286     addPass(new DummyCGSCCPass());
1287   }
1288 
1289   TargetPassConfig::addCodeGenPrepare();
1290 
1291   if (isPassEnabled(EnableLoadStoreVectorizer))
1292     addPass(createLoadStoreVectorizerPass());
1293 
1294   // The LowerSwitch pass may introduce unreachable blocks that can cause
1295   // unexpected behavior in subsequent passes. Placing it here means these
1296   // blocks get cleaned up by UnreachableBlockElim, which is inserted next in
1297   // the pass flow.
1298   addPass(createLowerSwitchPass());
1299 }
1300 
1301 bool AMDGPUPassConfig::addPreISel() {
1302   if (TM->getOptLevel() > CodeGenOptLevel::None)
1303     addPass(createFlattenCFGPass());
1304   return false;
1305 }
1306 
1307 bool AMDGPUPassConfig::addInstSelector() {
1308   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1309   return false;
1310 }
1311 
1312 bool AMDGPUPassConfig::addGCPasses() {
1313   // Do nothing. GC is not supported.
1314   return false;
1315 }
1316 
1317 llvm::ScheduleDAGInstrs *
1318 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1319   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1320   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1321   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1322   if (ST.shouldClusterStores())
1323     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1324   return DAG;
1325 }
1326 
1327 //===----------------------------------------------------------------------===//
1328 // GCN Legacy Pass Setup
1329 //===----------------------------------------------------------------------===//
1330 
1331 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1332   MachineSchedContext *C) const {
1333   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1334   if (ST.enableSIScheduler())
1335     return createSIMachineScheduler(C);
1336 
1337   Attribute SchedStrategyAttr =
1338       C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
1339   StringRef SchedStrategy = SchedStrategyAttr.isValid()
1340                                 ? SchedStrategyAttr.getValueAsString()
1341                                 : AMDGPUSchedStrategy;
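  // Illustrative only: the same strategy names can also be pinned per function
  // through the "amdgpu-sched-strategy" IR attribute, e.g.
  //   attributes #0 = { "amdgpu-sched-strategy"="max-memory-clause" }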
1342 
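       // Illustrative only (hypothetical IR, not taken from an actual test): a
       // function can select a strategy for itself with an attribute such as
       //   attributes #0 = { "amdgpu-sched-strategy"="max-ilp" }
       // which, when present, takes precedence over the global
       // AMDGPUSchedStrategy option; unrecognized or absent values fall back to
       // the max-occupancy scheduler below.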
1343   if (SchedStrategy == "max-ilp")
1344     return createGCNMaxILPMachineScheduler(C);
1345 
1346   if (SchedStrategy == "max-memory-clause")
1347     return createGCNMaxMemoryClauseMachineScheduler(C);
1348 
1349   return createGCNMaxOccupancyMachineScheduler(C);
1350 }
1351 
1352 bool GCNPassConfig::addPreISel() {
1353   AMDGPUPassConfig::addPreISel();
1354 
1355   if (TM->getOptLevel() > CodeGenOptLevel::None)
1356     addPass(createSinkingPass());
1357 
1358   if (TM->getOptLevel() > CodeGenOptLevel::None)
1359     addPass(createAMDGPULateCodeGenPrepareLegacyPass());
1360 
1361   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1362   // regions formed by them.
1363   addPass(&AMDGPUUnifyDivergentExitNodesID);
1364   addPass(createFixIrreduciblePass());
1365   addPass(createUnifyLoopExitsPass());
1366   addPass(createStructurizeCFGPass(/*SkipUniformRegions=*/false));
1367 
1368   addPass(createAMDGPUAnnotateUniformValuesLegacy());
1369   addPass(createSIAnnotateControlFlowLegacyPass());
1370   // TODO: Move this right after structurizeCFG to avoid extra divergence
1371   // analysis. This depends on stopping SIAnnotateControlFlow from making
1372   // control flow modifications.
1373   addPass(createAMDGPURewriteUndefForPHILegacyPass());
1374 
1375   addPass(createLCSSAPass());
1376 
1377   if (TM->getOptLevel() > CodeGenOptLevel::Less)
1378     addPass(&AMDGPUPerfHintAnalysisLegacyID);
1379 
1380   return false;
1381 }
1382 
1383 void GCNPassConfig::addMachineSSAOptimization() {
1384   TargetPassConfig::addMachineSSAOptimization();
1385 
1386   // We want to fold operands after PeepholeOptimizer has run (or as part of
1387   // it), because it will eliminate extra copies making it easier to fold the
1388   // real source operand. We want to eliminate dead instructions after, so that
1389   // we see fewer uses of the copies. We then need to clean up the dead
1390   // instructions leftover after the operands are folded as well.
1391   //
1392   // XXX - Can we get away without running DeadMachineInstructionElim again?
1393   addPass(&SIFoldOperandsLegacyID);
1394   if (EnableDPPCombine)
1395     addPass(&GCNDPPCombineLegacyID);
1396   addPass(&SILoadStoreOptimizerLegacyID);
1397   if (isPassEnabled(EnableSDWAPeephole)) {
1398     addPass(&SIPeepholeSDWALegacyID);
1399     addPass(&EarlyMachineLICMID);
1400     addPass(&MachineCSELegacyID);
1401     addPass(&SIFoldOperandsLegacyID);
1402   }
1403   addPass(&DeadMachineInstructionElimID);
1404   addPass(createSIShrinkInstructionsLegacyPass());
1405 }
1406 
1407 bool GCNPassConfig::addILPOpts() {
1408   if (EnableEarlyIfConversion)
1409     addPass(&EarlyIfConverterLegacyID);
1410 
1411   TargetPassConfig::addILPOpts();
1412   return false;
1413 }
1414 
1415 bool GCNPassConfig::addInstSelector() {
1416   AMDGPUPassConfig::addInstSelector();
1417   addPass(&SIFixSGPRCopiesLegacyID);
1418   addPass(createSILowerI1CopiesLegacyPass());
1419   return false;
1420 }
1421 
1422 bool GCNPassConfig::addIRTranslator() {
1423   addPass(new IRTranslator(getOptLevel()));
1424   return false;
1425 }
1426 
1427 void GCNPassConfig::addPreLegalizeMachineIR() {
1428   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1429   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1430   addPass(new Localizer());
1431 }
1432 
1433 bool GCNPassConfig::addLegalizeMachineIR() {
1434   addPass(new Legalizer());
1435   return false;
1436 }
1437 
1438 void GCNPassConfig::addPreRegBankSelect() {
1439   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1440   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1441   addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
1442 }
1443 
1444 bool GCNPassConfig::addRegBankSelect() {
1445   if (NewRegBankSelect) {
1446     addPass(createAMDGPURegBankSelectPass());
1447     addPass(createAMDGPURegBankLegalizePass());
1448   } else {
1449     addPass(new RegBankSelect());
1450   }
1451   return false;
1452 }
1453 
1454 void GCNPassConfig::addPreGlobalInstructionSelect() {
1455   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1456   addPass(createAMDGPURegBankCombiner(IsOptNone));
1457 }
1458 
1459 bool GCNPassConfig::addGlobalInstructionSelect() {
1460   addPass(new InstructionSelect(getOptLevel()));
1461   return false;
1462 }
1463 
1464 void GCNPassConfig::addFastRegAlloc() {
1465   // FIXME: We have to disable the verifier here because of PHIElimination +
1466   // TwoAddressInstructions disabling it.
1467 
1468   // This must be run immediately after phi elimination and before
1469   // TwoAddressInstructions, otherwise the processing of the tied operand of
1470   // SI_ELSE will introduce a copy of the tied operand source after the else.
1471   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
1472 
1473   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1474 
1475   TargetPassConfig::addFastRegAlloc();
1476 }
1477 
1478 void GCNPassConfig::addOptimizedRegAlloc() {
1479   if (EnableDCEInRA)
1480     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1481 
1482   // FIXME: When an instruction has a killed operand and the instruction is
1483   // inside a bundle, it seems that only the BUNDLE instruction appears as the
1484   // kill of the register in LiveVariables. This triggers a verifier failure,
1485   // so we should fix it and enable the verifier.
1486   if (OptVGPRLiveRange)
1487     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);
1488 
1489   // This must be run immediately after phi elimination and before
1490   // TwoAddressInstructions, otherwise the processing of the tied operand of
1491   // SI_ELSE will introduce a copy of the tied operand source after the else.
1492   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
1493 
1494   if (EnableRewritePartialRegUses)
1495     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
1496 
1497   if (isPassEnabled(EnablePreRAOptimizations))
1498     insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
1499 
1500   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1501   // instructions that cause scheduling barriers.
1502   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1503 
1504   if (OptExecMaskPreRA)
1505     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1506 
1507   // This is not an essential optimization and it has a noticeable impact on
1508   // compilation time, so we only enable it at O2 and above.
1509   if (TM->getOptLevel() > CodeGenOptLevel::Less)
1510     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1511 
1512   TargetPassConfig::addOptimizedRegAlloc();
1513 }
1514 
1515 bool GCNPassConfig::addPreRewrite() {
1516   if (EnableRegReassign)
1517     addPass(&GCNNSAReassignID);
1518   return true;
1519 }
1520 
1521 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1522   // Initialize the global default.
1523   llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1524                   initializeDefaultSGPRRegisterAllocatorOnce);
1525 
1526   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1527   if (Ctor != useDefaultRegisterAllocator)
1528     return Ctor();
1529 
1530   if (Optimized)
1531     return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1532 
1533   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1534 }
1535 
1536 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1537   // Initialize the global default.
1538   llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1539                   initializeDefaultVGPRRegisterAllocatorOnce);
1540 
1541   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1542   if (Ctor != useDefaultRegisterAllocator)
1543     return Ctor();
1544 
1545   if (Optimized)
1546     return createGreedyVGPRRegisterAllocator();
1547 
1548   return createFastVGPRRegisterAllocator();
1549 }
1550 
1551 FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
1552   // Initialize the global default.
1553   llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
1554                   initializeDefaultWWMRegisterAllocatorOnce);
1555 
1556   RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
1557   if (Ctor != useDefaultRegisterAllocator)
1558     return Ctor();
1559 
1560   if (Optimized)
1561     return createGreedyWWMRegisterAllocator();
1562 
1563   return createFastWWMRegisterAllocator();
1564 }
1565 
1566 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1567   llvm_unreachable("should not be used");
1568 }
1569 
1570 static const char RegAllocOptNotSupportedMessage[] =
1571     "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1572     "and -vgpr-regalloc";
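     // Illustrative only (hypothetical invocation): the per-class flags named
     // above replace -regalloc, e.g.
     //   llc -mtriple=amdgcn -sgpr-regalloc=greedy -wwm-regalloc=fast -vgpr-regalloc=greedy
     // assuming allocators registered under those names exist for each class.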
1573 
1574 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1575   if (!usingDefaultRegAlloc())
1576     report_fatal_error(RegAllocOptNotSupportedMessage);
1577 
1578   addPass(&GCNPreRALongBranchRegID);
1579 
1580   addPass(createSGPRAllocPass(false));
1581 
1582   // Equivalent of PEI for SGPRs.
1583   addPass(&SILowerSGPRSpillsLegacyID);
1584 
1585   // Allocate WWM registers used in whole quad mode operations (for shaders).
1586   addPass(&SIPreAllocateWWMRegsLegacyID);
1587 
1588   // For allocating other wwm register operands.
1589   addPass(createWWMRegAllocPass(false));
1590 
1591   addPass(&SILowerWWMCopiesLegacyID);
1592   addPass(&AMDGPUReserveWWMRegsID);
1593 
1594   // For allocating per-thread VGPRs.
1595   addPass(createVGPRAllocPass(false));
1596 
1597   return true;
1598 }
1599 
1600 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1601   if (!usingDefaultRegAlloc())
1602     report_fatal_error(RegAllocOptNotSupportedMessage);
1603 
1604   addPass(&GCNPreRALongBranchRegID);
1605 
1606   addPass(createSGPRAllocPass(true));
1607 
1608   // Commit allocated register changes. This is mostly necessary because too
1609   // many things rely on the use lists of the physical registers, such as the
1610   // verifier. This is only necessary with allocators which use LiveIntervals,
1611   // since FastRegAlloc does the replacements itself.
1612   addPass(createVirtRegRewriter(false));
1613 
1614   // At this point SGPR register allocation has been done, so run stack slot
1615   // coloring to try to optimize the SGPR spill stack indices before
1616   // attempting the custom SGPR spill lowering.
1617   addPass(&StackSlotColoringID);
1618 
1619   // Equivalent of PEI for SGPRs.
1620   addPass(&SILowerSGPRSpillsLegacyID);
1621 
1622   // Allocate WWM registers used in whole quad mode operations (for shaders).
1623   addPass(&SIPreAllocateWWMRegsLegacyID);
1624 
1625   // For allocating other whole wave mode registers.
1626   addPass(createWWMRegAllocPass(true));
1627   addPass(&SILowerWWMCopiesLegacyID);
1628   addPass(createVirtRegRewriter(false));
1629   addPass(&AMDGPUReserveWWMRegsID);
1630 
1631   // For allocating per-thread VGPRs.
1632   addPass(createVGPRAllocPass(true));
1633 
1634   addPreRewrite();
1635   addPass(&VirtRegRewriterID);
1636 
1637   addPass(&AMDGPUMarkLastScratchLoadID);
1638 
1639   return true;
1640 }
1641 
1642 void GCNPassConfig::addPostRegAlloc() {
1643   addPass(&SIFixVGPRCopiesID);
1644   if (getOptLevel() > CodeGenOptLevel::None)
1645     addPass(&SIOptimizeExecMaskingLegacyID);
1646   TargetPassConfig::addPostRegAlloc();
1647 }
1648 
1649 void GCNPassConfig::addPreSched2() {
1650   if (TM->getOptLevel() > CodeGenOptLevel::None)
1651     addPass(createSIShrinkInstructionsLegacyPass());
1652   addPass(&SIPostRABundlerID);
1653 }
1654 
1655 void GCNPassConfig::addPreEmitPass() {
1656   if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
1657     addPass(&GCNCreateVOPDID);
1658   addPass(createSIMemoryLegalizerPass());
1659   addPass(createSIInsertWaitcntsPass());
1660 
1661   addPass(createSIModeRegisterPass());
1662 
1663   if (getOptLevel() > CodeGenOptLevel::None)
1664     addPass(&SIInsertHardClausesID);
1665 
1666   addPass(&SILateBranchLoweringPassID);
1667   if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
1668     addPass(createAMDGPUSetWavePriorityPass());
1669   if (getOptLevel() > CodeGenOptLevel::None)
1670     addPass(&SIPreEmitPeepholeID);
1671   // The hazard recognizer that runs as part of the post-RA scheduler is not
1672   // guaranteed to handle all hazards correctly. This is because if there
1673   // are multiple scheduling regions in a basic block, the regions are scheduled
1674   // bottom up, so when we begin to schedule a region we don't know what
1675   // instructions were emitted directly before it.
1676   //
1677   // Here we add a stand-alone hazard recognizer pass which can handle all
1678   // cases.
1679   addPass(&PostRAHazardRecognizerID);
1680 
1681   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
1682     addPass(&AMDGPUInsertDelayAluID);
1683 
1684   addPass(&BranchRelaxationPassID);
1685   addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
1686 }
1687 
1688 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1689   return new GCNPassConfig(*this, PM);
1690 }
1691 
1692 void GCNTargetMachine::registerMachineRegisterInfoCallback(
1693     MachineFunction &MF) const {
1694   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1695   MF.getRegInfo().addDelegate(MFI);
1696 }
1697 
1698 MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1699     BumpPtrAllocator &Allocator, const Function &F,
1700     const TargetSubtargetInfo *STI) const {
1701   return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1702       Allocator, F, static_cast<const GCNSubtarget *>(STI));
1703 }
1704 
1705 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1706   return new yaml::SIMachineFunctionInfo();
1707 }
1708 
1709 yaml::MachineFunctionInfo *
1710 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1711   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1712   return new yaml::SIMachineFunctionInfo(
1713       *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1714 }
1715 
1716 bool GCNTargetMachine::parseMachineFunctionInfo(
1717     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1718     SMDiagnostic &Error, SMRange &SourceRange) const {
1719   const yaml::SIMachineFunctionInfo &YamlMFI =
1720       static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1721   MachineFunction &MF = PFS.MF;
1722   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1723   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1724 
1725   if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1726     return true;
1727 
1728   if (MFI->Occupancy == 0) {
1729     // Fix up the subtarget-dependent default value.
1730     MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
1731   }
1732 
1733   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1734     Register TempReg;
1735     if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1736       SourceRange = RegName.SourceRange;
1737       return true;
1738     }
1739     RegVal = TempReg;
1740 
1741     return false;
1742   };
1743 
1744   auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1745                                    Register &RegVal) {
1746     return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1747   };
1748 
1749   if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1750     return true;
1751 
1752   if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
1753     return true;
1754 
1755   if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
1756                             MFI->LongBranchReservedReg))
1757     return true;
1758 
1759   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1760     // Create a diagnostic for the register string literal.
1761     const MemoryBuffer &Buffer =
1762         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1763     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1764                          RegName.Value.size(), SourceMgr::DK_Error,
1765                          "incorrect register class for field", RegName.Value,
1766                          {}, {});
1767     SourceRange = RegName.SourceRange;
1768     return true;
1769   };
1770 
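       // Illustrative MIR YAML for the required fields parsed below
       // (hypothetical example; key spellings follow yaml::SIMachineFunctionInfo):
       //   machineFunctionInfo:
       //     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
       //     frameOffsetReg:    '$sgpr33'
       //     stackPtrOffsetReg: '$sgpr32'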
1771   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1772       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1773       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1774     return true;
1775 
1776   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1777       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1778     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1779   }
1780 
1781   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1782       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1783     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1784   }
1785 
1786   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1787       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1788     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1789   }
1790 
1791   for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1792     Register ParsedReg;
1793     if (parseRegister(YamlReg, ParsedReg))
1794       return true;
1795 
1796     MFI->reserveWWMRegister(ParsedReg);
1797   }
1798 
1799   for (const auto &[_, Info] : PFS.VRegInfosNamed) {
1800     MFI->setFlag(Info->VReg, Info->Flags);
1801   }
1802   for (const auto &[_, Info] : PFS.VRegInfos) {
1803     MFI->setFlag(Info->VReg, Info->Flags);
1804   }
1805 
1806   for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
1807     Register ParsedReg;
1808     if (parseRegister(YamlRegStr, ParsedReg))
1809       return true;
1810     MFI->SpillPhysVGPRs.push_back(ParsedReg);
1811   }
1812 
1813   auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1814                                    const TargetRegisterClass &RC,
1815                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1816                                    unsigned SystemSGPRs) {
1817     // Skip parsing if it's not present.
1818     if (!A)
1819       return false;
1820 
1821     if (A->IsRegister) {
1822       Register Reg;
1823       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1824         SourceRange = A->RegisterName.SourceRange;
1825         return true;
1826       }
1827       if (!RC.contains(Reg))
1828         return diagnoseRegisterClass(A->RegisterName);
1829       Arg = ArgDescriptor::createRegister(Reg);
1830     } else
1831       Arg = ArgDescriptor::createStack(A->StackOffset);
1832     // Check and apply the optional mask.
1833     if (A->Mask)
1834       Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1835 
1836     MFI->NumUserSGPRs += UserSGPRs;
1837     MFI->NumSystemSGPRs += SystemSGPRs;
1838     return false;
1839   };
1840 
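       // Illustrative MIR YAML for the argument descriptors checked below
       // (hypothetical example; key spellings follow yaml::SIArgumentInfo):
       //   argumentInfo:
       //     privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
       //     kernargSegmentPtr:    { reg: '$sgpr4_sgpr5' }
       //     workGroupIDX:         { reg: '$sgpr6' }
       //     workItemIDX:          { reg: '$vgpr0' }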
1841   if (YamlMFI.ArgInfo &&
1842       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1843                              AMDGPU::SGPR_128RegClass,
1844                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1845        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1846                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1847                              2, 0) ||
1848        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1849                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1850        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1851                              AMDGPU::SReg_64RegClass,
1852                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1853        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1854                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1855                              2, 0) ||
1856        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1857                              AMDGPU::SReg_64RegClass,
1858                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1859        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1860                              AMDGPU::SGPR_32RegClass,
1861                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1862        parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1863                              AMDGPU::SGPR_32RegClass,
1864                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
1865        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1866                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1867                              0, 1) ||
1868        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1869                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1870                              0, 1) ||
1871        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1872                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1873                              0, 1) ||
1874        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1875                              AMDGPU::SGPR_32RegClass,
1876                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1877        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1878                              AMDGPU::SGPR_32RegClass,
1879                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1880        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1881                              AMDGPU::SReg_64RegClass,
1882                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1883        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1884                              AMDGPU::SReg_64RegClass,
1885                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1886        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1887                              AMDGPU::VGPR_32RegClass,
1888                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1889        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1890                              AMDGPU::VGPR_32RegClass,
1891                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1892        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1893                              AMDGPU::VGPR_32RegClass,
1894                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1895     return true;
1896 
1897   if (ST.hasIEEEMode())
1898     MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1899   if (ST.hasDX10ClampMode())
1900     MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1901 
1902   // FIXME: Move proper support for denormal-fp-math into base MachineFunction
1903   MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1904                                       ? DenormalMode::IEEE
1905                                       : DenormalMode::PreserveSign;
1906   MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1907                                        ? DenormalMode::IEEE
1908                                        : DenormalMode::PreserveSign;
1909 
1910   MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1911                                           ? DenormalMode::IEEE
1912                                           : DenormalMode::PreserveSign;
1913   MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1914                                            ? DenormalMode::IEEE
1915                                            : DenormalMode::PreserveSign;
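       // Illustrative YAML for the mode flags consumed above (hypothetical
       // example; a true value maps to DenormalMode::IEEE and false to
       // DenormalMode::PreserveSign):
       //   mode:
       //     ieee: true
       //     fp32-input-denormals: false
       //     fp64-fp16-output-denormals: true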
1916 
1917   if (YamlMFI.HasInitWholeWave)
1918     MFI->setInitWholeWave();
1919 
1920   return false;
1921 }
1922 
1923 //===----------------------------------------------------------------------===//
1924 // AMDGPU CodeGen Pass Builder interface.
1925 //===----------------------------------------------------------------------===//
1926 
1927 AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
1928     GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
1929     PassInstrumentationCallbacks *PIC)
1930     : CodeGenPassBuilder(TM, Opts, PIC) {
1931   Opt.RequiresCodeGenSCCOrder = true;
1932   // Exceptions and StackMaps are not supported, so these passes will never do
1933   // anything.
1934   // Garbage collection is not supported.
1935   disablePass<StackMapLivenessPass, FuncletLayoutPass,
1936               ShadowStackGCLoweringPass>();
1937 }
1938 
1939 void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
1940   if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
1941     addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));
1942 
1943   addPass(AMDGPUPrintfRuntimeBindingPass());
1944   if (LowerCtorDtor)
1945     addPass(AMDGPUCtorDtorLoweringPass());
1946 
1947   if (isPassEnabled(EnableImageIntrinsicOptimizer))
1948     addPass(AMDGPUImageIntrinsicOptimizerPass(TM));
1949 
1950   // This can be disabled by passing ::Disable here or on the command line
1951   // with --expand-variadics-override=disable.
1952   addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
1953 
1954   addPass(AMDGPUAlwaysInlinePass());
1955   addPass(AlwaysInlinerPass());
1956 
1957   addPass(AMDGPUOpenCLEnqueuedBlockLoweringPass());
1958 
1959   if (EnableSwLowerLDS)
1960     addPass(AMDGPUSwLowerLDSPass(TM));
1961 
1962   // Runs before PromoteAlloca so the latter can account for function uses
1963   if (EnableLowerModuleLDS)
1964     addPass(AMDGPULowerModuleLDSPass(TM));
1965 
1966   if (TM.getOptLevel() > CodeGenOptLevel::None)
1967     addPass(InferAddressSpacesPass());
1968 
1969   // Run atomic optimizer before Atomic Expand
1970   if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
1971       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
1972     addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
1973 
1974   addPass(AtomicExpandPass(&TM));
1975 
1976   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1977     addPass(AMDGPUPromoteAllocaPass(TM));
1978     if (isPassEnabled(EnableScalarIRPasses))
1979       addStraightLineScalarOptimizationPasses(addPass);
1980 
1981     // TODO: Handle EnableAMDGPUAliasAnalysis
1982 
1983     // TODO: May want to move later or split into an early and late one.
1984     addPass(AMDGPUCodeGenPreparePass(TM));
1985 
1986     // TODO: LICM
1987   }
1988 
1989   Base::addIRPasses(addPass);
1990 
1991   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1992   // example, GVN can combine
1993   //
1994   //   %0 = add %a, %b
1995   //   %1 = add %b, %a
1996   //
1997   // and
1998   //
1999   //   %0 = shl nsw %a, 2
2000   //   %1 = shl %a, 2
2001   //
2002   // but EarlyCSE can do neither of them.
2003   if (isPassEnabled(EnableScalarIRPasses))
2004     addEarlyCSEOrGVNPass(addPass);
2005 }
2006 
2007 void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
2008   // AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be
2009   // deleted soon.
2010 
2011   if (EnableLowerKernelArguments)
2012     addPass(AMDGPULowerKernelArgumentsPass(TM));
2013 
2014   // This lowering has been placed after codegenprepare to take advantage of
2015   // address mode matching (which is why it isn't put with the LDS lowerings).
2016   // It could be placed anywhere before uniformity annotations (an analysis
2017   // that it changes by splitting up fat pointers into their components)
2018   // but has been put before switch lowering and CFG flattening so that those
2019   // passes can run on the more optimized control flow this pass creates in
2020   // many cases.
2021   //
2022   // FIXME: This should ideally be put after the LoadStoreVectorizer.
2023   // However, due to some annoying facts about ResourceUsageAnalysis,
2024   // (especially as exercised in the resource-usage-dead-function test),
2025   // we need all the function passes, from codegenprepare all the way
2026   // through said resource usage analysis, to run on the call graph produced
2027   // before codegenprepare runs (because codegenprepare will knock some
2028   // nodes out of the graph, which leads to function-level passes not
2029   // being run on them, which causes crashes in the resource usage analysis).
2030   addPass(AMDGPULowerBufferFatPointersPass(TM));
2031 
2032   Base::addCodeGenPrepare(addPass);
2033 
2034   if (isPassEnabled(EnableLoadStoreVectorizer))
2035     addPass(LoadStoreVectorizerPass());
2036 
2037   // The LowerSwitch pass may introduce unreachable blocks that can cause
2038   // unexpected behavior for subsequent passes. Placing it here means these
2039   // blocks get cleaned up by UnreachableBlockElim, which is inserted next in
2040   // the pass flow.
2041   addPass(LowerSwitchPass());
2042 }
2043 
2044 void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
2045 
2046   if (TM.getOptLevel() > CodeGenOptLevel::None)
2047     addPass(FlattenCFGPass());
2048 
2049   if (TM.getOptLevel() > CodeGenOptLevel::None)
2050     addPass(SinkingPass());
2051 
2052   addPass(AMDGPULateCodeGenPreparePass(TM));
2053 
2054   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
2055   // regions formed by them.
2056 
2057   addPass(AMDGPUUnifyDivergentExitNodesPass());
2058   addPass(FixIrreduciblePass());
2059   addPass(UnifyLoopExitsPass());
2060   addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));
2061 
2062   addPass(AMDGPUAnnotateUniformValuesPass());
2063 
2064   addPass(SIAnnotateControlFlowPass(TM));
2065 
2066   // TODO: Move this right after structurizeCFG to avoid extra divergence
2067   // analysis. This depends on stopping SIAnnotateControlFlow from making
2068   // control flow modifications.
2069   addPass(AMDGPURewriteUndefForPHIPass());
2070 
2071   addPass(LCSSAPass());
2072 
2073   if (TM.getOptLevel() > CodeGenOptLevel::Less)
2074     addPass(AMDGPUPerfHintAnalysisPass(TM));
2075 
2076   // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
2077   // isn't this in addInstSelector?
2078   addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
2079 }
2080 
2081 void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
2082   if (EnableEarlyIfConversion)
2083     addPass(EarlyIfConverterPass());
2084 
2085   Base::addILPOpts(addPass);
2086 }
2087 
2088 void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
2089                                              CreateMCStreamer) const {
2090   // TODO: Add AsmPrinter.
2091 }
2092 
2093 Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
2094   addPass(AMDGPUISelDAGToDAGPass(TM));
2095   addPass(SIFixSGPRCopiesPass());
2096   addPass(SILowerI1CopiesPass());
2097   return Error::success();
2098 }
2099 
2100 void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
2101     AddMachinePass &addPass) const {
2102   Base::addMachineSSAOptimization(addPass);
2103 
2104   addPass(SIFoldOperandsPass());
2105   if (EnableDPPCombine) {
2106     addPass(GCNDPPCombinePass());
2107   }
2108   addPass(SILoadStoreOptimizerPass());
2109   if (isPassEnabled(EnableSDWAPeephole)) {
2110     addPass(SIPeepholeSDWAPass());
2111     addPass(EarlyMachineLICMPass());
2112     addPass(MachineCSEPass());
2113     addPass(SIFoldOperandsPass());
2114   }
2115   addPass(DeadMachineInstructionElimPass());
2116   addPass(SIShrinkInstructionsPass());
2117 }
2118 
2119 void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
2120   addPass(SIFixVGPRCopiesPass());
2121   if (TM.getOptLevel() > CodeGenOptLevel::None)
2122     addPass(SIOptimizeExecMaskingPass());
2123   Base::addPostRegAlloc(addPass);
2124 }
2125 
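     // A brief restatement of the policy implemented below: a cl::opt that was
     // explicitly passed on the command line always wins; otherwise the option's
     // default is honored only when compiling at or above the requested level
     // (e.g. a pass gated on CodeGenOptLevel::Less stays disabled at -O0 unless
     // its flag is given explicitly).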
2126 bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
2127                                              CodeGenOptLevel Level) const {
2128   if (Opt.getNumOccurrences())
2129     return Opt;
2130   if (TM.getOptLevel() < Level)
2131     return false;
2132   return Opt;
2133 }
2134 
2135 void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
2136   if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
2137     addPass(GVNPass());
2138   else
2139     addPass(EarlyCSEPass());
2140 }
2141 
2142 void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
2143     AddIRPass &addPass) const {
2144   if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
2145     addPass(LoopDataPrefetchPass());
2146 
2147   addPass(SeparateConstOffsetFromGEPPass());
2148 
2149   // ReassociateGEPs exposes more opportunities for SLSR. See
2150   // the example in reassociate-geps-and-slsr.ll.
2151   addPass(StraightLineStrengthReducePass());
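       // Illustrative only (hypothetical IR): SLSR can rewrite a chain of
       // related address computations such as
       //   %p1 = getelementptr inbounds i32, ptr %base, i64 %i
       //   %p2 = getelementptr inbounds i32, ptr %base, i64 %i.plus.1
       // (with %i.plus.1 = add i64 %i, 1) so that %p2 is derived from %p1 plus
       // a constant offset, exposing redundancy for the EarlyCSE/GVN run below.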
2152 
2153   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
2154   // EarlyCSE can reuse.
2155   addEarlyCSEOrGVNPass(addPass);
2156 
2157   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
2158   addPass(NaryReassociatePass());
2159 
2160   // NaryReassociate on GEPs creates redundant common expressions, so run
2161   // EarlyCSE after it.
2162   addPass(EarlyCSEPass());
2163 }
2164