//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUOpenCLEnqueuedBlockLowering.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SILoadStoreOptimizer.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "SIOptimizeExecMasking.h"
#include "SIOptimizeVGPRLiveRange.h"
#include "SIPeepholeSDWA.h"
#include "SIPreAllocateWWMRegs.h"
#include "SIShrinkInstructions.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
"llvm/Transforms/IPO/Internalize.h" 85 #include "llvm/Transforms/Scalar.h" 86 #include "llvm/Transforms/Scalar/EarlyCSE.h" 87 #include "llvm/Transforms/Scalar/FlattenCFG.h" 88 #include "llvm/Transforms/Scalar/GVN.h" 89 #include "llvm/Transforms/Scalar/InferAddressSpaces.h" 90 #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" 91 #include "llvm/Transforms/Scalar/NaryReassociate.h" 92 #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" 93 #include "llvm/Transforms/Scalar/Sink.h" 94 #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" 95 #include "llvm/Transforms/Scalar/StructurizeCFG.h" 96 #include "llvm/Transforms/Utils.h" 97 #include "llvm/Transforms/Utils/FixIrreducible.h" 98 #include "llvm/Transforms/Utils/LCSSA.h" 99 #include "llvm/Transforms/Utils/LowerSwitch.h" 100 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 101 #include "llvm/Transforms/Utils/UnifyLoopExits.h" 102 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" 103 #include <optional> 104 105 using namespace llvm; 106 using namespace llvm::PatternMatch; 107 108 namespace { 109 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { 110 public: 111 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 112 : RegisterRegAllocBase(N, D, C) {} 113 }; 114 115 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { 116 public: 117 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 118 : RegisterRegAllocBase(N, D, C) {} 119 }; 120 121 class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> { 122 public: 123 WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 124 : RegisterRegAllocBase(N, D, C) {} 125 }; 126 127 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, 128 const MachineRegisterInfo &MRI, 129 const Register Reg) { 130 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 131 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC); 132 } 133 134 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, 135 const MachineRegisterInfo &MRI, 136 const Register Reg) { 137 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 138 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC); 139 } 140 141 static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI, 142 const MachineRegisterInfo &MRI, 143 const Register Reg) { 144 const SIMachineFunctionInfo *MFI = 145 MRI.getMF().getInfo<SIMachineFunctionInfo>(); 146 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 147 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) && 148 MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); 149 } 150 151 /// -{sgpr|wwm|vgpr}-regalloc=... command line option. 152 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } 153 154 /// A dummy default pass factory indicates whether the register allocator is 155 /// overridden on the command line. 
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<WWMRegisterRegAlloc>>
    WWMRegAlloc("wwm-regalloc", cl::Hidden,
                cl::init(&useDefaultRegisterAllocator),
                cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = WWMRegAlloc;
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                               "basic register allocator",
                                               createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
    greedyRegAllocWWMReg("greedy", "greedy register allocator",
                         createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
                                              createFastWWMRegisterAllocator);
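
// Illustrative usage of the per-class allocator options defined above, e.g.:
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -wwm-regalloc=basic \
//       -vgpr-regalloc=fast ...
// (The "default" value defers to the -O level; see the create*AllocPass
// helpers in GCNPassConfig below.)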

static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
         Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
}
} // anonymous namespace

static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"),
                            cl::init(false));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"),
                     cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Enable lib calls simplifications.
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization.
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11+ s_delay_alu insertion.
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD.
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                     cl::desc("Enable lowering of LDS to global memory pass "
                              "and ASan instrumentation of the resulting IR."),
                     cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<std::string>
    AMDGPUSchedStrategy("amdgpu-sched-strategy",
                        cl::desc("Select custom AMDGPU scheduling strategy."),
                        cl::Hidden, cl::init(""));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

static cl::opt<bool>
    EnableAMDGPUAttributor("amdgpu-attributor-enable",
                           cl::desc("Enable AMDGPUAttributorPass"),
                           cl::init(true), cl::Hidden);

static cl::opt<bool> NewRegBankSelect(
    "new-reg-bank-select",
    cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
             "regbankselect"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HasClosedWorldAssumption(
    "amdgpu-link-time-closed-world",
    cl::desc("Whether to assume a closed world at link time"),
    cl::init(false), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target.
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeGCNDPPCombineLegacyPass(*PR);
  initializeSILowerI1CopiesLegacyPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPURegBankLegalizePass(*PR);
  initializeSILowerWWMCopiesLegacyPass(*PR);
  initializeAMDGPUMarkLastScratchLoadPass(*PR);
  initializeSILowerSGPRSpillsLegacyPass(*PR);
  initializeSIFixSGPRCopiesLegacyPass(*PR);
  initializeSIFixVGPRCopiesLegacyPass(*PR);
  initializeSIFoldOperandsLegacyPass(*PR);
  initializeSIPeepholeSDWALegacyPass(*PR);
  initializeSIShrinkInstructionsLegacyPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
  initializeSILoadStoreOptimizerLegacyPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUSwLowerLDSLegacyPass(*PR);
  initializeAMDGPUAttributorLegacyPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPUReserveWWMRegsPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowLegacyPass(*PR);
  initializeAMDGPUInsertDelayAluPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowLegacyPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingLegacyPass(*PR);
  initializeSIPreAllocateWWMRegsLegacyPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeGCNCreateVOPDPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
  initializeGCNPreRALongBranchRegPass(*PR);
  initializeGCNRewritePartialRegUsesPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
  initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}
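
// The MachineSchedRegistry entries below make the scheduler factories above
// selectable at run time through the generic machine scheduler machinery,
// e.g. (illustrative): llc -mtriple=amdgcn -misched=gcn-max-ilp ...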

static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);

static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
    "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
    createGCNMaxMemoryClauseMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
         "v32:32-v48:64-v96:"
         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1-ni:7:8:9";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options,
          getEffectiveRelocModel(RM),
          getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
  AMDGPUAttributorOptions Result;
  while (!Params.empty()) {
    StringRef ParamName;
    std::tie(ParamName, Params) = Params.split(';');
    if (ParamName == "closed-world") {
      Result.IsClosedWorld = true;
    } else {
      return make_error<StringError>(
          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
              .str(),
          inconvertibleErrorCode());
    }
  }
  return Result;
}
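
// Illustrative new-PM usage of the parameter parsers above (the exact pass
// names are bound in AMDGPUPassRegistry.def):
//   opt -passes='amdgpu-atomic-optimizer<strategy=dpp>' ...
//   opt -passes='amdgpu-attributor<closed-world>' ...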

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level,
         ThinOrFullLTOPhase Phase) {
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        // We don't want to run internalization at per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
                                            OptimizationLevel Level,
                                            ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase))
        MPM.addPass(AMDGPUAttributorPass(*this));
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableSwLowerLDS)
          PM.addPass(AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // Do we really need internalization in LTO?
          if (InternalizeSymbols) {
            PM.addPass(InternalizePass(mustPreserveGV));
            PM.addPass(GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(AMDGPUAttributorPass(*this, Opt));
          }
        }
        if (!NoKernelInfoEndLTO) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        }
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that the logical 'and' is
  // commutative and the order of 'is_shared' and 'is_private' is not
  // significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

Error GCNTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}

//===----------------------------------------------------------------------===//
// AMDGPU Legacy Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(
        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createWWMRegAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  Triple::ArchType Arch = TM.getTargetTriple().getArch();
  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (Arch == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass());

  // Lower LDS accesses to global memory pass if address sanitizer is enabled.
  if (EnableSwLowerLDS)
    addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createInferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand.
  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions that AMDGPUCodeGenPrepare
    // may have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes from codegenprepare all the way through
    // said resource usage analysis to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here seems better,
  // as these blocks will then be cleaned up by UnreachableBlockElim, which is
  // inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Legacy Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  Attribute SchedStrategyAttr =
      C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
  StringRef SchedStrategy = SchedStrategyAttr.isValid()
                                ? SchedStrategyAttr.getValueAsString()
                                : AMDGPUSchedStrategy;

  if (SchedStrategy == "max-ilp")
    return createGCNMaxILPMachineScheduler(C);

  if (SchedStrategy == "max-memory-clause")
    return createGCNMaxMemoryClauseMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}
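
// Note (illustrative): the strategy chosen above can be overridden globally
// with -amdgpu-sched-strategy=<max-ilp|max-memory-clause>, or per function
// with an "amdgpu-sched-strategy" function attribute carrying the same values.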

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPrepareLegacyPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  addPass(createFixIrreduciblePass());
  addPass(createUnifyLoopExitsPass());
  addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions

  addPass(createAMDGPUAnnotateUniformValuesLegacy());
  addPass(createSIAnnotateControlFlowLegacyPass());
  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(createAMDGPURewriteUndefForPHILegacyPass());

  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisLegacyID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsLegacyID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineLegacyID);
  addPass(&SILoadStoreOptimizerLegacyID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWALegacyID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSELegacyID);
    addPass(&SIFoldOperandsLegacyID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsLegacyPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterLegacyID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesLegacyID);
  addPass(createSILowerI1CopiesLegacyPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  if (NewRegBankSelect) {
    addPass(createAMDGPURegBankSelectPass());
    addPass(createAMDGPURegBankLegalizePass());
  } else {
    addPass(new RegBankSelect());
  }
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables; this would trigger a failure in the
  // verifier. We should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyWWMRegisterAllocator();

  return createFastWWMRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
    "and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other WWM register operands.
  addPass(createWWMRegAllocPass(false));

  addPass(&SILowerWWMCopiesLegacyID);
  addPass(&AMDGPUReserveWWMRegsID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(false));

  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(&StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To allocate WWM registers used in whole quad mode operations (for
  // shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other whole wave mode registers.
  addPass(createWWMRegAllocPass(true));
  addPass(&SILowerWWMCopiesLegacyID);
  addPass(createVirtRegRewriter(false));
  addPass(&AMDGPUReserveWWMRegsID);

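  // The WWM assignments made above have already been committed by the
  // preceding VirtRegRewriter run, so the VGPR allocation below only sees the
  // remaining per-thread virtual registers.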
  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsLegacyPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
  addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

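// Reconstruct the target-specific MachineFunctionInfo from MIR YAML: parse
// the named registers and ABI argument descriptors back into
// SIMachineFunctionInfo and diagnose fields that name a register of the
// wrong class.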
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }

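  // Each ABI argument in the YAML is either a named physical register
  // (validated against the expected register class) or a stack offset; the
  // user/system SGPR counts are accumulated as the arguments are parsed.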
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

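  // The YAML serialization stores the denormal modes as booleans; map them
  // back onto DenormalMode::IEEE / DenormalMode::PreserveSign here.
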
  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass,
              ShadowStackGCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
  if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
    addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));

  addPass(AMDGPUPrintfRuntimeBindingPass());
  if (LowerCtorDtor)
    addPass(AMDGPUCtorDtorLoweringPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(AMDGPUImageIntrinsicOptimizerPass(TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  addPass(AMDGPUAlwaysInlinePass());
  addPass(AlwaysInlinerPass());

  addPass(AMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (EnableSwLowerLDS)
    addPass(AMDGPUSwLowerLDSPass(TM));

  // Runs before PromoteAlloca so the latter can account for function uses.
  if (EnableLowerModuleLDS)
    addPass(AMDGPULowerModuleLDSPass(TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(InferAddressSpacesPass());

  // Run the atomic optimizer before AtomicExpand.
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));

  addPass(AtomicExpandPass(&TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(AMDGPUPromoteAllocaPass(TM));
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(addPass);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addPass(AMDGPUCodeGenPreparePass(TM));

    // TODO: LICM
  }

  Base::addIRPasses(addPass);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(addPass);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
  // AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be
  // deleted soon.

  if (EnableLowerKernelArguments)
    addPass(AMDGPULowerKernelArgumentsPass(TM));

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components), but
  // has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  //
  // FIXME: This should ideally be put after the LoadStoreVectorizer.
  // However, due to some annoying facts about ResourceUsageAnalysis
  // (especially as exercised in the resource-usage-dead-function test), we
  // need all the function passes from codegenprepare all the way through said
  // resource usage analysis to run on the call graph produced before
  // codegenprepare runs (because codegenprepare will knock some nodes out of
  // the graph, which leads to function-level passes not being run on them,
  // which causes crashes in the resource usage analysis).
  addPass(AMDGPULowerBufferFatPointersPass(TM));

  Base::addCodeGenPrepare(addPass);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(LoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here ensures those
  // blocks are cleaned up by the UnreachableBlockElim pass inserted next in
  // the pass flow.
  addPass(LowerSwitchPass());
}

void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(FlattenCFGPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SinkingPass());

  addPass(AMDGPULateCodeGenPreparePass(TM));

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(AMDGPUUnifyDivergentExitNodesPass());
  addPass(FixIrreduciblePass());
  addPass(UnifyLoopExitsPass());
  addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));

  addPass(AMDGPUAnnotateUniformValuesPass());

  addPass(SIAnnotateControlFlowPass(TM));

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(AMDGPURewriteUndefForPHIPass());

  addPass(LCSSAPass());

  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    addPass(AMDGPUPerfHintAnalysisPass(TM));

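  // Instruction selection relies on uniformity/divergence information, so
  // make sure UniformityInfo is computed and available at this point.
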
  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
}

void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
  if (EnableEarlyIfConversion)
    addPass(EarlyIfConverterPass());

  Base::addILPOpts(addPass);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                             CreateMCStreamer) const {
  // TODO: Add AsmPrinter.
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
  addPass(AMDGPUISelDAGToDAGPass(TM));
  addPass(SIFixSGPRCopiesPass());
  addPass(SILowerI1CopiesPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    AddMachinePass &addPass) const {
  Base::addMachineSSAOptimization(addPass);

  addPass(SIFoldOperandsPass());
  if (EnableDPPCombine) {
    addPass(GCNDPPCombinePass());
  }
  addPass(SILoadStoreOptimizerPass());
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(SIPeepholeSDWAPass());
    addPass(EarlyMachineLICMPass());
    addPass(MachineCSEPass());
    addPass(SIFoldOperandsPass());
  }
  addPass(DeadMachineInstructionElimPass());
  addPass(SIShrinkInstructionsPass());
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
  addPass(SIFixVGPRCopiesPass());
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIOptimizeExecMaskingPass());
  Base::addPostRegAlloc(addPass);
}

bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(GVNPass());
  else
    addPass(EarlyCSEPass());
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    AddIRPass &addPass) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(LoopDataPrefetchPass());

  addPass(SeparateConstOffsetFromGEPPass());

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(StraightLineStrengthReducePass());

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(addPass);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(NaryReassociatePass());

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(EarlyCSEPass());
}