//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAliasAnalysis.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXCtorDtorLowering.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <cassert>
#include <optional>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag once we are confident there are no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
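// Note: the switches above are hidden cl::opt flags. As is typical for backend
// options, they can be set directly on llc (e.g.
// `llc -mtriple=nvptx64 -nvptx-short-ptr ...`) or forwarded from clang via
// `-mllvm -nvptx-short-ptr`; these invocations are illustrative only.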
// byval arguments in NVPTX are special. We're only allowed to read from them
// using a special instruction, and if we ever need to write to them or take an
// address, we must make a local copy and use it instead.
//
// The problem is that local copies are very expensive, and we create them very
// late in the compilation pipeline, so LLVM does not have much of a chance to
// eliminate them, if they turn out to be unnecessary.
//
// One way around that is to create such copies early on, and let them percolate
// through the optimizations. The copying itself will never trigger creation of
// another copy later on, as the reads are allowed. If LLVM can eliminate it,
// it's a win. If the full optimization pipeline can't remove the copy, that's
// as good as it gets in terms of the effort we could've made, and it's
// certainly a much better effort than what we do now.
//
// This early injection of the copies has the potential to create undesirable
// side effects, so it's disabled by default, for now, until it sees more
// testing.
static cl::opt<bool> EarlyByValArgsCopy(
    "nvptx-early-byval-copy",
    cl::desc("Create a copy of byval function arguments early."),
    cl::init(false), cl::Hidden);

namespace llvm {

void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXLowerUnreachablePass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMLegacyPassPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerUnreachablePass(PR);
  initializeNVPTXCtorDtorLoweringLegacyPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
  initializeNVPTXDAGToDAGISelLegacyPass(PR);
  initializeNVPTXAAWrapperPassPass(PR);
  initializeNVPTXExternalAAWrapperPass(PR);
}

static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}
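// For illustration, the data layout strings produced above (derived directly
// from computeDataLayout; 32-bit mode ignores UseShortPointers):
//   64-bit, default:
//     "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   64-bit with -nvptx-short-ptr:
//     "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   32-bit:
//     "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"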
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       std::optional<Reloc::Model> RM,
                                       std::optional<CodeModel::Model> CM,
                                       CodeGenOptLevel OL, bool is64bit)
    // The PIC relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : CodeGenTargetMachineImpl(T,
                               computeDataLayout(is64bit, UseShortPointersOpt),
                               TT, CPU, FS, Options, Reloc::PIC_,
                               getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this),
      StrPool(StrAlloc) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOptLevel OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOptLevel OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
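  // These are SeparateConstOffsetFromGEP, SpeculativeExecution, straight-line
  // strength reduction, and NaryReassociate, interleaved with EarlyCSE/GVN
  // cleanups; see the definition of addStraightLineScalarOptimizationPasses
  // below.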
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
                                                                    F, STI);
}

void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<NVPTXAA>();
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "NVPTXPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        // We do not want to fold out calls to nvvm.reflect early if the user
        // has not provided a target architecture just yet.
        if (Subtarget.hasTargetName())
          FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // Note: NVVMIntrRangePass was causing numerical discrepancies at one
        // point; if issues crop up, consider disabling it.
        FPM.addPass(NVVMIntrRangePass());
        if (EarlyByValArgsCopy)
          FPM.addPass(NVPTXCopyByValArgsPass());
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  if (!NoKernelInfoEndLTO) {
    PB.registerFullLinkTimeOptimizationLastEPCallback(
        [this](ModulePassManager &PM, OptimizationLevel Level) {
          FunctionPassManager FPM;
          FPM.addPass(KernelInfoPrinter(this));
          PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        });
  }
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
    case Intrinsic::nvvm_isspacep_shared_cluster:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters, which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  // TODO: Consider running InferAddressSpaces during opt, earlier in the
  // compilation flow.
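  // InferAddressSpaces rewrites generic (flat) address-space accesses into the
  // specific NVPTX address spaces (global/shared/const/local) where the
  // pointer's origin can be proven, so instruction selection can emit the
  // specialized ld/st forms instead of generic ones.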
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineLateInstrsCleanupID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateLegacyID);
  disablePass(&StackMapLivenessID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);
  disablePass(&RemoveLoadsIntoFakeUsesID);

  addPass(createNVPTXAAWrapperPass());
  addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
    if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>())
      AAR.addAAResult(WrapperPass->getResult());
  }));

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOptLevel::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMLegacyPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass());
  if (getOptLevel() != CodeGenOptLevel::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandLegacyPass());
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
  addPass(createNVPTXCtorDtorLoweringLegacyPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
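  // Hence the block below runs GVN (at the aggressive opt level, EarlyCSE
  // otherwise), then the LoadStoreVectorizer, which merges adjacent loads and
  // stores into vector accesses the backend can lower to PTX vector
  // loads/stores, and finally SROA to clean up after it.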
  if (getOptLevel() != CodeGenOptLevel::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }

  if (ST.hasPTXASUnreachableBug()) {
    // Run LowerUnreachable to work around a ptxas bug. See the commit
    // description of 1ee4d880e8760256c606fe55b7af85a4f70d006d for more
    // details.
    const auto &Options = getNVPTXTargetMachine().Options;
    addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable,
                                            Options.NoTrapAfterNoreturn));
  }
}

bool NVPTXPassConfig::addInstSelector() {
  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
  addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOptLevel::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateLegacyID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsLegacyID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringLegacyID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However,
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSELegacyID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerLegacyID);
  printAndVerify("After codegen peephole optimization pass");
}