//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <optional>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
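
// All three switches are hidden cl::opts, so they are reachable from any tool
// that links in this backend. An illustrative llc invocation (sketch, exact
// input file is hypothetical):
//   llc -march=nvptx64 -mcpu=sm_70 -nvptx-short-ptr input.ll -o input.ptx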

namespace llvm {

void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
  initializeNVPTXDAGToDAGISelPass(PR);
}

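// Builds the LLVM data layout string for NVPTX. "e" selects little-endian.
// For the 32-bit target, "-p:32:32" makes every pointer 32 bits wide; with
// -nvptx-short-ptr on the 64-bit target, only address spaces 3 (shared),
// 4 (const), and 5 (local) get 32-bit pointers. The remaining components set
// integer/vector ABI alignments and the natively supported integer widths.
// For example, the 64-bit target with short pointers yields:
//   "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"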
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       std::optional<Reloc::Model> RM,
                                       std::optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this),
      StrPool(StrAlloc) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
                                                                    F, STI);
}

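// Registers NVPTX passes with the new pass manager. Because the parsing
// callback exposes the textual names, the reflect pass can (for example) be
// run on its own, assuming the input module's triple selects NVPTX:
//   opt -passes=nvvm-reflect -S input.ll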
void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

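// Maps the nvvm.isspacep.* intrinsics to (pointer, address space) pairs so
// that InferAddressSpaces can refine a generic pointer guarded by such a
// check. Illustrative IR that matches the cases below:
//   %is.global = call i1 @llvm.nvvm.isspacep.global(ptr %p)
//   call void @llvm.assume(i1 %is.global)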
std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

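// Illustrative effect of the sequence below: a generic-space access such as
//   %g = addrspacecast ptr addrspace(3) %s to ptr
//   %v = load float, ptr %g
// can be rewritten by InferAddressSpaces to load directly through
// ptr addrspace(3), letting instruction selection pick the cheaper
// ld.shared form instead of a generic ld.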
void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}


void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineLateInstrsCleanupID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));
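  // For example, with sm_70 NVVMReflect folds calls like
  //   %v = call i32 @__nvvm_reflect(ptr @query) ; where @query is "__CUDA_ARCH"
  // to the constant 700, so arch-conditional code can fold away early.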

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However,
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}