xref: /llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h (revision 892a804d93d44ddfd7cd351852fe6aef32d4dcd0)
1 //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file declares the NVPTX specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14 #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15 
16 #include "NVPTX.h"
17 #include "NVPTXFrameLowering.h"
18 #include "NVPTXISelLowering.h"
19 #include "NVPTXInstrInfo.h"
20 #include "NVPTXRegisterInfo.h"
21 #include "llvm/CodeGen/TargetSubtargetInfo.h"
22 #include "llvm/IR/DataLayout.h"
23 #include <string>
24 
25 #define GET_SUBTARGETINFO_HEADER
26 #include "NVPTXGenSubtargetInfo.inc"
27 
28 namespace llvm {
29 
30 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
31   virtual void anchor();
32   std::string TargetName;
33 
34   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
35   unsigned PTXVersion;
36 
37   // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
38   // sm_90a == 901
39   unsigned int FullSmVersion;
40 
41   // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
42   // FullSmVersion.
43   unsigned int SmVersion;
44 
45   NVPTXInstrInfo InstrInfo;
46   NVPTXTargetLowering TLInfo;
47   std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
48 
49   // NVPTX does not have any call stack frame, but need a NVPTX specific
50   // FrameLowering class because TargetFrameLowering is abstract.
51   NVPTXFrameLowering FrameLowering;
52 
53 public:
54   /// This constructor initializes the data members to match that
55   /// of the specified module.
56   ///
57   NVPTXSubtarget(const Triple &TT, const std::string &CPU,
58                  const std::string &FS, const NVPTXTargetMachine &TM);
59 
60   ~NVPTXSubtarget() override;
61 
62   const TargetFrameLowering *getFrameLowering() const override {
63     return &FrameLowering;
64   }
65   const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
66   const NVPTXRegisterInfo *getRegisterInfo() const override {
67     return &InstrInfo.getRegisterInfo();
68   }
69   const NVPTXTargetLowering *getTargetLowering() const override {
70     return &TLInfo;
71   }
72 
73   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
74 
75   bool hasAtomAddF64() const { return SmVersion >= 60; }
76   bool hasAtomScope() const { return SmVersion >= 60; }
77   bool hasAtomBitwise64() const { return SmVersion >= 32; }
78   bool hasAtomMinMax64() const { return SmVersion >= 32; }
79   bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
80   bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
81   bool hasLDG() const { return SmVersion >= 32; }
82   bool hasHWROT32() const { return SmVersion >= 32; }
83   bool hasFP16Math() const { return SmVersion >= 53; }
84   bool hasBF16Math() const { return SmVersion >= 80; }
85   bool allowFP16Math() const;
86   bool hasMaskOperator() const { return PTXVersion >= 71; }
87   bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
88   // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
89   // release, acq_rel, sc) ?
90   bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
91   // Does SM & PTX support atomic relaxed MMIO operations ?
92   bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
93   bool hasDotInstructions() const {
94     return SmVersion >= 61 && PTXVersion >= 50;
95   }
96   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
97   // terminates a basic block. Instead, it would assume that control flow
98   // continued to the next instruction. The next instruction could be in the
99   // block that's lexically below it. This would lead to a phantom CFG edges
100   // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
101   // PTX ISA versions 8.3+ we can confidently say that the bug will not be
102   // present.
103   bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
104   bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
105   unsigned int getFullSmVersion() const { return FullSmVersion; }
106   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
107   // GPUs with "a" suffix have include architecture-accelerated features that
108   // are supported on the specified architecture only, hence such targets do not
109   // follow the onion layer model. hasAAFeatures() allows distinguishing such
110   // GPU variants from the base GPU architecture.
111   // - 0 represents base GPU model,
112   // - non-zero value identifies particular architecture-accelerated variant.
113   bool hasAAFeatures() const { return getFullSmVersion() % 10; }
114 
115   // If the user did not provide a target we default to the `sm_30` target.
116   std::string getTargetName() const {
117     return TargetName.empty() ? "sm_30" : TargetName;
118   }
119   bool hasTargetName() const { return !TargetName.empty(); }
120 
121   bool hasNativeBF16Support(int Opcode) const;
122 
123   // Get maximum value of required alignments among the supported data types.
124   // From the PTX ISA doc, section 8.2.3:
125   //  The memory consistency model relates operations executed on memory
126   //  locations with scalar data-types, which have a maximum size and alignment
127   //  of 64 bits. Memory operations with a vector data-type are modelled as a
128   //  set of equivalent memory operations with a scalar data-type, executed in
129   //  an unspecified order on the elements in the vector.
130   unsigned getMaxRequiredAlignment() const { return 8; }
131   // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
132   unsigned getMinCmpXchgSizeInBits() const { return 32; }
133 
134   unsigned getPTXVersion() const { return PTXVersion; }
135 
136   NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
137   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
138 
139   void failIfClustersUnsupported(std::string const &FailureMessage) const;
140 };
141 
142 } // End llvm namespace
143 
144 #endif
145