//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the NVPTX specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H

#include "NVPTX.h"
#include "NVPTXFrameLowering.h"
#include "NVPTXISelLowering.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include <string>

#define GET_SUBTARGETINFO_HEADER
#include "NVPTXGenSubtargetInfo.inc"

namespace llvm {

class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
  virtual void anchor();
  std::string TargetName;

  // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
  unsigned PTXVersion;

  // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310,
  // sm_90a == 901.
  unsigned int FullSmVersion;

  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
  // FullSmVersion.
  unsigned int SmVersion;

  NVPTXInstrInfo InstrInfo;
  NVPTXTargetLowering TLInfo;
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  // NVPTX does not have any call stack frame, but needs an NVPTX-specific
  // FrameLowering class because TargetFrameLowering is abstract.
  NVPTXFrameLowering FrameLowering;

public:
  /// This constructor initializes the data members to match that
  /// of the specified module.
  ///
  NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                 const std::string &FS, const NVPTXTargetMachine &TM);

  ~NVPTXSubtarget() override;

  const TargetFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }
  const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
  const NVPTXRegisterInfo *getRegisterInfo() const override {
    return &InstrInfo.getRegisterInfo();
  }
  const NVPTXTargetLowering *getTargetLowering() const override {
    return &TLInfo;
  }

  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

  bool hasAtomAddF64() const { return SmVersion >= 60; }
  bool hasAtomScope() const { return SmVersion >= 60; }
  bool hasAtomBitwise64() const { return SmVersion >= 32; }
  bool hasAtomMinMax64() const { return SmVersion >= 32; }
  bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
  bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
  bool hasLDG() const { return SmVersion >= 32; }
  bool hasHWROT32() const { return SmVersion >= 32; }
  bool hasFP16Math() const { return SmVersion >= 53; }
  bool hasBF16Math() const { return SmVersion >= 80; }
  bool allowFP16Math() const;
  bool hasMaskOperator() const { return PTXVersion >= 71; }
  bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
  // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
  // release, acq_rel, sc)?
  bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
  // Does SM & PTX support atomic relaxed MMIO operations?
  bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
  bool hasDotInstructions() const {
    return SmVersion >= 61 && PTXVersion >= 50;
  }
  // Prior to CUDA 12.3, ptxas did not recognize that the trap instruction
  // terminates a basic block. Instead, it would assume that control flow
  // continued to the next instruction. The next instruction could be in the
  // block that's lexically below it. This would lead to phantom CFG edges
  // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, with
  // PTX ISA versions 8.3+ we can confidently say that the bug will not be
  // present.
  bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
  bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
  unsigned int getFullSmVersion() const { return FullSmVersion; }
  unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
  // GPUs with an "a" suffix include architecture-accelerated features that
  // are supported on the specified architecture only; hence such targets do
  // not follow the onion layer model. hasAAFeatures() allows distinguishing
  // such GPU variants from the base GPU architecture.
  // - 0 represents the base GPU model,
  // - a non-zero value identifies a particular architecture-accelerated
  //   variant.
  bool hasAAFeatures() const { return getFullSmVersion() % 10; }

  // If the user did not provide a target we default to the `sm_30` target.
  std::string getTargetName() const {
    return TargetName.empty() ? "sm_30" : TargetName;
  }
  bool hasTargetName() const { return !TargetName.empty(); }

  bool hasNativeBF16Support(int Opcode) const;

  // Get the maximum value of required alignments among the supported data
  // types. From the PTX ISA doc, section 8.2.3:
  // The memory consistency model relates operations executed on memory
  // locations with scalar data-types, which have a maximum size and alignment
  // of 64 bits. Memory operations with a vector data-type are modelled as a
  // set of equivalent memory operations with a scalar data-type, executed in
  // an unspecified order on the elements in the vector.
  unsigned getMaxRequiredAlignment() const { return 8; }
  // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS.
  unsigned getMinCmpXchgSizeInBits() const { return 32; }

  unsigned getPTXVersion() const { return PTXVersion; }

  NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

  void failIfClustersUnsupported(std::string const &FailureMessage) const;
};

} // End llvm namespace

#endif
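// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the upstream header): a worked example of
// the version encodings documented above, kept entirely in comments so the
// include guard and header contents are left untouched.
//
//   // sm_90a is encoded as FullSmVersion == 901 (100*9 + 10*0 + 1).
//   unsigned FullSmVersion = 901;
//   unsigned SmVersion = FullSmVersion / 10;        // 90, same as plain sm_90
//   bool ArchAccelerated = FullSmVersion % 10 != 0; // true, cf. hasAAFeatures()
//
//   // PTX ISA 8.3 is encoded as PTXVersion == 83, which is why
//   // hasPTXASUnreachableBug() returns PTXVersion < 83.
// ---------------------------------------------------------------------------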